mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-01 18:17:42 +02:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 59377a6c87 | |||
| 1239267cc4 | |||
| 7a4ca3cbd9 | |||
| b4d05a3d2f | |||
| 2dc3ce2166 | |||
| 3bc8d2cf23 | |||
| 8a98ba4582 | |||
| 2634ed207a | |||
| 41ea26144e | |||
| 89f10baad5 |
@@ -4,7 +4,7 @@
|
||||
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
|
||||
# `_module.args.pkgs` (defined in this case by flake-parts).
|
||||
perSystem =
|
||||
{ system, ... }:
|
||||
{ lib, system, ... }:
|
||||
{
|
||||
_module.args = {
|
||||
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
|
||||
@@ -33,7 +33,7 @@
|
||||
"CUDA EULA"
|
||||
"cuDNN EULA"
|
||||
]
|
||||
) (p.meta.licenses or [ p.meta.license ]);
|
||||
) (p.meta.licenses or (lib.toList p.meta.license));
|
||||
};
|
||||
# Ensure dependencies use ROCm consistently
|
||||
pkgsRocm = import inputs.nixpkgs {
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
llamaVersion,
|
||||
numpy,
|
||||
tqdm,
|
||||
requests,
|
||||
sentencepiece,
|
||||
pyyaml,
|
||||
poetry-core,
|
||||
@@ -20,6 +21,7 @@ buildPythonPackage {
|
||||
tqdm
|
||||
sentencepiece
|
||||
pyyaml
|
||||
requests
|
||||
];
|
||||
src = lib.cleanSource ../../gguf-py;
|
||||
pythonImportsCheck = [
|
||||
|
||||
+5
-11
@@ -7,13 +7,6 @@
|
||||
|
||||
let
|
||||
pythonPackages = python3.pkgs;
|
||||
buildPythonPackage = pythonPackages.buildPythonPackage;
|
||||
numpy = pythonPackages.numpy;
|
||||
tqdm = pythonPackages.tqdm;
|
||||
sentencepiece = pythonPackages.sentencepiece;
|
||||
pyyaml = pythonPackages.pyyaml;
|
||||
poetry-core = pythonPackages.poetry-core;
|
||||
pytestCheckHook = pythonPackages.pytestCheckHook;
|
||||
in
|
||||
|
||||
# We're using `makeScope` instead of just writing out an attrset
|
||||
@@ -23,17 +16,18 @@ in
|
||||
lib.makeScope newScope (self: {
|
||||
inherit llamaVersion;
|
||||
gguf-py = self.callPackage ./package-gguf-py.nix {
|
||||
inherit
|
||||
buildPythonPackage
|
||||
inherit (pythonPackages)
|
||||
numpy
|
||||
tqdm
|
||||
sentencepiece
|
||||
poetry-core
|
||||
pyyaml
|
||||
pytestCheckHook
|
||||
requests
|
||||
buildPythonPackage
|
||||
poetry-core
|
||||
;
|
||||
};
|
||||
python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
|
||||
python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
|
||||
llama-cpp = self.callPackage ./package.nix { };
|
||||
docker = self.callPackage ./docker.nix { };
|
||||
docker-min = self.callPackage ./docker.nix { interactive = false; };
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023-2024 The ggml authors
|
||||
Copyright (c) 2023-2026 The ggml authors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
+191
-13
@@ -7,6 +7,18 @@
|
||||
#include <cstdio>
|
||||
#include <sstream>
|
||||
|
||||
// prime number used for LCG hash function (32 bit), it is near (sqrt(5) - 1)/2 * 2^32.
|
||||
#define LCG_FACTOR 2654435761UL
|
||||
|
||||
// Compute the LCG hash of a n-gram of size len at offset start.
|
||||
static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) {
|
||||
uint32_t hash = 0;
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
hash = hash * LCG_FACTOR + tokens[start + i];
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
|
||||
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
|
||||
std::ostringstream oss;
|
||||
@@ -115,6 +127,100 @@ llama_tokens common_ngram_simple_draft(
|
||||
// maximum number of counted values of a ngram map value.
|
||||
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
|
||||
|
||||
void common_ngram_map_begin(
|
||||
common_ngram_map & map, const llama_tokens & tokens) {
|
||||
size_t size_begin = tokens.size();
|
||||
|
||||
LOG_DBG("%s: begin, idx_last_draft=%zu, new begin=%zu, #keys=%zu\n", __func__,
|
||||
map.idx_last_check, size_begin, map.keys.size());
|
||||
|
||||
size_t count_map_entries_upd = 0;
|
||||
if (!map.key_map.empty() && size_begin < map.idx_last_check) {
|
||||
if (map.show_key_map_stats) {
|
||||
// Print statistics of hash map map_key.
|
||||
size_t count_nonzero = 0;
|
||||
uint32_t min_idx = UINT32_MAX;
|
||||
uint32_t max_idx = 0;
|
||||
for (size_t i = 0; i < map.key_map.size(); ++i) {
|
||||
uint32_t key_idx = map.key_map[i];
|
||||
if (key_idx != 0) {
|
||||
++count_nonzero;
|
||||
if (key_idx < min_idx) min_idx = key_idx;
|
||||
if (key_idx > max_idx) max_idx = key_idx;
|
||||
}
|
||||
}
|
||||
if (count_nonzero == 0) {
|
||||
min_idx = 0;
|
||||
}
|
||||
LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n",
|
||||
__func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx);
|
||||
}
|
||||
|
||||
// Update the map from hash to key index (clear outdated entries).
|
||||
for (size_t i = 0; i < map.key_map.size(); ++i) {
|
||||
uint32_t key_idx = map.key_map[i];
|
||||
if (key_idx >= map.size_last_begin) {
|
||||
map.key_map[i] = 0;
|
||||
count_map_entries_upd++;
|
||||
}
|
||||
}
|
||||
map.key_map_last_idx = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
|
||||
}
|
||||
|
||||
if (size_begin < map.idx_last_check && !map.keys.empty()) {
|
||||
// The next token generation will start at index size_begin.
|
||||
// The tokens between map.size_last_begin and size_begin are no longer valid.
|
||||
//
|
||||
// Refresh map: Remove all entries with index >= map.size_last_begin.
|
||||
size_t count_keys = map.keys.size();
|
||||
size_t count_keys_del = 0;
|
||||
size_t count_values_del = 0;
|
||||
for (int32_t i = map.keys.size() - 1; i >= 0; --i) {
|
||||
common_ngram_map_key & key = map.keys[i];
|
||||
if (key.key_idx >= map.size_last_begin) {
|
||||
// Delete the key.
|
||||
LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin);
|
||||
map.keys.erase(map.keys.begin() + i);
|
||||
count_keys_del++;
|
||||
continue;
|
||||
}
|
||||
if (map.key_only) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check the indices of the values.
|
||||
for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) {
|
||||
common_ngram_map_value & value = key.values[j];
|
||||
if (value.value_idx >= map.size_last_begin) {
|
||||
// Delete the value.
|
||||
count_values_del++;
|
||||
|
||||
// Move all values after this value to the left.
|
||||
for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) {
|
||||
key.values[k] = key.values[k + 1];
|
||||
}
|
||||
// Clear the last value.
|
||||
key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0;
|
||||
key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0;
|
||||
}
|
||||
}
|
||||
if (key.values[0].value_idx == 0) {
|
||||
// No values left, delete the key.
|
||||
LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx);
|
||||
map.keys.erase(map.keys.begin() + i);
|
||||
count_keys_del++;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_INF("%s: refresh map: idx_last_draft=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__,
|
||||
map.idx_last_check, size_begin,
|
||||
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
|
||||
}
|
||||
|
||||
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
|
||||
map.size_last_begin = size_begin;
|
||||
}
|
||||
|
||||
void common_ngram_map_draft(common_ngram_map & map,
|
||||
const llama_tokens & inp, llama_token sampled,
|
||||
llama_tokens & draft) {
|
||||
@@ -129,6 +235,10 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
if (cur_len < static_cast<size_t>(2 * n + m)) {
|
||||
return;
|
||||
}
|
||||
if (cur_len >= static_cast<size_t>(UINT32_MAX)) {
|
||||
// key_map uses uint32_t instead of size_t.
|
||||
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
|
||||
}
|
||||
|
||||
// Only check every check_rate tokens to save compute
|
||||
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
|
||||
@@ -147,24 +257,92 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
|
||||
// search for the key in the map
|
||||
size_t match_pos = 0;
|
||||
for (size_t j = cur_len - n - m - 1; j > 0; --j) {
|
||||
bool match = true;
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
if (inp[j + k] != key_tokens[k]) {
|
||||
match = false;
|
||||
break;
|
||||
if (map.size_last_begin > cur_len) {
|
||||
GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len);
|
||||
}
|
||||
if (!map.key_map.empty()) {
|
||||
// Search for the key in the map key_map from hash of ngrams to index of ngram.
|
||||
uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size());
|
||||
uint32_t idx_key = map.key_map[idx_hash];
|
||||
if (idx_key != 0 && idx_key < cur_len - n - m - 1) {
|
||||
// Check if the key matches the key at idx_key (because of possible collisions).
|
||||
bool match = true;
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
if (inp[idx_key + k] != key_tokens[k]) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
LOG_DBG("%s: key hash %x -> idx_key %d: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0);
|
||||
if (match) {
|
||||
match_pos = idx_key;
|
||||
}
|
||||
}
|
||||
if (match) {
|
||||
match_pos = j;
|
||||
break;
|
||||
}
|
||||
if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) {
|
||||
// Search for the key in [1, map.size_last_begin - n - m -1], descending.
|
||||
for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
|
||||
// Check if the key matches the key.
|
||||
bool match = true;
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
if (inp[j + k] != key_tokens[k]) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (match) {
|
||||
match_pos = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (match_pos == 0) {
|
||||
// In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later.
|
||||
//
|
||||
// Search in [size_last_begin, cur_len - n - m - 1], descending.
|
||||
for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
|
||||
bool match = true;
|
||||
for (size_t k = 0; k < n; ++k) {
|
||||
if (inp[j + k] != key_tokens[k]) {
|
||||
match = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (match) {
|
||||
match_pos = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (match_pos > 0) {
|
||||
LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
|
||||
LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
|
||||
cur_len, n, m, key_tokens.size(), sampled, match_pos);
|
||||
}
|
||||
|
||||
if (!map.key_map.empty()) {
|
||||
// Add hashes of new ngrams in key_map.
|
||||
//
|
||||
// Use the same order as above.
|
||||
if (map.size_last_begin > (size_t) (n + m + 1)) {
|
||||
for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
|
||||
// compute hash and store index of ngram at idx j in the map.
|
||||
uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
|
||||
if (map.key_map[idx_hash] == 0) {
|
||||
map.key_map[idx_hash] = j; // collisions may occur
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
|
||||
// compute hash and store index of ngram at idx j in the map.
|
||||
uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
|
||||
if (map.key_map[idx_hash] == 0) {
|
||||
map.key_map[idx_hash] = j;
|
||||
}
|
||||
}
|
||||
map.key_map_last_idx = std::max(static_cast<uint32_t>(cur_len - n - m - 1), map.key_map_last_idx);
|
||||
}
|
||||
|
||||
if (match_pos == 0) {
|
||||
return;
|
||||
}
|
||||
@@ -215,8 +393,8 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
draft.push_back(inp[match_pos + n + i]);
|
||||
}
|
||||
|
||||
LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
|
||||
key_offset, curr_key.key_num, draft.size());
|
||||
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
|
||||
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
|
||||
|
||||
map.last_draft_created = false;
|
||||
map.last_draft_key_idx = key_offset;
|
||||
@@ -318,7 +496,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
}
|
||||
}
|
||||
|
||||
if (sum_occur > 0 && max_occur < 3 * sum_occur) {
|
||||
if (sum_occur > 0 && max_occur < 2 * sum_occur) {
|
||||
// The most frequent value is not much more frequent than the other values.
|
||||
// We do not use the draft.
|
||||
return;
|
||||
|
||||
+30
-5
@@ -9,6 +9,8 @@
|
||||
// 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
|
||||
// The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
|
||||
//
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/18471
|
||||
//
|
||||
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
@@ -51,10 +53,13 @@ llama_tokens common_ngram_simple_draft(
|
||||
// maximum number of m-gram values stored for each key n-gram.
|
||||
#define COMMON_NGRAM_MAX_VALUES 4
|
||||
|
||||
// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index.
|
||||
#define COMMON_NGRAM_HASH_MAP_SIZE 262144
|
||||
|
||||
// statistics of a m-gram after a known n-gram
|
||||
struct common_ngram_map_value {
|
||||
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
|
||||
uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
|
||||
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
|
||||
uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
|
||||
int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
|
||||
};
|
||||
|
||||
@@ -74,23 +79,43 @@ struct common_ngram_map {
|
||||
|
||||
bool key_only; // true if only key n-grams are used, no values.
|
||||
|
||||
// first draft: vector only, no map.
|
||||
std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
|
||||
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
|
||||
uint16_t min_hits; // minimum number of key hits to consider a draft
|
||||
|
||||
bool show_key_map_stats = false; // true, if statitics of the key_map should be printed.
|
||||
|
||||
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
|
||||
uint16_t check_rate, uint16_t min_hits)
|
||||
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
|
||||
check_rate(check_rate), min_hits(min_hits) {}
|
||||
check_rate(check_rate), min_hits(min_hits) {
|
||||
key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
|
||||
}
|
||||
|
||||
// In reasoning chats the previous reasoning block will be removed from context history.
|
||||
// A rebuild of the ngram map is needed after that.
|
||||
|
||||
size_t size_last_begin = 0; // number of tokens at previous start of generation
|
||||
|
||||
bool last_draft_created = false; // true if a draft was created at last call.
|
||||
size_t last_draft_key_idx = 0; // index of last key used for draft generation.
|
||||
size_t last_draft_key_idx = 0; // index of last key used for draft generation (0 = no draft)
|
||||
uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
|
||||
|
||||
size_t idx_last_check = 0; // index of last check in context history
|
||||
|
||||
// optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused.
|
||||
//
|
||||
// uint32_t instead of size_t (size of current histories is << UINT32_MAX)
|
||||
std::vector<uint32_t> key_map; // key_map[hash] = index of ngram in context window
|
||||
uint32_t key_map_last_idx = 0; // index of the last ngram added to key_map
|
||||
};
|
||||
|
||||
// Initialize the n-gram map with the given token history.
|
||||
// map: the ngram map to initialize.
|
||||
// tokens: the token history to base the map on.
|
||||
void common_ngram_map_begin(
|
||||
common_ngram_map & map,
|
||||
const llama_tokens & tokens);
|
||||
|
||||
// Searches for the n-gram in the history and checks whether a draft sequence should be generated.
|
||||
// map: the ngram map to search in.
|
||||
|
||||
+17
-8
@@ -124,9 +124,9 @@ struct common_speculative_state {
|
||||
// TODO: track performance of most recent calls
|
||||
const bool gen_perf = true; // whether to generate performance stats.
|
||||
|
||||
// TODO: rename to t_draft_us
|
||||
// TODO: add t_begin_us, t_accept_us
|
||||
int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
|
||||
int64_t t_begin_us = 0; // total time spent in refresh of this implementation in microseconds.
|
||||
int64_t t_draft_us = 0; // total time spent in generating drafts in this implementation in microseconds.
|
||||
int64_t t_accept_us = 0; // total time spent in accumulation of this implementation in microseconds.
|
||||
|
||||
common_speculative_state(enum common_speculative_type type) : type(type) {}
|
||||
|
||||
@@ -499,7 +499,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
|
||||
: common_speculative_state(type), map(std::move(map)) {}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
GGML_UNUSED(prompt);
|
||||
common_ngram_map_begin(map, prompt);
|
||||
}
|
||||
|
||||
void draft(
|
||||
@@ -951,7 +951,12 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
|
||||
}
|
||||
|
||||
for (auto & impl : spec->impls) {
|
||||
const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
|
||||
|
||||
impl->begin(prompt);
|
||||
|
||||
const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
|
||||
impl->t_begin_us += t_now_us - t_start_us; // accumulate duration for this refresh
|
||||
}
|
||||
}
|
||||
|
||||
@@ -973,7 +978,7 @@ llama_tokens common_speculative_draft(
|
||||
const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
|
||||
|
||||
impl->drafts_call_count++;
|
||||
impl->gen_duration_us += t_now_us - t_start_us; // accumulate duration for this implementation
|
||||
impl->t_draft_us += t_now_us - t_start_us; // accumulate duration for this implementation
|
||||
}
|
||||
|
||||
if (!result.empty()) {
|
||||
@@ -1001,12 +1006,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
|
||||
|
||||
GGML_ASSERT(impl);
|
||||
|
||||
const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
|
||||
if (n_accepted > 0) {
|
||||
impl->drafts_accepted_count++;
|
||||
impl->drafts_accepted_tokens += n_accepted;
|
||||
}
|
||||
|
||||
impl->accept(n_accepted);
|
||||
const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
|
||||
impl->t_accept_us += t_now_us - t_start_us; // accumulate duration for this acculumulation
|
||||
}
|
||||
|
||||
void common_speculative_print_stats(const common_speculative * spec) {
|
||||
@@ -1018,13 +1026,14 @@ void common_speculative_print_stats(const common_speculative * spec) {
|
||||
std::string str_perf;
|
||||
if (impl->gen_perf) {
|
||||
std::ostringstream oss;
|
||||
oss << std::fixed << std::setprecision(3) << impl->gen_duration_us / 1000.0;
|
||||
str_perf = ", dur = " + oss.str() + " ms";
|
||||
oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", ";
|
||||
oss << std::fixed << std::setprecision(3) << impl->t_draft_us / 1000.0 << ", ";
|
||||
oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0;
|
||||
str_perf = ", dur(b,g,a) = " + oss.str() + " ms";
|
||||
} else {
|
||||
str_perf = "";
|
||||
}
|
||||
|
||||
// TODO: report time for begin() and accept()
|
||||
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
|
||||
common_speculative_type_to_str(impl->type).c_str(),
|
||||
impl->drafts_call_count,
|
||||
|
||||
+12
-22
@@ -119,7 +119,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|
||||
*Notes:*
|
||||
|
||||
- **Memory**
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
|
||||
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`.
|
||||
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
|
||||
|
||||
- **Execution Unit (EU)**
|
||||
@@ -423,16 +423,12 @@ Choose one of following methods to run.
|
||||
- Use device 0:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh 0
|
||||
# OR
|
||||
./examples/sycl/run-llama3.sh 0
|
||||
./examples/sycl/test.sh -mg 0
|
||||
```
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run-llama2.sh
|
||||
# OR
|
||||
./examples/sycl/run-llama3.sh
|
||||
./examples/sycl/test.sh
|
||||
```
|
||||
|
||||
2. Command line
|
||||
@@ -455,13 +451,13 @@ Examples:
|
||||
- Use device 0:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```sh
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer --mmap
|
||||
```
|
||||
|
||||
*Notes:*
|
||||
@@ -577,13 +573,13 @@ Or, use CMake presets to build:
|
||||
|
||||
```sh
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-completion
|
||||
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-completion
|
||||
|
||||
cmake --preset x64-windows-sycl-debug
|
||||
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||
cmake --build build-x64-windows-sycl-debug -j --target llama-completion
|
||||
```
|
||||
|
||||
#### 3. Visual Studio
|
||||
@@ -608,7 +604,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
|
||||
- For a minimal experimental setup, you can build only the inference executable using:
|
||||
|
||||
```Powershell
|
||||
cmake --build build --config Release -j --target llama-cli
|
||||
cmake --build build --config Release -j --target llama-completion
|
||||
```
|
||||
|
||||
##### - Generating a Visual Studio Solution
|
||||
@@ -714,13 +710,7 @@ Choose one of following methods to run.
|
||||
1. Script
|
||||
|
||||
```
|
||||
examples\sycl\win-run-llama-2.bat
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
examples\sycl\win-run-llama-3.bat
|
||||
examples\sycl\win-test.bat
|
||||
```
|
||||
|
||||
2. Command line
|
||||
@@ -744,13 +734,13 @@ Examples:
|
||||
- Use device 0:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
|
||||
build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
|
||||
```
|
||||
|
||||
- Use multiple devices:
|
||||
|
||||
```
|
||||
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
|
||||
build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer --mmap
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -1,10 +1,5 @@
|
||||
{
|
||||
"version": 5,
|
||||
"cmakeMinimumRequired": {
|
||||
"major": 3,
|
||||
"minor": 28,
|
||||
"patch": 0
|
||||
},
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "arm64-android-snapdragon",
|
||||
|
||||
@@ -128,7 +128,7 @@ However, additional settings are required for generating and signing HTP Ops lib
|
||||
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
|
||||
|
||||
> cmake --preset arm64-windows-snapdragon -B build-wos
|
||||
> cmake --preset arm64-windows-snapdragon-release -B build-wos
|
||||
...
|
||||
> cmake --install build-wos --prefix pkg-snapdragon
|
||||
```
|
||||
|
||||
@@ -9,7 +9,7 @@ Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch m
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
|
||||
@@ -8,11 +8,11 @@ Download [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
|
||||
@@ -8,7 +8,7 @@ Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch m
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250206
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
|
||||
@@ -8,11 +8,11 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250731
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
||||
@@ -8,11 +8,11 @@ Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch m
|
||||
### Build llama.cpp
|
||||
Readme modification time: 20250826
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
|
||||
+69
-5
@@ -6,7 +6,7 @@ llama.cpp supports speculative decoding, a technique that can significantly acce
|
||||
|
||||
## Implementations
|
||||
|
||||
The `llama-server` application supports several implementations of speculative decoding:
|
||||
The `llama-server` application supports several implementations of speculative decoding. An implementation with draft model can be mixed with an implementation without draft model.
|
||||
|
||||
### Draft Model (`draft`)
|
||||
|
||||
@@ -32,12 +32,21 @@ An example to use this approach can be the rewriting of source code by a LLM.
|
||||
|
||||
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
|
||||
|
||||
```
|
||||
llama-server [...] --spec-type ngram-simple --draft-max 64
|
||||
```
|
||||
|
||||
#### n-gram Map Key (`ngram-map-k`)
|
||||
|
||||
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`) before generating drafts.
|
||||
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
|
||||
|
||||
The number of accepted tokens is stored for each used n-gram.
|
||||
|
||||
**Example:**
|
||||
```
|
||||
llama-server [...] --spec-type ngram-map-k --draft-max 64
|
||||
```
|
||||
|
||||
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
|
||||
|
||||
This experimental implementation looks for the current n-gram of size n (called the _key_) in the token history. For each key, up to four _values_ (n-grams of size m, called _mgrams_) are tracked. An internal statistic counts the occurrences of each mgram after the key n-gram. If one mgram is significantly more frequent than the others, it is used as the draft.
|
||||
@@ -45,17 +54,65 @@ This experimental implementation looks for the current n-gram of size n (called
|
||||
The number of accepted tokens is stored for each used n-gram.
|
||||
|
||||
**Example:** Server options to be used if there are a lot of longer repetitions.
|
||||
```bash
|
||||
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2
|
||||
```
|
||||
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
|
||||
```
|
||||
|
||||
### n-gram Mod (`ngram-mod`)
|
||||
|
||||
Add basic ngram hasher for speculative decoding:
|
||||
|
||||
- For each ngram, compute a hash using LCG
|
||||
- For each computed hash, store the next token
|
||||
- During speculation, iteratively compute the rolling hash of the last n tokens and pick the next token from the storage
|
||||
|
||||
Some characteristics:
|
||||
|
||||
- Lightweight (~16 MB)
|
||||
- Constant memory and complexity
|
||||
- Can generate variable draft lengths (i.e. m is not fixed)
|
||||
|
||||
Currently, a single hash pool is shared across all server slots, so different requests can benefit from each other.
|
||||
|
||||
**Sample usage:**
|
||||
|
||||
```
|
||||
# notes:
|
||||
# - small `n` are not recommended
|
||||
# - MoEs require long drafts
|
||||
# - dense models: can reduce `--draft-min` and `--draft-max`
|
||||
|
||||
llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
|
||||
```
|
||||
|
||||
Applications:
|
||||
|
||||
- Iterating over a block of text/code (e.g. in llama.vim)
|
||||
- Reasoning models (when they have to repeat their thinking in the final answer)
|
||||
- Summarization
|
||||
|
||||
Example Video:
|
||||
|
||||
- See #19164
|
||||
|
||||
### Differences between ngram-simple, ngram-map and ngram-mod
|
||||
|
||||
- ngram-simple looks for a previous matching n-gram and inserts the following m-gram.
|
||||
- ngram-map-k looks for a previous matching n-gram and inserts the following m-gram but uses an internal hash-map of n-grams in the current context window.
|
||||
- ngram-mod uses a hash pool which is shared across all server slots. The hash pool is a map from n-gram hash to the next token (not the next m-gram as in ngram-map).
|
||||
|
||||
## Command-Line Options
|
||||
|
||||
If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.
|
||||
|
||||
```
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]
|
||||
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
|
||||
(env: LLAMA_ARG_DRAFT_MAX)
|
||||
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
|
||||
(default: 0)
|
||||
(env: LLAMA_ARG_DRAFT_MIN)
|
||||
[...]
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
type of speculative decoding to use when no draft model is provided
|
||||
(default: none)
|
||||
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
|
||||
@@ -78,6 +135,7 @@ Specifies a type of speculative decoding without draft model.
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
|
||||
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
|
||||
| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
|
||||
|
||||
**Example:** Server-instance used to refactor source code.
|
||||
```bash
|
||||
@@ -112,9 +170,15 @@ statistics ngram_simple: #calls = 15, #gen drafts = 5, #acc drafts = 5, #gen tok
|
||||
statistics draft: #calls = 10, #gen drafts = 10, #acc drafts = 10, #gen tokens = 110, #acc tokens = 98
|
||||
```
|
||||
|
||||
```
|
||||
draft acceptance rate = 0.70312 ( 90 accepted / 128 generated)
|
||||
statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
|
||||
```
|
||||
|
||||
- `#calls`: number of calls of this implementations
|
||||
- `#gen drafts`: number of drafts generated by this implementation
|
||||
- `#acc drafts`: number of drafts accepted (partially) by the main model
|
||||
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
|
||||
- `#acc tokens`: number of tokens accepted by the main model
|
||||
- `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Migration notice for binary filenames
|
||||
|
||||
> [!IMPORTANT]
|
||||
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
|
||||
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggml-org/llama.cpp/pull/7809)
|
||||
|
||||
This migration was important, but it is a breaking change that may not always be immediately obvious to users.
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ int main(int argc, char** argv) {
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
|
||||
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
|
||||
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
|
||||
fprintf(stdout, " See https://github.com/ggml-org/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
return EXIT_FAILURE;
|
||||
|
||||
@@ -402,7 +402,7 @@ class SchemaConverter:
|
||||
Transforms a regular expression pattern into a GBNF rule.
|
||||
|
||||
Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
|
||||
Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
|
||||
Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md
|
||||
|
||||
Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
|
||||
|
||||
|
||||
@@ -18,13 +18,14 @@ CONTEXT=4096
|
||||
#support malloc device memory more than 4GB.
|
||||
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
|
||||
LOAD_MODE='--mmap'
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
|
||||
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
|
||||
fi
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2025 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# If you want more control, DPC++ Allows selecting a specific device through the
|
||||
# following environment variable
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
|
||||
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
|
||||
CONTEXT=4096
|
||||
|
||||
#support malloc device memory more than 4GB.
|
||||
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "Using $GGML_SYCL_DEVICE as the main GPU"
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
|
||||
fi
|
||||
Executable
+130
@@ -0,0 +1,130 @@
|
||||
#!/bin/bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
Help() {
|
||||
cat << EOF
|
||||
Usage: $(basename "$0") [OPTIONS]
|
||||
|
||||
This script processes files with specified options.
|
||||
|
||||
Options:
|
||||
-h, --help Display this help message and exit.
|
||||
-c, --context <value> Set context length. Bigger need more memory.
|
||||
-p, --promote <value> Prompt to start generation with.
|
||||
-m, --model <value> Full model file path.
|
||||
-mg,--main-gpu <value> Set main GPU ID (0 - n) for single GPU mode.
|
||||
-sm,--split-mode <value> How to split the model across multiple GPUs, one of:
|
||||
- none: use one GPU only
|
||||
- layer (default): split layers and KV across GPUs
|
||||
- row: split rows across GPUs
|
||||
-ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: -1)
|
||||
-lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
ignored. Values:
|
||||
- 0: generic output
|
||||
- 1: error
|
||||
- 2: warning
|
||||
- 3: info
|
||||
- 4: debug
|
||||
|
||||
|
||||
EOF
|
||||
}
|
||||
|
||||
BIN_FILE=./build/bin/llama-completion
|
||||
SEED=0
|
||||
GPUS_SETTING=""
|
||||
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
|
||||
NGL=99
|
||||
CONTEXT=4096
|
||||
GGML_SYCL_DEVICE=-1
|
||||
SPLIT_MODE=layer
|
||||
LOG_VERBOSE=3
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
-c|--context)
|
||||
CONTEXT=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-p|--promote)
|
||||
# Option that is a simple flag (boolean)
|
||||
INPUT_PROMPT="$2"
|
||||
# Shift once to consume the option flag
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-m|--model)
|
||||
MODEL_FILE="$2"
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-mg|--main-gpu)
|
||||
GGML_SYCL_DEVICE=$2
|
||||
SPLIT_MODE=none
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-sm|--split-mode)
|
||||
SPLIT_MODE=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-ngl|--n-gpu-layers)
|
||||
NGL=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-lv|--log-verbosity)
|
||||
LOG_VERBOSE=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
Help
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
# Handle unknown options or stop processing options
|
||||
echo "Invalid option: $1"
|
||||
# Optional: exit script or shift to treat remaining as positional args
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
|
||||
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
#support malloc device memory more than 4GB.
|
||||
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
|
||||
|
||||
if [ $GGML_SYCL_DEVICE -ne -1 ]; then
|
||||
echo "Use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
|
||||
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
|
||||
else
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
fi
|
||||
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
|
||||
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
|
||||
|
||||
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
|
||||
:: support malloc device memory more than 4GB.
|
||||
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
|
||||
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0
|
||||
set LOAD_MODE="--mmap"
|
||||
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
|
||||
|
||||
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
|
||||
:: support malloc device memory more than 4GB.
|
||||
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
|
||||
.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99
|
||||
set LOAD_MODE="--mmap"
|
||||
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
|
||||
project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
* Copyright (c) 2023-2026 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
|
||||
+1
-1
@@ -6,7 +6,7 @@
|
||||
// This documentation is still a work in progress.
|
||||
// If you wish some specific topics to be covered, feel free to drop a comment:
|
||||
//
|
||||
// https://github.com/ggerganov/whisper.cpp/issues/40
|
||||
// https://github.com/ggml-org/whisper.cpp/issues/40
|
||||
//
|
||||
// ## Overview
|
||||
//
|
||||
|
||||
@@ -258,6 +258,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
|
||||
if (backend->iface.set_tensor_async == NULL) {
|
||||
ggml_backend_synchronize(backend);
|
||||
ggml_backend_tensor_set(tensor, data, offset, size);
|
||||
} else {
|
||||
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
|
||||
@@ -271,6 +272,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
|
||||
if (backend->iface.get_tensor_async == NULL) {
|
||||
ggml_backend_synchronize(backend);
|
||||
ggml_backend_tensor_get(tensor, data, offset, size);
|
||||
} else {
|
||||
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
* Copyright (c) 2023-2026 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
* Copyright (c) 2023-2026 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
* Copyright (c) 2023-2026 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
* Copyright (c) 2023-2026 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
* Copyright (c) 2023-2026 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2023-2024 The ggml authors
|
||||
* Copyright (c) 2023-2026 The ggml authors
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
|
||||
@@ -1,8 +1,20 @@
|
||||
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
|
||||
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
|
||||
|
||||
if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}" OR NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
|
||||
message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT and HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
|
||||
if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}")
|
||||
message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT point to the correct Hexagon SDK installation.")
|
||||
endif()
|
||||
|
||||
if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
|
||||
message("Try to read HEXAGON_TOOLS_ROOT from hexagon_sdk.json")
|
||||
file(READ "${HEXAGON_SDK_ROOT}/hexagon_sdk.json" HEXAGON_SDK_CONFIG_PATH)
|
||||
string(JSON HEXAGON_TOOLS_PATH GET ${HEXAGON_SDK_CONFIG_PATH} "root" "tools" "info" 0 "path")
|
||||
message("Found HEXAGON_TOOLS_PATH: ${HEXAGON_TOOLS_PATH}")
|
||||
set(HEXAGON_TOOLS_ROOT "${HEXAGON_SDK_ROOT}/${HEXAGON_TOOLS_PATH}")
|
||||
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
|
||||
if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
|
||||
message(FATAL_ERROR "Make sure HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels")
|
||||
|
||||
@@ -17,6 +17,12 @@
|
||||
#include "htp-msg.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
static inline HVX_Vector hvx_load_f32_to_f16(const HVX_Vector * restrict src, const HVX_Vector zero) {
|
||||
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(src[0], zero); // 32 elements
|
||||
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(src[1], zero); // 32 elements
|
||||
return Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
|
||||
}
|
||||
|
||||
// Dot product of FP32 and FP16 vectors, accumulating to float
|
||||
static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) {
|
||||
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
|
||||
@@ -33,23 +39,19 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
|
||||
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
|
||||
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
|
||||
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
|
||||
|
||||
// Load x (fp16)
|
||||
HVX_Vector x_hf = vx[i];
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
|
||||
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
|
||||
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
|
||||
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
|
||||
|
||||
// Load x (fp16)
|
||||
HVX_Vector x_hf = vx[i];
|
||||
@@ -62,13 +64,72 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
|
||||
}
|
||||
|
||||
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
|
||||
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
|
||||
rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
|
||||
hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
|
||||
}
|
||||
|
||||
hvx_vec_store_u(r, 4, rsum);
|
||||
// Dot product of FP32 and FP16 vectors, accumulating to float
|
||||
static inline void hvx_dot_f32_f16_aa_rx2(float * restrict r,
|
||||
const void * restrict y,
|
||||
const void * restrict x0,
|
||||
const void * restrict x1,
|
||||
unsigned int n,
|
||||
float s) {
|
||||
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
|
||||
const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16
|
||||
const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16
|
||||
|
||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum0 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum1 = Q6_V_vsplat_R(0);
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(2)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
|
||||
// Load x (fp16)
|
||||
HVX_Vector x0_hf = vx0[i];
|
||||
HVX_Vector x1_hf = vx1[i];
|
||||
|
||||
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
|
||||
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
|
||||
|
||||
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
|
||||
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
// Load y (fp32) and convert into fp16
|
||||
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
|
||||
|
||||
// Load x (fp16)
|
||||
HVX_Vector x0_hf = vx0[i];
|
||||
HVX_Vector x1_hf = vx1[i];
|
||||
|
||||
// Zero-out unused elements
|
||||
// Note that we need to clear both x and y because they may contain NANs
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
|
||||
x0_hf = Q6_V_vand_QV(bmask, x0_hf);
|
||||
x1_hf = Q6_V_vand_QV(bmask, x1_hf);
|
||||
y_hf = Q6_V_vand_QV(bmask, y_hf);
|
||||
|
||||
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
|
||||
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
|
||||
|
||||
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
|
||||
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
|
||||
}
|
||||
|
||||
HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
|
||||
hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
|
||||
}
|
||||
|
||||
// Dot product of two F16 vectors, accumulating to float
|
||||
@@ -91,7 +152,7 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
@@ -103,12 +164,62 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
|
||||
|
||||
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
|
||||
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
|
||||
}
|
||||
|
||||
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
|
||||
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
|
||||
hvx_vec_store_u(r, 4, rsum);
|
||||
rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
|
||||
hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
|
||||
}
|
||||
|
||||
static inline void hvx_dot_f16_f16_aa_rx2(float * restrict r,
|
||||
const void * restrict y,
|
||||
const void * restrict x0,
|
||||
const void * restrict x1,
|
||||
unsigned int n,
|
||||
float s) {
|
||||
const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16
|
||||
const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16
|
||||
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp16
|
||||
|
||||
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
|
||||
uint32_t nloe = n % VLEN_FP16; // leftover elements
|
||||
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum0 = Q6_V_vsplat_R(0);
|
||||
HVX_Vector rsum1 = Q6_V_vsplat_R(0);
|
||||
|
||||
uint32_t i = 0;
|
||||
|
||||
#pragma unroll(4)
|
||||
for (i = 0; i < nvec; i++) {
|
||||
HVX_Vector y_hf = vy[i];
|
||||
HVX_Vector x0_hf = vx0[i];
|
||||
HVX_Vector x1_hf = vx1[i];
|
||||
|
||||
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
|
||||
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
|
||||
|
||||
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
|
||||
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
HVX_Vector y_hf = vy[i];
|
||||
|
||||
// Load x (fp16) and zero-out unused elements
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
|
||||
HVX_Vector x0_hf = Q6_V_vand_QV(bmask, vx0[i]);
|
||||
HVX_Vector x1_hf = Q6_V_vand_QV(bmask, vx1[i]);
|
||||
|
||||
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
|
||||
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
|
||||
|
||||
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
|
||||
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
|
||||
}
|
||||
|
||||
HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
|
||||
hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
|
||||
}
|
||||
|
||||
// MAD: y (F32) += x (F16) * s (float)
|
||||
@@ -317,20 +428,22 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
|
||||
// Inner loop processing the block from VTCM
|
||||
uint32_t ic = 0;
|
||||
|
||||
const bool is_q_fp32 = (q->type == HTP_TYPE_F32);
|
||||
|
||||
// Process in blocks of 32 (VLEN_FP32)
|
||||
static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 == 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
|
||||
static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 <= 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
|
||||
HVX_Vector_x4 scores_x4;
|
||||
HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
|
||||
for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
|
||||
// 1. Compute scores
|
||||
float __attribute__((aligned(VLEN))) scores_arr[FLASH_ATTN_BLOCK_SIZE];
|
||||
for (int j = 0; j < VLEN_FP32; ++j) {
|
||||
float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32];
|
||||
for (int j = 0; j < VLEN_FP32; j += 2) {
|
||||
const uint32_t cur_ic = ic + j;
|
||||
const uint8_t * k_ptr = k_base + cur_ic * size_k_row_padded;
|
||||
if (q->type == HTP_TYPE_F32) {
|
||||
hvx_dot_f32_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
|
||||
if (is_q_fp32) {
|
||||
hvx_dot_f32_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
|
||||
} else {
|
||||
hvx_dot_f16_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
|
||||
hvx_dot_f16_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -403,7 +516,7 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
|
||||
float s_val;
|
||||
const uint8_t * k_ptr = k_base + ic * size_k_row_padded;
|
||||
|
||||
if (q->type == HTP_TYPE_F32) {
|
||||
if (is_q_fp32) {
|
||||
hvx_dot_f32_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
|
||||
} else {
|
||||
hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
|
||||
|
||||
@@ -28,19 +28,16 @@ static void hvx_vec_dump_f16(char * pref, HVX_Vector v) {
|
||||
}
|
||||
|
||||
static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) {
|
||||
union {
|
||||
HVX_Vector v;
|
||||
float d[32];
|
||||
} u = { .v = v };
|
||||
HVX_VectorAlias u = { .v = v };
|
||||
|
||||
const uint32_t n0 = n / 16;
|
||||
const uint32_t n1 = n % 16;
|
||||
int i = 0;
|
||||
for (; i < n0; i++) {
|
||||
hex_dump_f32_line(pref, u.d + (16 * i), 16);
|
||||
hex_dump_f32_line(pref, u.fp32 + (16 * i), 16);
|
||||
}
|
||||
if (n1) {
|
||||
hex_dump_f32_line(pref, u.d + (16 * i), n1);
|
||||
hex_dump_f32_line(pref, u.fp32 + (16 * i), n1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -44,6 +44,45 @@ static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) {
|
||||
return hvx_vec_reduce_sum_n_qf32(in, 32);
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ > 75
|
||||
|
||||
static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
|
||||
HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
|
||||
HVX_Vector sum_sf = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
|
||||
|
||||
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 2));
|
||||
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 4));
|
||||
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 8));
|
||||
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 16));
|
||||
return sum_sf;
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
|
||||
unsigned int total = n * 4; // total vec nbytes
|
||||
unsigned int width = 4; // fp32 nbytes
|
||||
|
||||
HVX_Vector sum = in, sum_t;
|
||||
while (width < total) {
|
||||
sum_t = Q6_V_vror_VR(sum, width); // rotate right
|
||||
sum = Q6_Vsf_vadd_VsfVsf(sum, sum_t); // elementwise sum
|
||||
width = width << 1;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
|
||||
HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
|
||||
HVX_Vector sum_qf = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
|
||||
|
||||
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 2));
|
||||
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 4));
|
||||
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 8));
|
||||
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 16));
|
||||
return Q6_Vsf_equals_Vqf32(sum_qf);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
|
||||
unsigned int total = n * 4; // total vec nbytes
|
||||
unsigned int width = 4; // fp32 nbytes
|
||||
@@ -57,6 +96,8 @@ static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n)
|
||||
return sum;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) {
|
||||
return hvx_vec_reduce_sum_n_f32(in, 32);
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "hvx-dump.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
@@ -320,7 +321,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (qf32)
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
@@ -344,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
@@ -362,14 +363,14 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
|
||||
// Zero out unused scales
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Reduce and convert into fp32
|
||||
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
|
||||
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
|
||||
|
||||
hvx_vec_store_u(&s[0], 4, r0_sum);
|
||||
}
|
||||
@@ -402,7 +403,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (qf32)
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
||||
|
||||
@@ -432,8 +433,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
@@ -456,20 +457,18 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Convert into fp32 and reduce
|
||||
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
|
||||
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
|
||||
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
|
||||
|
||||
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
|
||||
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
|
||||
hvx_vec_store_u(&s[0], 8, rsum);
|
||||
}
|
||||
|
||||
static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
@@ -493,7 +492,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (qf32)
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
@@ -517,7 +516,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
@@ -535,14 +534,14 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
|
||||
// Zero out unused scales
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Reduce and convert into fp32
|
||||
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
|
||||
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
|
||||
|
||||
hvx_vec_store_u(&s[0], 4, r0_sum);
|
||||
}
|
||||
@@ -605,8 +604,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
|
||||
@@ -629,20 +628,18 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Convert into fp32 and reduce
|
||||
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
|
||||
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
|
||||
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
|
||||
|
||||
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
|
||||
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
|
||||
hvx_vec_store_u(&s[0], 8, rsum);
|
||||
}
|
||||
|
||||
static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
|
||||
@@ -669,7 +666,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (qf32)
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
@@ -708,7 +705,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Process leftovers
|
||||
@@ -741,14 +738,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
|
||||
// Zero-out unused scales
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
}
|
||||
|
||||
// Reduce and convert into fp32
|
||||
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
|
||||
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
|
||||
|
||||
hvx_vec_store_u(&s[0], 4, r0_sum);
|
||||
}
|
||||
@@ -781,13 +778,13 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
|
||||
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
|
||||
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
|
||||
|
||||
// Row sum (qf32)
|
||||
// Row sum (sf)
|
||||
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
|
||||
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
|
||||
|
||||
// Multiply and accumulate into int32.
|
||||
// Compute combined scale (fp32).
|
||||
// Apply scale to acc and accumulate into the row sum (qf32).
|
||||
// Apply scale to acc and accumulate into the row sum (f32).
|
||||
|
||||
const uint32_t nb = n / qk; // num full blocks
|
||||
int32_t nloe = n % qk; // num leftover elemements (must be signed)
|
||||
@@ -829,8 +826,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Process leftovers
|
||||
@@ -867,24 +864,22 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
|
||||
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
|
||||
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
|
||||
|
||||
// Zero-out unused scales
|
||||
// Zero-out unused values
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
|
||||
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
|
||||
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
|
||||
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
|
||||
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
|
||||
|
||||
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
|
||||
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
|
||||
|
||||
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
|
||||
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
|
||||
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
|
||||
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
|
||||
}
|
||||
|
||||
// Convert into fp32 and reduce
|
||||
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
|
||||
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
|
||||
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
|
||||
|
||||
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
|
||||
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
|
||||
hvx_vec_store_u(&s[0], 8, rsum);
|
||||
}
|
||||
|
||||
static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
@@ -913,7 +908,7 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
}
|
||||
|
||||
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
|
||||
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
|
||||
hvx_vec_store_u(&s[0], 4, rsum);
|
||||
}
|
||||
|
||||
@@ -957,11 +952,8 @@ static void vec_dot_f16_f16_aa_rx2(const int n,
|
||||
rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)));
|
||||
}
|
||||
|
||||
rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum0));
|
||||
rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum1));
|
||||
HVX_VectorPair p0 = Q6_W_vshuff_VVR(rsum1, rsum0, 4);
|
||||
|
||||
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
|
||||
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(Q6_Vsf_equals_Vqf32(rsum0), Q6_Vsf_equals_Vqf32(rsum1));
|
||||
hvx_vec_store_u(&s[0], 8, rsum);
|
||||
}
|
||||
|
||||
static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
@@ -990,7 +982,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
}
|
||||
|
||||
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
|
||||
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
|
||||
hvx_vec_store_u(&s[0], 4, rsum);
|
||||
}
|
||||
|
||||
@@ -1042,7 +1034,8 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res
|
||||
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
|
||||
}
|
||||
|
||||
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
|
||||
// Convert into fp32 and reduce
|
||||
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
|
||||
hvx_vec_store_u(&s[0], 4, rsum);
|
||||
}
|
||||
|
||||
|
||||
@@ -154,8 +154,8 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
|
||||
v_pad[i] = v3;
|
||||
}
|
||||
|
||||
v = hvx_vec_reduce_sum_qf32(sum_vec);
|
||||
sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));
|
||||
v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec));
|
||||
sum_vec = hvx_vec_repl4(v);
|
||||
|
||||
HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
|
||||
HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec);
|
||||
|
||||
@@ -57,8 +57,8 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
|
||||
}
|
||||
|
||||
HVX_Vector reduced_sum = hvx_vec_reduce_sum_qf32(sum_v);
|
||||
sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum));
|
||||
HVX_Vector reduced_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
|
||||
sum_v = hvx_vec_repl4(reduced_sum);
|
||||
|
||||
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
|
||||
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);
|
||||
|
||||
@@ -71,7 +71,7 @@ else()
|
||||
# disabling fast math is needed in order to pass tests/test-backend-ops
|
||||
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
|
||||
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
|
||||
# ref: https://github.com/ggml-org/whisper.cpp/issues/1720
|
||||
# note: adding -g causes segmentation fault during compile
|
||||
#set(XC_FLAGS -fno-fast-math -fno-inline -g)
|
||||
set(XC_FLAGS -fno-fast-math -fno-inline)
|
||||
|
||||
@@ -3740,7 +3740,7 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
|
||||
// Reuse extra of the parent tensor. The offset of this view tensor
|
||||
// becomes `extra->offset + view_offs` and needs to be calculated when
|
||||
// it is used. This changes is needed because of the change to
|
||||
// ggml_alloc.c in https://github.com/ggerganov/llama.cpp/pull/7640.
|
||||
// ggml_alloc.c in https://github.com/ggml-org/llama.cpp/pull/7640.
|
||||
// `buffer` passed in here will always be `tensor->buffer`. It is OK
|
||||
// to allocate extras from the same buffer context for ordinary
|
||||
// intermediate tensors. But for views into kv cache tensors, doing so
|
||||
|
||||
@@ -3390,7 +3390,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
|
||||
|
||||
// mmvq and mmq need the __dp4a instruction which is available for gen12+
|
||||
// Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
|
||||
// Workaround in https://github.com/ggml-org/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
|
||||
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
|
||||
#ifdef SYCL_USE_XMX
|
||||
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
||||
|
||||
@@ -330,7 +330,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
|
||||
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, in_path, "-o", out_path};
|
||||
#endif
|
||||
|
||||
// disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734
|
||||
// disable spirv-opt for coopmat shaders for https://github.com/ggml-org/llama.cpp/issues/10734
|
||||
// disable spirv-opt for bf16 shaders for https://github.com/ggml-org/llama.cpp/issues/15344
|
||||
// disable spirv-opt for rope shaders for https://github.com/ggml-org/llama.cpp/issues/16860
|
||||
if (!coopmat && name.find("bf16") == std::string::npos && name.find("rope") == std::string::npos) {
|
||||
|
||||
@@ -146,8 +146,13 @@ struct webgpu_submission_futures {
|
||||
struct webgpu_buf_pool {
|
||||
std::vector<webgpu_pool_bufs> free;
|
||||
|
||||
std::mutex mutex;
|
||||
|
||||
// The pool must be synchronized because
|
||||
// 1. The memset pool is shared globally by every ggml buffer,
|
||||
// since allocating a pool per ggml buffer would consume too much memory.
|
||||
// 2. For the per-thread buffer pools in webgpu_context,
|
||||
// buffers are allocated and freed in Dawn callbacks,
|
||||
// which can run on a different thread than the calling thread.
|
||||
std::mutex mutex;
|
||||
std::condition_variable cv;
|
||||
|
||||
void init(wgpu::Device device,
|
||||
@@ -266,7 +271,7 @@ struct webgpu_command {
|
||||
#endif
|
||||
};
|
||||
|
||||
struct webgpu_capabilities_base {
|
||||
struct webgpu_capabilities {
|
||||
wgpu::Limits limits;
|
||||
bool supports_subgroup_matrix = false;
|
||||
|
||||
@@ -286,11 +291,11 @@ struct webgpu_global_context_struct {
|
||||
wgpu::Device device;
|
||||
wgpu::Queue queue;
|
||||
|
||||
webgpu_capabilities_base capabilities;
|
||||
webgpu_capabilities capabilities;
|
||||
// Shared buffer to move data from device to host
|
||||
wgpu::Buffer get_tensor_staging_buf;
|
||||
wgpu::Buffer get_tensor_staging_buf;
|
||||
// Global mutex for pipeline and staging buffer, will be refactored to exclude pipeline caches.
|
||||
std::recursive_mutex mutex;
|
||||
std::recursive_mutex mutex;
|
||||
|
||||
webgpu_buf_pool memset_buf_pool;
|
||||
std::map<int, webgpu_pipeline> memset_pipelines; // variant or type index
|
||||
@@ -361,7 +366,6 @@ struct webgpu_context_struct {
|
||||
std::unordered_map<ggml_webgpu_pad_pipeline_key, webgpu_pipeline, ggml_webgpu_pad_pipeline_key_hash> pad_pipelines;
|
||||
|
||||
size_t memset_bytes_per_thread;
|
||||
|
||||
};
|
||||
|
||||
typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
|
||||
@@ -383,9 +387,8 @@ struct ggml_backend_webgpu_device_context {
|
||||
|
||||
// Per-thread data required to actually run WebGPU operations in a backend instance
|
||||
struct ggml_backend_webgpu_context {
|
||||
webgpu_context webgpu_ctx;
|
||||
std::once_flag init_once;
|
||||
std::string name;
|
||||
webgpu_context webgpu_ctx;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
// Per-thread data related to buffers
|
||||
@@ -861,20 +864,15 @@ static webgpu_command ggml_webgpu_pad(webgpu_context & ctx, ggml_tensor * src, g
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline;
|
||||
{
|
||||
// TODO: remove guard once pipeline caches are per-thread
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
auto it = ctx->pad_pipelines.find(pipeline_key);
|
||||
if (it != ctx->pad_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_pad_shader(ctx->p, wgsl_pad, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->pad_pipelines.emplace(pipeline_key, pipeline);
|
||||
}
|
||||
auto it = ctx->pad_pipelines.find(pipeline_key);
|
||||
if (it != ctx->pad_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed = ggml_webgpu_preprocess_pad_shader(ctx->p, wgsl_pad, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->pad_pipelines.emplace(pipeline_key, pipeline);
|
||||
}
|
||||
|
||||
ggml_webgpu_generic_shader_decisions decisions =
|
||||
@@ -944,20 +942,16 @@ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline;
|
||||
// TODO: remove guard once pipeline caches are per-thread
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
auto it = ctx->set_rows_pipelines.find(key);
|
||||
if (it != ctx->set_rows_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_set_rows_shader(ctx->p, wgsl_set_rows, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->set_rows_pipelines.emplace(key, pipeline);
|
||||
}
|
||||
auto it = ctx->set_rows_pipelines.find(key);
|
||||
if (it != ctx->set_rows_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_set_rows_shader(ctx->p, wgsl_set_rows, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->set_rows_pipelines.emplace(key, pipeline);
|
||||
}
|
||||
|
||||
ggml_webgpu_generic_shader_decisions decisions =
|
||||
@@ -1261,29 +1255,25 @@ static webgpu_command ggml_webgpu_flash_attn(webgpu_context & ctx,
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline;
|
||||
// TODO: remove guard once pipeline caches are per-thread
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
auto it = ctx->flash_attn_pipelines.find(key);
|
||||
if (it != ctx->flash_attn_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_flash_attn_shader_lib_context shader_lib_ctx = {
|
||||
.key = key,
|
||||
.sg_mat_m = ctx->global_ctx->capabilities.sg_mat_m,
|
||||
.sg_mat_n = ctx->global_ctx->capabilities.sg_mat_n,
|
||||
.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k,
|
||||
.wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize,
|
||||
.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size
|
||||
};
|
||||
auto it = ctx->flash_attn_pipelines.find(key);
|
||||
if (it != ctx->flash_attn_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_flash_attn_shader_lib_context shader_lib_ctx = {
|
||||
.key = key,
|
||||
.sg_mat_m = ctx->global_ctx->capabilities.sg_mat_m,
|
||||
.sg_mat_n = ctx->global_ctx->capabilities.sg_mat_n,
|
||||
.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k,
|
||||
.wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize,
|
||||
.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size
|
||||
};
|
||||
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_flash_attn_shader(ctx->p, wgsl_flash_attn, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->flash_attn_pipelines.emplace(key, pipeline);
|
||||
}
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_flash_attn_shader(ctx->p, wgsl_flash_attn, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->flash_attn_pipelines.emplace(key, pipeline);
|
||||
}
|
||||
|
||||
ggml_webgpu_flash_attn_shader_decisions decisions =
|
||||
@@ -1308,20 +1298,16 @@ static webgpu_command ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * s
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline;
|
||||
{
|
||||
// TODO: remove guard once pipeline caches are per-thread
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
auto it = ctx->unary_pipelines.find(pipeline_key);
|
||||
if (it != ctx->unary_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_unary_shader(ctx->p, wgsl_unary, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->unary_pipelines.emplace(pipeline_key, pipeline);
|
||||
}
|
||||
auto it = ctx->unary_pipelines.find(pipeline_key);
|
||||
if (it != ctx->unary_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_unary_shader(ctx->p, wgsl_unary, shader_lib_ctx);
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
pipeline.context = processed.decisions;
|
||||
ctx->unary_pipelines.emplace(pipeline_key, pipeline);
|
||||
}
|
||||
|
||||
ggml_webgpu_generic_shader_decisions decisions =
|
||||
@@ -1743,19 +1729,15 @@ static webgpu_command ggml_webgpu_argmax(webgpu_context & ctx, ggml_tensor * src
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline;
|
||||
{
|
||||
// TODO: remove guard once pipeline caches are per-thread
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
auto it = ctx->argmax_pipelines.find(shader_lib_ctx.vec4);
|
||||
if (it != ctx->argmax_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_argmax, shader_lib_ctx, "argmax");
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
ctx->argmax_pipelines.emplace(shader_lib_ctx.vec4, pipeline);
|
||||
}
|
||||
auto it = ctx->argmax_pipelines.find(shader_lib_ctx.vec4);
|
||||
if (it != ctx->argmax_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_argmax, shader_lib_ctx, "argmax");
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
ctx->argmax_pipelines.emplace(shader_lib_ctx.vec4, pipeline);
|
||||
}
|
||||
uint32_t wg_x = ggml_nelements(dst);
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
|
||||
@@ -1772,9 +1754,8 @@ static webgpu_command ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * sr
|
||||
.order = order
|
||||
};
|
||||
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
webgpu_pipeline argsort_pipeline;
|
||||
auto it = ctx->argsort_pipelines.find(order);
|
||||
webgpu_pipeline argsort_pipeline;
|
||||
auto it = ctx->argsort_pipelines.find(order);
|
||||
if (it != ctx->argsort_pipelines.end()) {
|
||||
argsort_pipeline = it->second;
|
||||
} else {
|
||||
@@ -1963,19 +1944,15 @@ static webgpu_command ggml_webgpu_cumsum(webgpu_context & ctx, ggml_tensor * src
|
||||
.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup,
|
||||
};
|
||||
webgpu_pipeline pipeline;
|
||||
// TODO: remove guard once pipeline caches are per-thread
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
auto it = ctx->cumsum_pipelines.find(1);
|
||||
if (it != ctx->cumsum_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_cumsum, shader_lib_ctx, "cumsum");
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
ctx->cumsum_pipelines.emplace(1, pipeline);
|
||||
}
|
||||
auto it = ctx->cumsum_pipelines.find(1);
|
||||
if (it != ctx->cumsum_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_cumsum, shader_lib_ctx, "cumsum");
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
ctx->cumsum_pipelines.emplace(1, pipeline);
|
||||
}
|
||||
uint32_t wg_x = ggml_nrows(dst);
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
|
||||
@@ -2009,19 +1986,15 @@ static webgpu_command ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor * s
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline;
|
||||
{
|
||||
// TODO: remove guard once pipeline caches are per-thread
|
||||
std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
|
||||
auto it = ctx->sum_rows_pipelines.find(1);
|
||||
if (it != ctx->sum_rows_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_sum_rows, shader_lib_ctx, "sum_rows");
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
ctx->sum_rows_pipelines.emplace(1, pipeline);
|
||||
}
|
||||
auto it = ctx->sum_rows_pipelines.find(1);
|
||||
if (it != ctx->sum_rows_pipelines.end()) {
|
||||
pipeline = it->second;
|
||||
} else {
|
||||
ggml_webgpu_processed_shader processed =
|
||||
ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_sum_rows, shader_lib_ctx, "sum_rows");
|
||||
pipeline =
|
||||
ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
|
||||
ctx->sum_rows_pipelines.emplace(1, pipeline);
|
||||
}
|
||||
uint32_t wg_x = total_sum ? 1 : ggml_nrows(dst);
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
|
||||
@@ -3016,10 +2989,10 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
// Initialize buffer pool for timestamp queries, used for profiling
|
||||
ctx->webgpu_global_ctx->timestamp_query_buf_pool.init(ctx->webgpu_global_ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS,
|
||||
WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
|
||||
wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
|
||||
wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
|
||||
ctx->webgpu_global_ctx->timestamp_query_buf_pool.init(
|
||||
ctx->webgpu_global_ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
|
||||
wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
|
||||
wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
|
||||
#endif
|
||||
|
||||
GGML_LOG_INFO(
|
||||
|
||||
+1
-1
@@ -6562,7 +6562,7 @@ static void ggml_compute_backward(
|
||||
case GGML_OP_DIAG_MASK_INF: {
|
||||
if (src0_needs_grads) {
|
||||
/* ggml_diag_mask_inf_impl() shouldn't be here */
|
||||
/* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
|
||||
/* ref: https://github.com/ggml-org/llama.cpp/pull/4203#discussion_r1412377992 */
|
||||
const int n_past = ((const int32_t *) tensor->op_params)[0];
|
||||
ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
|
||||
}
|
||||
|
||||
+1
-1
@@ -233,7 +233,7 @@ int32_t llm_chat_apply_template(
|
||||
llm_chat_template tmpl,
|
||||
const std::vector<const llama_chat_message *> & chat,
|
||||
std::string & dest, bool add_ass) {
|
||||
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
||||
// Taken from the research: https://github.com/ggml-org/llama.cpp/issues/5527
|
||||
std::stringstream ss;
|
||||
if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
|
||||
// chatml template
|
||||
|
||||
+1
-1
@@ -195,7 +195,7 @@ struct llama_hparams {
|
||||
uint32_t n_deepstack_layers = 0;
|
||||
|
||||
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
|
||||
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|
||||
uint32_t dec_n_layer = 0;
|
||||
|
||||
|
||||
+4
-4
@@ -90,7 +90,7 @@ static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not
|
||||
//
|
||||
// SPM tokenizer
|
||||
// original implementation:
|
||||
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
|
||||
// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
|
||||
//
|
||||
|
||||
struct llm_bigram_spm {
|
||||
@@ -285,7 +285,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
// original regex from tokenizer.json
|
||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
|
||||
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
|
||||
// adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
@@ -2390,7 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
|
||||
// maintain a list of tokens that cause end-of-generation
|
||||
// this is currently determined based on the token text, which is obviously not ideal
|
||||
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/9606
|
||||
special_eog_ids.clear();
|
||||
|
||||
if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
|
||||
@@ -3079,7 +3079,7 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
||||
}
|
||||
|
||||
int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
|
||||
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
|
||||
const llama_token_attr attr = token_get_attr(token);
|
||||
if (!special && (attr & attr_special)) {
|
||||
|
||||
@@ -14,7 +14,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
||||
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
||||
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
||||
// See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
|
||||
// And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
|
||||
|
||||
// first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
|
||||
// ref: https://github.com/ggml-org/llama.cpp/issues/4952#issuecomment-1892864763
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
@@ -290,7 +290,7 @@ static void power_iteration(
|
||||
ggml_gallocr_free(allocr);
|
||||
|
||||
// TODO @ngxson : The output vector is randomly inverted
|
||||
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
|
||||
// Solution: https://github.com/ggml-org/llama.cpp/pull/8069#issuecomment-2185328171
|
||||
}
|
||||
|
||||
static void run_pca(
|
||||
|
||||
@@ -190,7 +190,7 @@ struct lora_merge_ctx {
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
|
||||
|
||||
// check if all lora adapters have the same tensors
|
||||
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
|
||||
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggml-org/llama.cpp/pull/8607#discussion_r1686027777
|
||||
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
|
||||
if (adapters.size() > 1) {
|
||||
for (size_t i = 1; i < adapters.size(); ++i) {
|
||||
|
||||
@@ -29,7 +29,7 @@ In addition to the KL divergence the following statistics are calculated with `-
|
||||
* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
|
||||
* Pearson correlation coefficient of the "correct" token probabilites between models.
|
||||
* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
|
||||
* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
|
||||
* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggml-org/llama.cpp/discussions/2875 .
|
||||
* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
|
||||
|
||||
## LLaMA 3 8b Scoreboard
|
||||
|
||||
@@ -1096,7 +1096,7 @@ return html`
|
||||
</section>
|
||||
<footer>
|
||||
<p><${ModelGenerationInfo} /></p>
|
||||
<p>Powered By <a href="https://github.com/ggerganov/llama.cpp#readme" target="_blank">llama.cpp</a> and <a href="https://ggml.ai/" target="_blank">ggml.ai</a></p>
|
||||
<p>Powered By <a href="https://github.com/ggml-org/llama.cpp#readme" target="_blank">llama.cpp</a> and <a href="https://ggml.ai/" target="_blank">ggml.ai</a></p>
|
||||
</footer>
|
||||
</div>
|
||||
`;
|
||||
|
||||
@@ -1281,7 +1281,7 @@
|
||||
|
||||
<footer>
|
||||
<p><${ModelGenerationInfo} /></p>
|
||||
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
|
||||
<p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
|
||||
</footer>
|
||||
</div>
|
||||
`;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/* Author: Yazan Agha-Schrader */
|
||||
/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */
|
||||
/* Inspiration from llama.cpp logo/banner https://github.com/ggml-org/llama.cpp#readme */
|
||||
|
||||
.theme-mangotango {
|
||||
|
||||
|
||||
@@ -1032,7 +1032,7 @@
|
||||
|
||||
<footer>
|
||||
<p><${ModelGenerationInfo} /></p>
|
||||
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
|
||||
<p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
|
||||
</footer>
|
||||
</div>
|
||||
`;
|
||||
|
||||
@@ -1036,7 +1036,7 @@
|
||||
|
||||
<footer>
|
||||
<p><${ModelGenerationInfo} /></p>
|
||||
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
|
||||
<p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
|
||||
</footer>
|
||||
</div>
|
||||
`;
|
||||
|
||||
Reference in New Issue
Block a user