Compare commits

...

4 Commits

Author SHA1 Message Date
Johannes Gäßler 59377a6c87 ggml-backend: fix async set/get fallback sync (#19179) 2026-02-02 10:00:05 +01:00
Georgi Gerganov 1239267cc4 authors : update (#19263)
[no ci]
2026-02-02 08:51:25 +02:00
Christian Kastner 7a4ca3cbd9 docs : Minor cleanups (#19252)
* Update old URLs to github.com/ggml-org/

* Bump copyrights
2026-02-02 08:38:55 +02:00
Sascha Rogmann b4d05a3d2f spec : various improvements ton ngram-map + docs (#19253)
* spec: ngram-map and reasoning chats

* spec: add t_begin and t_accept

* ngram-map : add internal hash map

* docs : update ngram-map, add ngram-mod

* docs : fix ngram-map-k

* docs : differences between implementations
2026-02-02 08:26:58 +02:00
42 changed files with 1100 additions and 409 deletions
+749 -336
View File
File diff suppressed because it is too large Load Diff
+1 -1
View File
@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2023-2026 The ggml authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
+191 -13
View File
@@ -7,6 +7,18 @@
#include <cstdio>
#include <sstream>
// prime number used for LCG hash function (32 bit), it is near (sqrt(5) - 1)/2 * 2^32.
#define LCG_FACTOR 2654435761UL
// Compute the LCG hash of a n-gram of size len at offset start.
static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) {
uint32_t hash = 0;
for (size_t i = 0; i < len; ++i) {
hash = hash * LCG_FACTOR + tokens[start + i];
}
return hash;
}
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
@@ -115,6 +127,100 @@ llama_tokens common_ngram_simple_draft(
// maximum number of counted values of a ngram map value.
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
void common_ngram_map_begin(
common_ngram_map & map, const llama_tokens & tokens) {
size_t size_begin = tokens.size();
LOG_DBG("%s: begin, idx_last_draft=%zu, new begin=%zu, #keys=%zu\n", __func__,
map.idx_last_check, size_begin, map.keys.size());
size_t count_map_entries_upd = 0;
if (!map.key_map.empty() && size_begin < map.idx_last_check) {
if (map.show_key_map_stats) {
// Print statistics of hash map map_key.
size_t count_nonzero = 0;
uint32_t min_idx = UINT32_MAX;
uint32_t max_idx = 0;
for (size_t i = 0; i < map.key_map.size(); ++i) {
uint32_t key_idx = map.key_map[i];
if (key_idx != 0) {
++count_nonzero;
if (key_idx < min_idx) min_idx = key_idx;
if (key_idx > max_idx) max_idx = key_idx;
}
}
if (count_nonzero == 0) {
min_idx = 0;
}
LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n",
__func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx);
}
// Update the map from hash to key index (clear outdated entries).
for (size_t i = 0; i < map.key_map.size(); ++i) {
uint32_t key_idx = map.key_map[i];
if (key_idx >= map.size_last_begin) {
map.key_map[i] = 0;
count_map_entries_upd++;
}
}
map.key_map_last_idx = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
}
if (size_begin < map.idx_last_check && !map.keys.empty()) {
// The next token generation will start at index size_begin.
// The tokens between map.size_last_begin and size_begin are no longer valid.
//
// Refresh map: Remove all entries with index >= map.size_last_begin.
size_t count_keys = map.keys.size();
size_t count_keys_del = 0;
size_t count_values_del = 0;
for (int32_t i = map.keys.size() - 1; i >= 0; --i) {
common_ngram_map_key & key = map.keys[i];
if (key.key_idx >= map.size_last_begin) {
// Delete the key.
LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin);
map.keys.erase(map.keys.begin() + i);
count_keys_del++;
continue;
}
if (map.key_only) {
continue;
}
// Check the indices of the values.
for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) {
common_ngram_map_value & value = key.values[j];
if (value.value_idx >= map.size_last_begin) {
// Delete the value.
count_values_del++;
// Move all values after this value to the left.
for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) {
key.values[k] = key.values[k + 1];
}
// Clear the last value.
key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0;
key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0;
}
}
if (key.values[0].value_idx == 0) {
// No values left, delete the key.
LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx);
map.keys.erase(map.keys.begin() + i);
count_keys_del++;
}
}
LOG_INF("%s: refresh map: idx_last_draft=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__,
map.idx_last_check, size_begin,
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
}
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
map.size_last_begin = size_begin;
}
void common_ngram_map_draft(common_ngram_map & map,
const llama_tokens & inp, llama_token sampled,
llama_tokens & draft) {
@@ -129,6 +235,10 @@ void common_ngram_map_draft(common_ngram_map & map,
if (cur_len < static_cast<size_t>(2 * n + m)) {
return;
}
if (cur_len >= static_cast<size_t>(UINT32_MAX)) {
// key_map uses uint32_t instead of size_t.
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
@@ -147,24 +257,92 @@ void common_ngram_map_draft(common_ngram_map & map,
// search for the key in the map
size_t match_pos = 0;
for (size_t j = cur_len - n - m - 1; j > 0; --j) {
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[j + k] != key_tokens[k]) {
match = false;
break;
if (map.size_last_begin > cur_len) {
GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len);
}
if (!map.key_map.empty()) {
// Search for the key in the map key_map from hash of ngrams to index of ngram.
uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size());
uint32_t idx_key = map.key_map[idx_hash];
if (idx_key != 0 && idx_key < cur_len - n - m - 1) {
// Check if the key matches the key at idx_key (because of possible collisions).
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[idx_key + k] != key_tokens[k]) {
match = false;
break;
}
}
LOG_DBG("%s: key hash %x -> idx_key %d: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0);
if (match) {
match_pos = idx_key;
}
}
if (match) {
match_pos = j;
break;
}
if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) {
// Search for the key in [1, map.size_last_begin - n - m -1], descending.
for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
// Check if the key matches the key.
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[j + k] != key_tokens[k]) {
match = false;
break;
}
}
if (match) {
match_pos = j;
break;
}
}
}
if (match_pos == 0) {
// In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later.
//
// Search in [size_last_begin, cur_len - n - m - 1], descending.
for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[j + k] != key_tokens[k]) {
match = false;
break;
}
}
if (match) {
match_pos = j;
break;
}
}
}
if (match_pos > 0) {
LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
cur_len, n, m, key_tokens.size(), sampled, match_pos);
}
if (!map.key_map.empty()) {
// Add hashes of new ngrams in key_map.
//
// Use the same order as above.
if (map.size_last_begin > (size_t) (n + m + 1)) {
for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
// compute hash and store index of ngram at idx j in the map.
uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
if (map.key_map[idx_hash] == 0) {
map.key_map[idx_hash] = j; // collisions may occur
}
}
}
for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
// compute hash and store index of ngram at idx j in the map.
uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
if (map.key_map[idx_hash] == 0) {
map.key_map[idx_hash] = j;
}
}
map.key_map_last_idx = std::max(static_cast<uint32_t>(cur_len - n - m - 1), map.key_map_last_idx);
}
if (match_pos == 0) {
return;
}
@@ -215,8 +393,8 @@ void common_ngram_map_draft(common_ngram_map & map,
draft.push_back(inp[match_pos + n + i]);
}
LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
key_offset, curr_key.key_num, draft.size());
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
map.last_draft_created = false;
map.last_draft_key_idx = key_offset;
@@ -318,7 +496,7 @@ void common_ngram_map_draft(common_ngram_map & map,
}
}
if (sum_occur > 0 && max_occur < 3 * sum_occur) {
if (sum_occur > 0 && max_occur < 2 * sum_occur) {
// The most frequent value is not much more frequent than the other values.
// We do not use the draft.
return;
+30 -5
View File
@@ -9,6 +9,8 @@
// 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
// The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
//
// ref: https://github.com/ggml-org/llama.cpp/pull/18471
//
#include "llama.h"
#include "common.h"
@@ -51,10 +53,13 @@ llama_tokens common_ngram_simple_draft(
// maximum number of m-gram values stored for each key n-gram.
#define COMMON_NGRAM_MAX_VALUES 4
// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index.
#define COMMON_NGRAM_HASH_MAP_SIZE 262144
// statistics of a m-gram after a known n-gram
struct common_ngram_map_value {
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
};
@@ -74,23 +79,43 @@ struct common_ngram_map {
bool key_only; // true if only key n-grams are used, no values.
// first draft: vector only, no map.
std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
uint16_t min_hits; // minimum number of key hits to consider a draft
bool show_key_map_stats = false; // true, if statitics of the key_map should be printed.
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
uint16_t check_rate, uint16_t min_hits)
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
check_rate(check_rate), min_hits(min_hits) {}
check_rate(check_rate), min_hits(min_hits) {
key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
}
// In reasoning chats the previous reasoning block will be removed from context history.
// A rebuild of the ngram map is needed after that.
size_t size_last_begin = 0; // number of tokens at previous start of generation
bool last_draft_created = false; // true if a draft was created at last call.
size_t last_draft_key_idx = 0; // index of last key used for draft generation.
size_t last_draft_key_idx = 0; // index of last key used for draft generation (0 = no draft)
uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
size_t idx_last_check = 0; // index of last check in context history
// optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused.
//
// uint32_t instead of size_t (size of current histories is << UINT32_MAX)
std::vector<uint32_t> key_map; // key_map[hash] = index of ngram in context window
uint32_t key_map_last_idx = 0; // index of the last ngram added to key_map
};
// Initialize the n-gram map with the given token history.
// map: the ngram map to initialize.
// tokens: the token history to base the map on.
void common_ngram_map_begin(
common_ngram_map & map,
const llama_tokens & tokens);
// Searches for the n-gram in the history and checks whether a draft sequence should be generated.
// map: the ngram map to search in.
+17 -8
View File
@@ -124,9 +124,9 @@ struct common_speculative_state {
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
// TODO: rename to t_draft_us
// TODO: add t_begin_us, t_accept_us
int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
int64_t t_begin_us = 0; // total time spent in refresh of this implementation in microseconds.
int64_t t_draft_us = 0; // total time spent in generating drafts in this implementation in microseconds.
int64_t t_accept_us = 0; // total time spent in accumulation of this implementation in microseconds.
common_speculative_state(enum common_speculative_type type) : type(type) {}
@@ -499,7 +499,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
: common_speculative_state(type), map(std::move(map)) {}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
common_ngram_map_begin(map, prompt);
}
void draft(
@@ -951,7 +951,12 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
}
for (auto & impl : spec->impls) {
const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
impl->begin(prompt);
const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
impl->t_begin_us += t_now_us - t_start_us; // accumulate duration for this refresh
}
}
@@ -973,7 +978,7 @@ llama_tokens common_speculative_draft(
const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
impl->drafts_call_count++;
impl->gen_duration_us += t_now_us - t_start_us; // accumulate duration for this implementation
impl->t_draft_us += t_now_us - t_start_us; // accumulate duration for this implementation
}
if (!result.empty()) {
@@ -1001,12 +1006,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
GGML_ASSERT(impl);
const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
if (n_accepted > 0) {
impl->drafts_accepted_count++;
impl->drafts_accepted_tokens += n_accepted;
}
impl->accept(n_accepted);
const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
impl->t_accept_us += t_now_us - t_start_us; // accumulate duration for this acculumulation
}
void common_speculative_print_stats(const common_speculative * spec) {
@@ -1018,13 +1026,14 @@ void common_speculative_print_stats(const common_speculative * spec) {
std::string str_perf;
if (impl->gen_perf) {
std::ostringstream oss;
oss << std::fixed << std::setprecision(3) << impl->gen_duration_us / 1000.0;
str_perf = ", dur = " + oss.str() + " ms";
oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", ";
oss << std::fixed << std::setprecision(3) << impl->t_draft_us / 1000.0 << ", ";
oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0;
str_perf = ", dur(b,g,a) = " + oss.str() + " ms";
} else {
str_perf = "";
}
// TODO: report time for begin() and accept()
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->drafts_call_count,
+1 -1
View File
@@ -9,7 +9,7 @@ Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch m
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
+2 -2
View File
@@ -8,11 +8,11 @@ Download [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
+1 -1
View File
@@ -8,7 +8,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
+1 -1
View File
@@ -8,7 +8,7 @@ Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch m
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
+2 -2
View File
@@ -8,11 +8,11 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model
### Build llama.cpp
Readme modification time: 20250731
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
+2 -2
View File
@@ -8,11 +8,11 @@ Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch m
### Build llama.cpp
Readme modification time: 20250826
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
+69 -5
View File
@@ -6,7 +6,7 @@ llama.cpp supports speculative decoding, a technique that can significantly acce
## Implementations
The `llama-server` application supports several implementations of speculative decoding:
The `llama-server` application supports several implementations of speculative decoding. An implementation with draft model can be mixed with an implementation without draft model.
### Draft Model (`draft`)
@@ -32,12 +32,21 @@ An example to use this approach can be the rewriting of source code by a LLM.
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
```
llama-server [...] --spec-type ngram-simple --draft-max 64
```
#### n-gram Map Key (`ngram-map-k`)
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`) before generating drafts.
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
The number of accepted tokens is stored for each used n-gram.
**Example:**
```
llama-server [...] --spec-type ngram-map-k --draft-max 64
```
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
This experimental implementation looks for the current n-gram of size n (called the _key_) in the token history. For each key, up to four _values_ (n-grams of size m, called _mgrams_) are tracked. An internal statistic counts the occurrences of each mgram after the key n-gram. If one mgram is significantly more frequent than the others, it is used as the draft.
@@ -45,17 +54,65 @@ This experimental implementation looks for the current n-gram of size n (called
The number of accepted tokens is stored for each used n-gram.
**Example:** Server options to be used if there are a lot of longer repetitions.
```bash
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2
```
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
```
### n-gram Mod (`ngram-mod`)
Add basic ngram hasher for speculative decoding:
- For each ngram, compute a hash using LCG
- For each computed hash, store the next token
- During speculation, iteratively compute the rolling hash of the last n tokens and pick the next token from the storage
Some characteristics:
- Lightweight (~16 MB)
- Constant memory and complexity
- Can generate variable draft lengths (i.e. m is not fixed)
Currently, a single hash pool is shared across all server slots, so different requests can benefit from each other.
**Sample usage:**
```
# notes:
# - small `n` are not recommended
# - MoEs require long drafts
# - dense models: can reduce `--draft-min` and `--draft-max`
llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
```
Applications:
- Iterating over a block of text/code (e.g. in llama.vim)
- Reasoning models (when they have to repeat their thinking in the final answer)
- Summarization
Example Video:
- See #19164
### Differences between ngram-simple, ngram-map and ngram-mod
- ngram-simple looks for a previous matching n-gram and inserts the following m-gram.
- ngram-map-k looks for a previous matching n-gram and inserts the following m-gram but uses an internal hash-map of n-grams in the current context window.
- ngram-mod uses a hash pool which is shared across all server slots. The hash pool is a map from n-gram hash to the next token (not the next m-gram as in ngram-map).
## Command-Line Options
If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.
```
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX)
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
(default: 0)
(env: LLAMA_ARG_DRAFT_MIN)
[...]
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
type of speculative decoding to use when no draft model is provided
(default: none)
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
@@ -78,6 +135,7 @@ Specifies a type of speculative decoding without draft model.
| `ngram-simple` | Use simple n-gram pattern matching |
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
**Example:** Server-instance used to refactor source code.
```bash
@@ -112,9 +170,15 @@ statistics ngram_simple: #calls = 15, #gen drafts = 5, #acc drafts = 5, #gen tok
statistics draft: #calls = 10, #gen drafts = 10, #acc drafts = 10, #gen tokens = 110, #acc tokens = 98
```
```
draft acceptance rate = 0.70312 ( 90 accepted / 128 generated)
statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
```
- `#calls`: number of calls of this implementations
- `#gen drafts`: number of drafts generated by this implementation
- `#acc drafts`: number of drafts accepted (partially) by the main model
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
- `#acc tokens`: number of tokens accepted by the main model
- `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
+1 -1
View File
@@ -1,7 +1,7 @@
# Migration notice for binary filenames
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggml-org/llama.cpp/pull/7809)
This migration was important, but it is a breaking change that may not always be immediately obvious to users.
@@ -28,7 +28,7 @@ int main(int argc, char** argv) {
fprintf(stdout, "\n");
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, " See https://github.com/ggml-org/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, "\n");
return EXIT_FAILURE;
+1 -1
View File
@@ -402,7 +402,7 @@ class SchemaConverter:
Transforms a regular expression pattern into a GBNF rule.
Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md
Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
+1 -1
View File
@@ -6,7 +6,7 @@
// This documentation is still a work in progress.
// If you wish some specific topics to be covered, feel free to drop a comment:
//
// https://github.com/ggerganov/whisper.cpp/issues/40
// https://github.com/ggml-org/whisper.cpp/issues/40
//
// ## Overview
//
+2
View File
@@ -258,6 +258,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
if (backend->iface.set_tensor_async == NULL) {
ggml_backend_synchronize(backend);
ggml_backend_tensor_set(tensor, data, offset, size);
} else {
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
@@ -271,6 +272,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
if (backend->iface.get_tensor_async == NULL) {
ggml_backend_synchronize(backend);
ggml_backend_tensor_get(tensor, data, offset, size);
} else {
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
+1 -1
View File
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
+1 -1
View File
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
+1 -1
View File
@@ -71,7 +71,7 @@ else()
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
# ref: https://github.com/ggml-org/whisper.cpp/issues/1720
# note: adding -g causes segmentation fault during compile
#set(XC_FLAGS -fno-fast-math -fno-inline -g)
set(XC_FLAGS -fno-fast-math -fno-inline)
+1 -1
View File
@@ -3740,7 +3740,7 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
// Reuse extra of the parent tensor. The offset of this view tensor
// becomes `extra->offset + view_offs` and needs to be calculated when
// it is used. This changes is needed because of the change to
// ggml_alloc.c in https://github.com/ggerganov/llama.cpp/pull/7640.
// ggml_alloc.c in https://github.com/ggml-org/llama.cpp/pull/7640.
// `buffer` passed in here will always be `tensor->buffer`. It is OK
// to allocate extras from the same buffer context for ordinary
// intermediate tensors. But for views into kv cache tensors, doing so
+1 -1
View File
@@ -3390,7 +3390,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// mmvq and mmq need the __dp4a instruction which is available for gen12+
// Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
// Workaround in https://github.com/ggml-org/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
#ifdef SYCL_USE_XMX
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
@@ -330,7 +330,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, in_path, "-o", out_path};
#endif
// disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734
// disable spirv-opt for coopmat shaders for https://github.com/ggml-org/llama.cpp/issues/10734
// disable spirv-opt for bf16 shaders for https://github.com/ggml-org/llama.cpp/issues/15344
// disable spirv-opt for rope shaders for https://github.com/ggml-org/llama.cpp/issues/16860
if (!coopmat && name.find("bf16") == std::string::npos && name.find("rope") == std::string::npos) {
+1 -1
View File
@@ -6562,7 +6562,7 @@ static void ggml_compute_backward(
case GGML_OP_DIAG_MASK_INF: {
if (src0_needs_grads) {
/* ggml_diag_mask_inf_impl() shouldn't be here */
/* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
/* ref: https://github.com/ggml-org/llama.cpp/pull/4203#discussion_r1412377992 */
const int n_past = ((const int32_t *) tensor->op_params)[0];
ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
}
+1 -1
View File
@@ -233,7 +233,7 @@ int32_t llm_chat_apply_template(
llm_chat_template tmpl,
const std::vector<const llama_chat_message *> & chat,
std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
// Taken from the research: https://github.com/ggml-org/llama.cpp/issues/5527
std::stringstream ss;
if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
// chatml template
+1 -1
View File
@@ -195,7 +195,7 @@ struct llama_hparams {
uint32_t n_deepstack_layers = 0;
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
// ref: https://github.com/ggml-org/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
uint32_t dec_n_layer = 0;
+4 -4
View File
@@ -90,7 +90,7 @@ static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not
//
// SPM tokenizer
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
//
struct llm_bigram_spm {
@@ -285,7 +285,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
// original regex from tokenizer.json
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
// adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
// adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
@@ -2390,7 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
// maintain a list of tokens that cause end-of-generation
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
// ref: https://github.com/ggml-org/llama.cpp/issues/9606
special_eog_ids.clear();
if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
@@ -3079,7 +3079,7 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
}
int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
// ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
const llama_token_attr attr = token_get_attr(token);
if (!special && (attr & attr_special)) {
+1 -1
View File
@@ -14,7 +14,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
const uint32_t kv_lora_rank = hparams.n_lora_kv;
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
// See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
// And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+1 -1
View File
@@ -1,4 +1,4 @@
// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
// ref: https://github.com/ggml-org/llama.cpp/issues/4952#issuecomment-1892864763
#include <cstdio>
#include <string>
+1 -1
View File
@@ -290,7 +290,7 @@ static void power_iteration(
ggml_gallocr_free(allocr);
// TODO @ngxson : The output vector is randomly inverted
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
// Solution: https://github.com/ggml-org/llama.cpp/pull/8069#issuecomment-2185328171
}
static void run_pca(
+1 -1
View File
@@ -190,7 +190,7 @@ struct lora_merge_ctx {
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
// check if all lora adapters have the same tensors
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggml-org/llama.cpp/pull/8607#discussion_r1686027777
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
if (adapters.size() > 1) {
for (size_t i = 1; i < adapters.size(); ++i) {
+1 -1
View File
@@ -29,7 +29,7 @@ In addition to the KL divergence the following statistics are calculated with `-
* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
* Pearson correlation coefficient of the "correct" token probabilites between models.
* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggml-org/llama.cpp/discussions/2875 .
* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
## LLaMA 3 8b Scoreboard
+1 -1
View File
@@ -1096,7 +1096,7 @@ return html`
</section>
<footer>
<p><${ModelGenerationInfo} /></p>
<p>Powered By <a href="https://github.com/ggerganov/llama.cpp#readme" target="_blank">llama.cpp</a> and <a href="https://ggml.ai/" target="_blank">ggml.ai</a></p>
<p>Powered By <a href="https://github.com/ggml-org/llama.cpp#readme" target="_blank">llama.cpp</a> and <a href="https://ggml.ai/" target="_blank">ggml.ai</a></p>
</footer>
</div>
`;
+1 -1
View File
@@ -1281,7 +1281,7 @@
<footer>
<p><${ModelGenerationInfo} /></p>
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
<p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
</footer>
</div>
`;
@@ -1,5 +1,5 @@
/* Author: Yazan Agha-Schrader */
/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */
/* Inspiration from llama.cpp logo/banner https://github.com/ggml-org/llama.cpp#readme */
.theme-mangotango {
+1 -1
View File
@@ -1032,7 +1032,7 @@
<footer>
<p><${ModelGenerationInfo} /></p>
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
<p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
</footer>
</div>
`;
+1 -1
View File
@@ -1036,7 +1036,7 @@
<footer>
<p><${ModelGenerationInfo} /></p>
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
<p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
</footer>
</div>
`;