mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 07:16:44 +02:00
kv-cache: follow the source cache size when sharing cells (#24267)
A fitted target context can end up smaller than the draft default; the oversized assistant views then overflow the shared K/V tensors and trip the ggml_view_4d size assert during graph reserve.
This commit is contained in:
@@ -97,6 +97,17 @@ llama_kv_cache::llama_kv_cache(
|
||||
model(model), hparams(hparams), v_trans(v_trans),
|
||||
n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
|
||||
|
||||
// shared cells view the source cache's K/V tensors, so the cell count
|
||||
// follows the source allocation: a fitted target can be smaller than the
|
||||
// draft default and oversized views would overflow the source tensors
|
||||
if (mem_other) {
|
||||
const uint32_t size_other = static_cast<llama_kv_cache *>(mem_other)->get_size();
|
||||
if (kv_size != size_other) {
|
||||
LLAMA_LOG_WARN("%s: kv_size = %u overridden to %u to match the shared source cache\n", __func__, kv_size, size_other);
|
||||
kv_size = size_other;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(kv_size % n_pad == 0);
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer_all;
|
||||
|
||||
Reference in New Issue
Block a user