context : fix off-by-one comparisons to n_gpu_layers (#24208)

This commit is contained in:
Sigbjørn Skjæret
2026-06-06 06:06:47 +02:00
committed by GitHub
parent 308f61c31f
commit 603300b008
+2 -2
View File
@@ -341,7 +341,7 @@ llama_context::llama_context(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
model.n_gpu_layers() > model.hparams.n_layer() &&
model.n_gpu_layers() > model.hparams.n_layer_all &&
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);