Compare commits

...

2 Commits
b9536 ... b9538

Author SHA1 Message Date
Sigbjørn Skjæret
5343f4502a model : rename local n_layer_all variable (#24209) 2026-06-06 07:07:20 +03:00
Sigbjørn Skjæret
603300b008 context : fix off-by-one comparisons to n_gpu_layers (#24208) 2026-06-06 07:06:47 +03:00
2 changed files with 14 additions and 14 deletions

View File

@@ -341,7 +341,7 @@ llama_context::llama_context(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
model.n_gpu_layers() > model.hparams.n_layer() &&
model.n_gpu_layers() > model.hparams.n_layer_all &&
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);

View File

@@ -1205,7 +1205,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
const auto & use_mlock = params.use_mlock;
const auto & tensor_split = params.tensor_split;
const int n_layer = hparams.n_layer_all;
const int n_layer_all = hparams.n_layer_all;
const int n_gpu_layers = this->n_gpu_layers();
const bool use_mmap_buffer = true;
@@ -1262,10 +1262,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
splits[i] /= split_sum;
}
const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1);
const int i_gpu_start = std::max(n_layer_all + 1 - n_gpu_layers, 0);
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer_all + 1);
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
const bool is_swa = il < n_layer && hparams.is_swa(il);
const bool is_swa = il < n_layer_all && hparams.is_swa(il);
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
return {cpu_dev, &pimpl->cpu_buft_list};
@@ -1281,13 +1281,13 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
// assign the repeating layers to the devices according to the splits
pimpl->dev_layer.resize(n_layer);
for (int il = 0; il < n_layer; ++il) {
pimpl->dev_layer.resize(n_layer_all);
for (int il = 0; il < n_layer_all; ++il) {
pimpl->dev_layer[il] = get_layer_buft_list(il);
}
// assign the output layer
pimpl->dev_output = get_layer_buft_list(n_layer);
pimpl->dev_output = get_layer_buft_list(n_layer_all);
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
@@ -1303,14 +1303,14 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
throw std::runtime_error("model has expert layers but no expert layers are used");
}
layers.resize(n_layer);
layers.resize(n_layer_all);
// call the per-model loading function
load_arch_tensors(ml);
// generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
// this avoids having to add scale loading to every architecture
for (int i = 0; i < n_layer; ++i) {
for (int i = 0; i < n_layer_all; ++i) {
auto & layer = layers[i];
// attention weight scales (per-tensor, shape {1})
@@ -1568,7 +1568,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
}
if (llama_supports_gpu_offload()) {
const int n_gpu = std::min(n_gpu_layers, n_layer);
const int n_gpu = std::min(n_gpu_layers, n_layer_all);
int n_repeating = n_gpu;
if (n_repeating > 0) {
@@ -1577,8 +1577,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
}
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
const int max_backend_supported_layers = n_layer + 1;
const int max_offloadable_layers = n_layer + 1;
const int max_backend_supported_layers = n_layer_all + 1;
const int max_offloadable_layers = n_layer_all + 1;
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
}