model : rename local n_layer_all variable (#24209 )

context : fix off-by-one comparisons to n_gpu_layers (#24208 )
2026-06-06 19:13:00 +02:00 · 2026-06-06 07:07:20 +03:00 · 2026-06-06 07:06:47 +03:00
2 changed files with 14 additions and 14 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -341,7 +341,7 @@ llama_context::llama_context(
        // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
        bool pipeline_parallel =
            model.n_devices() > 1 &&
-            model.n_gpu_layers() > model.hparams.n_layer() &&
+            model.n_gpu_layers() > model.hparams.n_layer_all &&
            model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
            cparams.offload_kqv &&
            !model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
        // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
+        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all;
        if (ubatch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                const auto & dev_layer = model.dev_layer(il);
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1205,7 +1205,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
    const auto & use_mlock    = params.use_mlock;
    const auto & tensor_split = params.tensor_split;

-    const int n_layer = hparams.n_layer_all;
+    const int n_layer_all = hparams.n_layer_all;
    const int n_gpu_layers = this->n_gpu_layers();

    const bool use_mmap_buffer = true;
@@ -1262,10 +1262,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
        splits[i] /= split_sum;
    }

-    const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0);
-    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1);
+    const int i_gpu_start = std::max(n_layer_all + 1 - n_gpu_layers, 0);
+    const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer_all + 1);
    auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
-        const bool is_swa = il < n_layer && hparams.is_swa(il);
+        const bool is_swa = il < n_layer_all && hparams.is_swa(il);
        if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
            return {cpu_dev, &pimpl->cpu_buft_list};
@@ -1281,13 +1281,13 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
    pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };

    // assign the repeating layers to the devices according to the splits
-    pimpl->dev_layer.resize(n_layer);
-    for (int il = 0; il < n_layer; ++il) {
+    pimpl->dev_layer.resize(n_layer_all);
+    for (int il = 0; il < n_layer_all; ++il) {
        pimpl->dev_layer[il] = get_layer_buft_list(il);
    }

    // assign the output layer
-    pimpl->dev_output = get_layer_buft_list(n_layer);
+    pimpl->dev_output = get_layer_buft_list(n_layer_all);

    const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;

@@ -1303,14 +1303,14 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
            throw std::runtime_error("model has expert layers but no expert layers are used");
        }

-        layers.resize(n_layer);
+        layers.resize(n_layer_all);

        // call the per-model loading function
        load_arch_tensors(ml);

        // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
        // this avoids having to add scale loading to every architecture
-        for (int i = 0; i < n_layer; ++i) {
+        for (int i = 0; i < n_layer_all; ++i) {
            auto & layer = layers[i];

            // attention weight scales (per-tensor, shape {1})
@@ -1568,7 +1568,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
    }

    if (llama_supports_gpu_offload()) {
-        const int n_gpu = std::min(n_gpu_layers, n_layer);
+        const int n_gpu = std::min(n_gpu_layers, n_layer_all);

        int n_repeating = n_gpu;
        if (n_repeating > 0) {
@@ -1577,8 +1577,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
        }
        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

-        const int max_backend_supported_layers = n_layer + 1;
-        const int max_offloadable_layers       = n_layer + 1;
+        const int max_backend_supported_layers = n_layer_all + 1;
+        const int max_offloadable_layers       = n_layer_all + 1;

        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
    }
Author	SHA1	Message	Date
Sigbjørn Skjæret	5343f4502a	model : rename local n_layer_all variable (#24209 )	2026-06-06 07:07:20 +03:00
Sigbjørn Skjæret	603300b008	context : fix off-by-one comparisons to n_gpu_layers (#24208 )	2026-06-06 07:06:47 +03:00