model : register t_layer_inp for qwen3next (#25141 )

* Fix input assignment in layer processing loop Fix DFLASH for qwen-coder-next * add line break Added tensor for attention normalization in Qwen3 model.
common,server: handle bracketed IPv6 literals in URL authority (#25140 )
2026-06-30 19:23:05 +02:00 · 2026-06-30 17:57:14 +02:00 · 2026-06-30 16:16:44 +02:00 · 2026-06-30 14:16:24 +02:00
7 changed files with 54 additions and 23 deletions
@@ -11,6 +11,11 @@ struct common_http_url {
    std::string path;
 };

+// bracket an IPv6 literal host for a URL authority (RFC 3986)
+static std::string common_http_format_host(const std::string & host) {
+    return host.find(':') != std::string::npos ? "[" + host + "]" : host;
+}
+
 static common_http_url common_http_parse_url(const std::string & url) {
    common_http_url parts;
    auto scheme_end = url.find("://");
@@ -49,11 +54,28 @@ static common_http_url common_http_parse_url(const std::string & url) {
        parts.path = "/";
    }

-    auto colon_pos = parts.host.find(':');
+    // split the authority into host and optional port, a bracketed IPv6 literal keeps its inner colons (RFC 3986)
+    std::string port_str;
+    if (!parts.host.empty() && parts.host.front() == '[') {
+        auto close = parts.host.find(']');
+        if (close == std::string::npos) {
+            throw std::runtime_error("invalid IPv6 URL authority: " + parts.host);
+        }
+        auto after = parts.host.substr(close + 1);
+        if (!after.empty() && after.front() == ':') {
+            port_str = after.substr(1);
+        }
+        parts.host = parts.host.substr(1, close - 1);
+    } else {
+        auto colon_pos = parts.host.find(':');
+        if (colon_pos != std::string::npos) {
+            port_str = parts.host.substr(colon_pos + 1);
+            parts.host = parts.host.substr(0, colon_pos);
+        }
+    }

-    if (colon_pos != std::string::npos) {
-        parts.port = std::stoi(parts.host.substr(colon_pos + 1));
-        parts.host = parts.host.substr(0, colon_pos);
+    if (!port_str.empty()) {
+        parts.port = std::stoi(port_str);
    } else if (parts.scheme == "http") {
        parts.port = 80;
    } else if (parts.scheme == "https") {
@@ -83,7 +105,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
    }
 #endif

-    httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
+    httplib::Client cli(parts.scheme + "://" + common_http_format_host(parts.host) + ":" + std::to_string(parts.port));

    if (!parts.user.empty()) {
        cli.set_basic_auth(parts.user, parts.password);
@@ -95,5 +117,5 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
 }

 static std::string common_http_show_masked_url(const common_http_url & parts) {
-    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
+    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + common_http_format_host(parts.host) + parts.path;
 }
@@ -78,26 +78,29 @@ static __global__ void k_get_rows_float(

 template<typename grad_t, typename dst_t>
 static __global__ void k_get_rows_back_float(
-        const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
+        const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst,
+        const int64_t ncols, const int64_t nrows_grad, const int64_t nrows_dst) {
    const int col = blockIdx.x*blockDim.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

-    const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
-
-    float sum = 0.0f;
-
    ggml_cuda_pdl_sync();
-    for (int64_t i = 0; i < nrows_grad; ++i) {
-        if (rows[i] != dst_row) {
-            continue;
-        }
-        sum += grad[i*ncols + col];
-    }

-    dst[dst_row*ncols + col] = sum;
+    // grid.y is clamped to the CUDA grid limit, so stride over the destination rows
+    for (int64_t dst_row = blockIdx.y; dst_row < nrows_dst; dst_row += gridDim.y) {
+        float sum = 0.0f;
+
+        for (int64_t i = 0; i < nrows_grad; ++i) {
+            if (rows[i] != dst_row) {
+                continue;
+            }
+            sum += grad[i*ncols + col];
+        }
+
+        dst[dst_row*ncols + col] = sum;
+    }
 }

 template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
@@ -302,7 +305,7 @@ void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * d

    const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne1, 1);
+    const dim3 block_nums(block_num_x, MIN(ne1, (int64_t)UINT16_MAX), 1);

-    k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
+    k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10, ne1);
 }
@@ -121,6 +121,8 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
@@ -7759,6 +7759,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    }

    test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false));
+    test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 70000, 4, 1, false)); // row count > CUDA grid-y limit (65535)
    for (ggml_type type : all_types) {
        for (bool v : {false, true}) {
            test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v));
@@ -39,7 +39,7 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
        throw std::runtime_error("unsupported URL scheme in target URL: " + parsed_url.scheme);
    }

-    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
+    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), common_http_format_host(parsed_url.host).c_str(), parsed_url.port, parsed_url.path.c_str());

    std::map<std::string, std::string> headers;
    const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "http.h"
 #include "server-http.h"
 #include "server-stream.h"
 #include "server-common.h"
@@ -441,7 +442,7 @@ bool server_http_context::start() {
    srv->wait_until_ready();

    listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
-                                : string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
+                                : string_format("%s://%s:%d", is_ssl ? "https" : "http", common_http_format_host(hostname).c_str(), port);
    return true;
 }

@@ -1,4 +1,5 @@
 #include "server-common.h"
+#include "http.h"
 #include "server-models.h"
 #include "server-context.h"
 #include "server-stream.h"
@@ -2263,7 +2264,8 @@ server_http_proxy::server_http_proxy(
            }
            if (lowered == "host") {
                bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
-                req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
+                const std::string url_host = common_http_format_host(host);
+                req.set_header(key, is_default_port ? url_host : url_host + ":" + std::to_string(port));
            } else {
                req.set_header(key, value);
            }