Compare commits

..

3 Commits

Author SHA1 Message Date
Jürgen Schmied 4f31eedb0c model : register t_layer_inp for qwen3next (#25141)
* Fix input assignment in layer processing loop

Fix DFLASH for qwen-coder-next

* add line break

Added tensor for attention normalization in Qwen3 model.
2026-06-30 17:57:14 +02:00
Pascal 799fcc04a5 common,server: handle bracketed IPv6 literals in URL authority (#25140)
* common,server: handle bracketed IPv6 literals in URL authority

Parse the [host]:port form (RFC 3986) and bracket IPv6 hosts when
formatting a URL authority: listening log, proxy Host header, proxy
log, client rebuild. The per-request remote_addr stays bare.

* common: restore unsupported scheme throw in url parser

Address @ngxson review: keep the explicit reject in port resolution so
the block stays self-contained. Non-http(s) schemes still throw (also
gated at the top of common_http_parse_url).
2026-06-30 16:16:44 +02:00
Matt Jallo 931eb37f8c CUDA: fix get_rows_back for tables with more than 65535 rows (grid-y clamp + stride) (#25103) 2026-06-30 14:16:24 +02:00
7 changed files with 54 additions and 23 deletions
+28 -6
View File
@@ -11,6 +11,11 @@ struct common_http_url {
std::string path;
};
// bracket an IPv6 literal host for a URL authority (RFC 3986)
static std::string common_http_format_host(const std::string & host) {
return host.find(':') != std::string::npos ? "[" + host + "]" : host;
}
static common_http_url common_http_parse_url(const std::string & url) {
common_http_url parts;
auto scheme_end = url.find("://");
@@ -49,11 +54,28 @@ static common_http_url common_http_parse_url(const std::string & url) {
parts.path = "/";
}
auto colon_pos = parts.host.find(':');
// split the authority into host and optional port, a bracketed IPv6 literal keeps its inner colons (RFC 3986)
std::string port_str;
if (!parts.host.empty() && parts.host.front() == '[') {
auto close = parts.host.find(']');
if (close == std::string::npos) {
throw std::runtime_error("invalid IPv6 URL authority: " + parts.host);
}
auto after = parts.host.substr(close + 1);
if (!after.empty() && after.front() == ':') {
port_str = after.substr(1);
}
parts.host = parts.host.substr(1, close - 1);
} else {
auto colon_pos = parts.host.find(':');
if (colon_pos != std::string::npos) {
port_str = parts.host.substr(colon_pos + 1);
parts.host = parts.host.substr(0, colon_pos);
}
}
if (colon_pos != std::string::npos) {
parts.port = std::stoi(parts.host.substr(colon_pos + 1));
parts.host = parts.host.substr(0, colon_pos);
if (!port_str.empty()) {
parts.port = std::stoi(port_str);
} else if (parts.scheme == "http") {
parts.port = 80;
} else if (parts.scheme == "https") {
@@ -83,7 +105,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
}
#endif
httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
httplib::Client cli(parts.scheme + "://" + common_http_format_host(parts.host) + ":" + std::to_string(parts.port));
if (!parts.user.empty()) {
cli.set_basic_auth(parts.user, parts.password);
@@ -95,5 +117,5 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
}
static std::string common_http_show_masked_url(const common_http_url & parts) {
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + common_http_format_host(parts.host) + parts.path;
}
+17 -14
View File
@@ -78,26 +78,29 @@ static __global__ void k_get_rows_float(
template<typename grad_t, typename dst_t>
static __global__ void k_get_rows_back_float(
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst,
const int64_t ncols, const int64_t nrows_grad, const int64_t nrows_dst) {
const int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col >= ncols) {
return;
}
const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
float sum = 0.0f;
ggml_cuda_pdl_sync();
for (int64_t i = 0; i < nrows_grad; ++i) {
if (rows[i] != dst_row) {
continue;
}
sum += grad[i*ncols + col];
}
dst[dst_row*ncols + col] = sum;
// grid.y is clamped to the CUDA grid limit, so stride over the destination rows
for (int64_t dst_row = blockIdx.y; dst_row < nrows_dst; dst_row += gridDim.y) {
float sum = 0.0f;
for (int64_t i = 0; i < nrows_grad; ++i) {
if (rows[i] != dst_row) {
continue;
}
sum += grad[i*ncols + col];
}
dst[dst_row*ncols + col] = sum;
}
}
template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
@@ -302,7 +305,7 @@ void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * d
const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne1, 1);
const dim3 block_nums(block_num_x, MIN(ne1, (int64_t)UINT16_MAX), 1);
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10, ne1);
}
+2
View File
@@ -121,6 +121,8 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
ggml_tensor * inpSA = inpL;
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
+1
View File
@@ -7759,6 +7759,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false));
test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 70000, 4, 1, false)); // row count > CUDA grid-y limit (65535)
for (ggml_type type : all_types) {
for (bool v : {false, true}) {
test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v));
+1 -1
View File
@@ -39,7 +39,7 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
throw std::runtime_error("unsupported URL scheme in target URL: " + parsed_url.scheme);
}
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), common_http_format_host(parsed_url.host).c_str(), parsed_url.port, parsed_url.path.c_str());
std::map<std::string, std::string> headers;
const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
+2 -1
View File
@@ -1,4 +1,5 @@
#include "common.h"
#include "http.h"
#include "server-http.h"
#include "server-stream.h"
#include "server-common.h"
@@ -441,7 +442,7 @@ bool server_http_context::start() {
srv->wait_until_ready();
listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
: string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
: string_format("%s://%s:%d", is_ssl ? "https" : "http", common_http_format_host(hostname).c_str(), port);
return true;
}
+3 -1
View File
@@ -1,4 +1,5 @@
#include "server-common.h"
#include "http.h"
#include "server-models.h"
#include "server-context.h"
#include "server-stream.h"
@@ -2263,7 +2264,8 @@ server_http_proxy::server_http_proxy(
}
if (lowered == "host") {
bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
const std::string url_host = common_http_format_host(host);
req.set_header(key, is_default_port ? url_host : url_host + ":" + std::to_string(port));
} else {
req.set_header(key, value);
}