Compare commits

..

2 Commits

Author SHA1 Message Date
Rithik Sharma
434b2a1ff6 ggml-webgpu: add Q1_0 support (#22374)
* add fast matmul matvec q1_0 kernel

* ggml-webgpu: drop redundant zero-fills in Q1_0 shmem init
2026-04-27 15:50:59 -07:00
tha80
983ca8992e server: (router) Forward form-data to model server (Fixes #22044) (#22118)
* This commit enables the router to forward form-data to the model server.
Fixes #22044 (enabling use of the /v1/audio/transcriptions endpoint in router mode)

* Applied the suggestion from Copilot's first comment: using the non-throwing json::parse overload.
* Addressed Copilot's third comment by extending the files representation to also include filename and content-type
* Addressed Copilot's fourth comment by making the RNG thread_local

* Changed variable body from std::string to std::ostringstream in build_multipart_body
as suggested by ngxson in https://github.com/ggml-org/llama.cpp/pull/22118#discussion_r3127099053

* Added sanitize_field lambda in build_multipart_body for key, filename and content_type
as suggested by ngxson in https://github.com/ggml-org/llama.cpp/pull/22118#discussion_r3127104647

* explicitly checking if value/item is string before calling value/item.get<std::string>()
as requested by ngxson in https://github.com/ggml-org/llama.cpp/pull/22118#discussion_r3127111279

* Added double quote to the sanitize lambda and throw on json parse failure

---------

Co-authored-by: Ralph Paßgang <ralph@trust-it.de>
2026-04-27 23:55:00 +02:00
12 changed files with 229 additions and 12 deletions

View File

@@ -1287,6 +1287,7 @@ class ggml_webgpu_shader_lib {
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
switch (key.src_type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q8_0:
@@ -1323,7 +1324,9 @@ class ggml_webgpu_shader_lib {
defines.push_back("DST_TYPE=f32");
if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
if (key.src_type == GGML_TYPE_Q1_0) {
defines.push_back("BLOCK_SIZE=128u");
} else if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
key.src_type == GGML_TYPE_IQ4_NL) {
defines.push_back("BLOCK_SIZE=32u");
} else if (key.src_type >= GGML_TYPE_Q2_K) {
@@ -1657,7 +1660,9 @@ class ggml_webgpu_shader_lib {
uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
uint32_t outputs_per_wg = WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG;
if (key.src0_type >= GGML_TYPE_Q2_K) {
if (key.src0_type == GGML_TYPE_Q1_0) {
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
} else if (key.src0_type >= GGML_TYPE_Q2_K) {
outputs_per_wg = WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG;
} else if (key.src0_type >= GGML_TYPE_Q4_0) {
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;

View File

@@ -1389,6 +1389,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q1_0:
use_fast = true;
break;
case GGML_TYPE_IQ1_S:
@@ -3736,6 +3737,7 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm
static bool ggml_webgpu_supported_qtype(ggml_type type) {
switch (type) {
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -3830,6 +3832,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
switch (src0->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
@@ -3868,6 +3871,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
switch (src0->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q1_0:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:

View File

@@ -27,6 +27,24 @@ fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
}
#endif
#ifdef Q1_0
// Dequantize one Q1_0 block into f32 at dst.
// Block layout (18 bytes): f16 scale d, then 16 bytes of qs packing 128
// sign bits — a set bit yields +d, a clear bit yields -d.
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
    let block_byte_base = (src_base + offset) * 18; // 18-byte block stride
    let d = load_f16_as_f32_at_src(block_byte_base); // per-block scale
    // qs start at byte 2; read as 4 u32 words (4 bytes = 32 weights each)
    // NOTE(review): these loads are at byte offsets 2/6/10/14 — presumably
    // load_u32_at_src handles unaligned reads; confirm against its definition
    for (var j: u32 = 0u; j < 4u; j++) {
        let q_packed = load_u32_at_src(block_byte_base + 2u + j * 4u);
        // each block expands to 128 f32 outputs; each word covers 32 of them
        let dst_base128 = dst_base + offset * 128u + j * 32u;
        for (var k: u32 = 0; k < 4u; k++) {
            let q_byte = get_byte(q_packed, k);
            for (var bit: u32 = 0; bit < 8u; bit++) {
                // each bit selects the sign applied to the shared scale
                let w = select(-d, d, ((q_byte >> bit) & 1u) != 0u);
                dst[dst_base128 + k * 8u + bit] = w;
            }
        }
    }
}
#endif
#ifdef Q4_0
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
let block_byte_base = (src_base + offset) * 18; // Block stride: 18 bytes

View File

@@ -61,6 +61,39 @@ fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u3
#endif // INIT_SRC1_SHMEM_FLOAT
#endif
#ifdef INIT_SRC0_SHMEM_Q1_0
// Q1_0 block: 128 weights per block, 18 bytes = f16 scale + 16 qs bytes
const BLOCK_SIZE = 128u;
const BLOCK_SIZE_BYTES = 18u;
const NQ = 8u; // 8 weights (1 byte of qs) per thread per iteration
// Dequantize a tile of Q1_0 src0 into workgroup shared memory.
// Each iteration of the outer loop handles one qs byte, i.e. 8 consecutive
// k positions of one row.
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
    for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
        let tile_m = i / TILE_K;
        let tile_k_start = i % TILE_K;
        let global_m = offset_m + tile_m;
        let global_k_start = k_outer + tile_k_start;
        if (global_m >= params.m) {
            // rows only grow with i, so once past m there is nothing left to load
            break;
        }
        let block_k = global_k_start / BLOCK_SIZE;
        let byte_in_block = (global_k_start % BLOCK_SIZE) / 8u;
        let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
        let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
        let d = load_f16_at_src0(block_byte_base);
        // qs bytes start at offset 2 (after the f16 scale); keep one byte
        let q_byte = load_u32_at_src0(block_byte_base + 2u + byte_in_block) & 0xFFu;
        for (var bit = 0u; bit < NQ; bit++) {
            let global_k = global_k_start + bit;
            if (global_k < params.k) {
                // bit set -> +d, bit clear -> -d
                shmem[i + bit] = select(-d, d, ((q_byte >> bit) & 1u) != 0u);
            }
            // NOTE(review): k >= params.k leaves shmem[i + bit] unwritten;
            // the zero-fill was dropped intentionally — presumably the
            // consumer never reads past params.k. Confirm on change.
        }
    }
}
#endif // INIT_SRC0_SHMEM_Q1_0
#ifdef INIT_SRC0_SHMEM_Q4_0
const BLOCK_SIZE = 32u;
const BLOCK_SIZE_BYTES = 18u;

View File

@@ -128,6 +128,38 @@ fn main(
}
#endif
#ifdef MUL_ACC_Q1_0
// Q1_0 mat-vec accumulation: each 18-byte block packs an f16 scale d plus
// 128 sign bits; a set bit contributes +d * x, a clear bit -d * x.
#define BLOCK_SIZE 128
#define BLOCK_SIZE_BYTES 18
// 16 threads cooperate on one block: one qs byte (8 weights) per thread
#define THREADS_PER_BLOCK 16
#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK)
// NOTE(review): truncating division — assumes params.k is a multiple of
// 128 for Q1_0 tensors; confirm against the backend's supports_op checks
let num_blocks = params.k / BLOCK_SIZE;
let thread_within_block = thread_id % THREADS_PER_BLOCK;
for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) {
    // cache this thread's 8 src1 values once; reused for every output row
    let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * ELEMS_PER_THREAD;
    var x_block: array<f32, ELEMS_PER_THREAD>;
    for (var i = 0u; i < ELEMS_PER_THREAD; i++) {
        x_block[i] = f32(src1[x_base + i]);
    }
    for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
        let output_row = row_base + row;
        if (output_row < params.m) {
            let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
            let d = f32(load_f16_at_src0(block_byte_base));
            // this thread's qs byte; offset 2 skips the f16 scale
            let q_byte = load_u32_at_src0(block_byte_base + 2u + thread_within_block) & 0xFFu;
            var row_sum = 0.0;
            for (var bit = 0u; bit < 8u; bit++) {
                let w = select(-d, d, ((q_byte >> bit) & 1u) != 0u);
                row_sum += w * x_block[bit];
            }
            acc[row] += row_sum;
        }
    }
}
#endif
#ifdef MUL_ACC_Q4_0
#define BLOCK_SIZE 32
#define BLOCK_SIZE_BYTES 18

View File

@@ -575,14 +575,14 @@ json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json convert_transcriptions_to_chatcmpl(
const json & inp_body,
const common_chat_templates * tmpls,
const std::map<std::string, raw_buffer> & in_files,
const std::map<std::string, uploaded_file> & in_files,
std::vector<raw_buffer> & out_files) {
// TODO @ngxson : this function may need to be improved in the future
// handle input files
out_files.clear();
auto it = in_files.find("file");
if (it != in_files.end()) {
out_files.push_back(it->second);
out_files.push_back(it->second.data);
} else {
throw std::invalid_argument("No input file found for transcription");
}

View File

@@ -4,6 +4,7 @@
#include "chat.h"
#include "server-common.h"
#include "server-http.h"
#include <nlohmann/json_fwd.hpp>
@@ -19,7 +20,7 @@ json server_chat_convert_anthropic_to_oai(const json & body);
json convert_transcriptions_to_chatcmpl(
const json & body,
const common_chat_templates * tmpls,
const std::map<std::string, raw_buffer> & in_files,
const std::map<std::string, uploaded_file> & in_files,
std::vector<raw_buffer> & out_files);
json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);

View File

@@ -49,6 +49,7 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
parsed_url.path,
headers,
req.body,
req.files,
req.should_stop,
600, // timeout_read (default to 10 minutes)
600 // timeout_write (default to 10 minutes)

View File

@@ -438,7 +438,7 @@ void server_http_context::get(const std::string & path, const server_http_contex
void server_http_context::post(const std::string & path, const server_http_context::handler_t & handler) const {
pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
std::string body = req.body;
std::map<std::string, raw_buffer> files;
std::map<std::string, uploaded_file> files;
if (req.is_multipart_form_data()) {
// translate text fields to a JSON object and use it as the body
@@ -459,7 +459,11 @@ void server_http_context::post(const std::string & path, const server_http_conte
// populate files from multipart form
for (const auto & [key, file] : req.form.files) {
files[key] = raw_buffer(file.content.begin(), file.content.end());
files[key] = uploaded_file{
raw_buffer(file.content.begin(), file.content.end()),
file.filename,
file.content_type,
};
}
}

View File

@@ -36,13 +36,19 @@ struct server_http_res {
using server_http_res_ptr = std::unique_ptr<server_http_res>;
using raw_buffer = std::vector<uint8_t>;
// A single file received via a multipart/form-data upload.
struct uploaded_file {
    raw_buffer data;          // raw file bytes
    std::string filename;     // client-supplied filename (may be empty)
    std::string content_type; // client-supplied MIME type (may be empty)
};
struct server_http_req {
std::map<std::string, std::string> params; // path_params + query_params
std::map<std::string, std::string> headers; // used by MCP proxy
std::string path;
std::string query_string; // query parameters string (e.g. "action=save")
std::string body;
std::map<std::string, raw_buffer> files; // used for file uploads (form data)
std::map<std::string, uploaded_file> files; // used for file uploads (form data)
const std::function<bool()> & should_stop;
std::string get_param(const std::string & key, const std::string & def = "") const {

View File

@@ -18,6 +18,8 @@
#include <chrono>
#include <queue>
#include <filesystem>
#include <random>
#include <sstream>
#include <cstring>
#ifdef _WIN32
@@ -823,6 +825,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
proxy_path,
req.headers,
req.body,
req.files,
req.should_stop,
base_params.timeout_read,
base_params.timeout_write
@@ -1126,6 +1129,77 @@ static bool should_strip_proxy_header(const std::string & header_name) {
return false;
}
// Produce a random boundary string for a multipart/form-data body.
// Format: "----llama-cpp-proxy-" followed by 16 random [0-9a-z] characters.
static std::string generate_multipart_boundary() {
    static const char alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyz";
    // one RNG per thread so concurrent proxied requests do not contend
    thread_local std::mt19937 rng(std::random_device{}());
    std::uniform_int_distribution<> pick(0, sizeof(alphabet) - 2); // -2 skips the trailing NUL
    std::string out("----llama-cpp-proxy-");
    out.reserve(out.size() + 16);
    for (int i = 0; i < 16; ++i) {
        out.push_back(alphabet[pick(rng)]);
    }
    return out;
}
static std::string build_multipart_body(
const json & form_fields,
const std::map<std::string, uploaded_file> & files,
const std::string & boundary) {
static auto sanitize_field = [](const std::string & text) {
std::string result;
result.reserve(text.size());
for (char c : text) {
if (c != '\n' && c != '\r' && c != '"') {
result += c;
}
}
return result;
};
std::ostringstream body;
for (const auto & [key, value] : form_fields.items()) {
if (value.is_array()) {
for (const auto & item : value) {
body << "--" << boundary << "\r\n";
body << "Content-Disposition: form-data; name=\"" << sanitize_field(key) << "\"\r\n";
body << "\r\n";
if (!item.is_string()) {
throw std::invalid_argument("expected string");
}
body << item.get<std::string>() << "\r\n";
}
} else {
body << "--" << boundary << "\r\n";
body << "Content-Disposition: form-data; name=\"" << sanitize_field(key) << "\"\r\n";
body << "\r\n";
if (!value.is_string()) {
throw std::invalid_argument("expected string");
}
body << value.get<std::string>() << "\r\n";
}
}
for (const auto & [key, file] : files) {
body << "--" << boundary << "\r\n";
body << "Content-Disposition: form-data; name=\"" << sanitize_field(key) << "\"";
if (!file.filename.empty()) {
body << "; filename=\"" << sanitize_field(file.filename) << "\"";
}
body << "\r\n";
if (!file.content_type.empty()) {
body << "Content-Type: " << sanitize_field(file.content_type) << "\r\n";
} else {
body << "Content-Type: application/octet-stream\r\n";
}
body << "\r\n";
body.write(reinterpret_cast<const char*>(file.data.data()), file.data.size());
body << "\r\n";
}
body << "--" << boundary << "--\r\n";
return body.str();
}
server_http_proxy::server_http_proxy(
const std::string & method,
const std::string & scheme,
@@ -1134,6 +1208,7 @@ server_http_proxy::server_http_proxy(
const std::string & path,
const std::map<std::string, std::string> & headers,
const std::string & body,
const std::map<std::string, uploaded_file> & files,
const std::function<bool()> should_stop,
int32_t timeout_read,
int32_t timeout_write
@@ -1195,28 +1270,65 @@ server_http_proxy::server_http_proxy(
return pipe->write({{}, 0, std::string(data, data_length), ""});
};
// when files are present, the body was converted from multipart form data to JSON
// we need to reconstruct the multipart body for the downstream server
std::string effective_body = body;
std::string override_content_type;
bool has_files = !files.empty();
if (has_files) {
json form_fields = json::parse(body, nullptr, false);
if (!form_fields.is_discarded()) {
auto boundary = generate_multipart_boundary();
effective_body = build_multipart_body(form_fields, files, boundary);
override_content_type = "multipart/form-data; boundary=" + boundary;
} else {
throw std::runtime_error("failed to parse multipart form fields JSON");
}
}
// prepare the request to destination server
httplib::Request req;
{
req.method = method;
req.path = path;
for (const auto & [key, value] : headers) {
if (key == "Accept-Encoding") {
const auto lowered = to_lower_copy(key);
if (lowered == "accept-encoding") {
// disable Accept-Encoding to avoid compressed responses
continue;
}
if (key == "Transfer-Encoding") {
if (lowered == "transfer-encoding") {
// the body is already decoded
continue;
}
if (key == "Host" || key == "host") {
if (lowered == "content-length") {
// let httplib calculate Content-Length from the actual body
continue;
}
if (lowered == "content-type") {
if (has_files) {
// we set our own Content-Type with the new boundary
continue;
}
// when no files but the original request was multipart,
// the body is now JSON, so correct the Content-Type
if (value.find("multipart/form-data") != std::string::npos) {
override_content_type = "application/json; charset=utf-8";
continue;
}
}
if (lowered == "host") {
bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
} else {
req.set_header(key, value);
}
}
req.body = body;
req.body = effective_body;
if (!override_content_type.empty()) {
req.set_header("Content-Type", override_content_type);
}
req.response_handler = response_handler;
req.content_receiver = content_receiver;
}

View File

@@ -202,6 +202,7 @@ public:
const std::string & path,
const std::map<std::string, std::string> & headers,
const std::string & body,
const std::map<std::string, uploaded_file> & files,
const std::function<bool()> should_stop,
int32_t timeout_read,
int32_t timeout_write