context : fix off-by-one comparisons to n_gpu_layers (#24208 )

opencl: improve get_rows, cpy, concat and q6_k flat gemv (#24160 )
* opencl: allow multiple workgroups for large rows * opencl: improve small cpy * opencl: packed concat for small input * opencl: tweak flat q6_K gemv, increase N_DST and remap threads
2026-06-08 12:02:58 +02:00 · 2026-06-06 07:06:47 +03:00 · 2026-06-05 13:45:25 -07:00 · 2026-06-05 14:31:56 -05:00
10 changed files with 389 additions and 218 deletions
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -87,6 +87,8 @@ static std::string normalize_quotes_to_json(const std::string & input) {
    bool in_single_quoted = false;
    bool in_double_quoted = false;

+    auto is_word_char = [](char ch) { return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_'; };
+
    for (size_t i = 0; i < input.size(); ++i) {
        char c = input[i];

@@ -151,6 +153,29 @@ static std::string normalize_quotes_to_json(const std::string & input) {
                in_single_quoted = true;
                result += '"';
            }
+        } else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') &&
+                   (i == 0 || !is_word_char(input[i - 1]))) {
+            // Python literals -> JSON; prefix match keeps streamed partials monotonic.
+            static constexpr std::pair<std::string_view, std::string_view> literals[] = {
+                { "True", "true" }, { "False", "false" }, { "None", "null" },
+            };
+            size_t n = 0;
+            while (i + n < input.size() && is_word_char(input[i + n])) {
+                ++n;
+            }
+            std::string_view token(input.data() + i, n);
+            bool matched = false;
+            for (const auto & [py, js] : literals) {
+                if (py.substr(0, n) == token) {
+                    result += js.substr(0, n);
+                    i += n - 1;
+                    matched = true;
+                    break;
+                }
+            }
+            if (!matched) {
+                result += c;
+            }
        } else {
            result += c;
        }
@@ -353,12 +378,8 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
            }
            value_to_add += escape_json_string_inner(value_content);
        } else if (!value_content.empty()) {
-            // For potential containers, normalize Python-style single quotes to JSON double quotes
-            bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
-            if (is_potential_container) {
-                value_content = normalize_container_value(value_content);
-            }
-            value_to_add += value_content;
+            // Pythonic scalars/containers -> JSON.
+            value_to_add += normalize_container_value(value_content);
        }

        args_target() += value_to_add;
@@ -466,11 +487,34 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
    return force_tool_calls ? section : optional(section);
 }

+// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5
+common_peg_parser common_chat_peg_builder::python_or_json_value() {
+    return rule("python-or-json-value", [this]() {
+        auto ws    = space();
+        auto value = python_or_json_value();
+
+        auto member  = sequence({ python_string(), ws, literal(":"), ws, value });
+        auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
+        auto dict    = rule("python-or-json-dict", [&]() {
+            return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws });
+        });
+
+        auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) });
+        auto array    = rule("python-or-json-array", [&]() {
+            return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws });
+        });
+
+        return choice({ dict, array, python_string(), python_number(),
+                        python_bool(), python_null(), json_bool(), json_null() });
+    });
+}
+
 // Python-style tool calls: name(arg1="value1", arg2=123)
 // Used only by LFM2 for now, so we don't merge it into autoparser
 common_peg_parser common_chat_peg_builder::python_style_tool_calls(
    const ordered_json & tools,
-    bool                 parallel_tool_calls) {
+    bool                 parallel_tool_calls,
+    bool                 allow_json_literals) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
@@ -504,7 +548,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
                if (is_string_type) {
                    arg_value_parser = string_value_parser;
                } else {
-                    arg_value_parser = tool_arg_value(python_value());
+                    arg_value_parser = tool_arg_value(allow_json_literals ? python_or_json_value() : python_value());
                }

                // Full argument: name="value" or name=value
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -132,9 +132,13 @@ class common_chat_peg_builder : public common_peg_parser_builder {
    // Helper for Python-style function call format: name(arg1="value1", arg2=123)
    // Used by LFM2 and similar templates
    common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
-                                              bool                           parallel_tool_calls);
+                                              bool                           parallel_tool_calls,
+                                              bool                           allow_json_literals);

  private:
+    // Python values plus JSON true/false/null.
+    common_peg_parser python_or_json_value();
+
    // Implementation helpers for standard_json_tools — one per JSON tool call layout mode
    common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
                                                       const std::string &            args_key,
@@ -195,4 +199,3 @@ struct tagged_peg_parser {

 tagged_peg_parser build_tagged_peg_parser(
    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
-
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1608,42 +1608,40 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
    return data;
 }

-// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
-// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
-// - Reasoning: <think>{reasoning}</think> (optional)
-// - Content: text before a tool call (optional)
-// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
-//   Tool calls can appear multiple times (parallel tool calls supported)
-static common_chat_params common_chat_params_init_lfm2(const common_chat_template &    tmpl,
-                                                       const autoparser::generation_params & inputs) {
+// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable
+// (except dotted names and JSON literals true/false/null).
+// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional <think> reasoning.
+// tool_list_tokens preserves LFM2 system tool-list markers.
+static common_chat_params common_chat_params_init_lfm2(const common_chat_template &          tmpl,
+                                                       const autoparser::generation_params & inputs,
+                                                       bool tool_list_tokens) {
    common_chat_params data;

+    const std::string TOOL_CALL_START = "<|tool_call_start|>";
+    const std::string TOOL_CALL_END   = "<|tool_call_end|>";
+    const std::string TOOL_LIST_START = "<|tool_list_start|>";
+    const std::string TOOL_LIST_END   = "<|tool_list_end|>";
+    const std::string THINK_START     = "<think>";
+    const std::string THINK_END       = "</think>";
+    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";
+
    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
-    data.preserved_tokens  = {
-        "<|tool_list_start|>",
-        "<|tool_list_end|>",
-        "<|tool_call_start|>",
-        "<|tool_call_end|>",
-        "<think>",
-        "</think>",
-    };
+    data.preserved_tokens  = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
+    if (tool_list_tokens) {
+        data.preserved_tokens.push_back(TOOL_LIST_START);
+        data.preserved_tokens.push_back(TOOL_LIST_END);
+    }
+
+    data.thinking_start_tag = THINK_START;
+    data.thinking_end_tag   = THINK_END;

    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

-    const std::string TOOL_CALL_START = "<|tool_call_start|>";
-    const std::string TOOL_CALL_END   = "<|tool_call_end|>";
-    const std::string THINK_START     = "<think>";
-    const std::string THINK_END       = "</think>";
-    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";
-
-    data.thinking_start_tag = THINK_START;
-    data.thinking_end_tag   = THINK_END;
-
    if (inputs.has_continuation()) {
        const auto & msg = inputs.continue_msg;

@@ -1670,7 +1668,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
        auto tool_calls = p.rule("tool-calls",
            p.trigger_rule("tool-call",
                p.literal(TOOL_CALL_START) +
-                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
+                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) +
                p.literal(TOOL_CALL_END)
            )
        );
@@ -1697,93 +1695,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
        };
    }
-    return data;
-}
-
-// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens.
-// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>.
-// - Reasoning: <think>{reasoning}</think> (optional)
-// - Content: text before a tool call (optional)
-// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
-//   Tool calls can appear multiple times (parallel tool calls supported)
-static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template &    tmpl,
-                                                         const autoparser::generation_params & inputs) {
-    common_chat_params data;
-
-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking = true;
-    data.preserved_tokens  = {
-        "<|tool_call_start|>",
-        "<|tool_call_end|>",
-        "<think>",
-        "</think>",
-    };
-
-    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
-    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
-
-    const std::string THINK_START     = "<think>";
-    const std::string THINK_END       = "</think>";
-    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";
-
-    data.thinking_start_tag = THINK_START;
-    data.thinking_end_tag   = THINK_END;
-
-    if (inputs.has_continuation()) {
-        const auto & msg = inputs.continue_msg;
-
-        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += THINK_END + msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
-    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.literal(GEN_PROMPT);
-        auto end = p.end();
-
-        auto reasoning = p.eps();
-        if (extract_reasoning && inputs.enable_thinking) {
-            reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
-        }
-
-        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
-            return generation_prompt + reasoning + p.content(p.rest()) + end;
-        }
-
-        auto tool_calls = p.rule("tool-calls",
-            p.trigger_rule("tool-call",
-                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
-            )
-        );
-
-        auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["}));
-        auto maybe_start = p.optional(p.literal("<|tool_call_start|>"));
-        return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
-    });
-
-    data.parser = parser.save();
-
-    if (include_grammar) {
-        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
-        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                auto         schema   = function.at("parameters");
-                builder.resolve_refs(schema);
-            });
-            parser.build_grammar(builder, data.grammar_lazy);
-        });
-        foreach_function(inputs.tools, [&](const json & tool) {
-            const std::string name = tool.at("function").at("name");
-            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" });
-        });
-    }

    return data;
 }
@@ -2298,14 +2209,14 @@ std::optional<common_chat_params> common_chat_try_specialized_template(

    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
-        return common_chat_params_init_lfm2(tmpl, params);
+        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
    }

    // LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
    if (src.find("List of tools: [") != std::string::npos &&
        src.find("<|tool_list_start|>") == std::string::npos) {
        LOG_DBG("Using specialized template: LFM2.5\n");
-        return common_chat_params_init_lfm2_5(tmpl, params);
+        return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false);
    }

    // GigaChatV3 format detection
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
    cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
-    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
+    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
    cl_kernel kernel_mul_mat_f32_f32;
    cl_kernel kernel_mul_mat_f16_f16;
    cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -639,7 +639,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
    cl_kernel kernel_upscale;
    cl_kernel kernel_upscale_bilinear;
-    cl_kernel kernel_concat_f32;
+    cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
    cl_kernel kernel_conv_2d_f16;
    cl_kernel kernel_conv_2d_f32;
    cl_kernel kernel_conv_2d_f16_f32;
@@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
        GGML_LOG_CONT(".");
    }
@@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
        CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
        CL_CHECK(clReleaseProgram(prog));
        GGML_LOG_CONT(".");
    }
@@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
        nth *= 2;
    }

-    size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
+    int nchunks = 1;
+    if (src0->type == GGML_TYPE_F32) {
+        const int chunk_target = nth * 4;
+        nchunks = (ne00 + chunk_target - 1) / chunk_target;
+        nchunks = MAX(1, MIN(nchunks, 64));
+    }
+
+    size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con

    int nth = MIN(64, ne0);

-    cl_kernel kernel = backend_ctx->kernel_concat_f32;
+    const bool concat_pack = (dim == 0 && ne0 < 32);
+    cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
+                                   : backend_ctx->kernel_concat_f32;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int),   &dim));

-    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
-    size_t local_work_size[] = {(size_t)nth, 1, 1};
+    if (concat_pack) {
+        // packed kernel needs the dst dims to unflatten its 1-D row index.
+        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
+        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
+        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
+        const int base  = MIN(64, maxwg);
+        const int tpr   = MIN(ne0, base);                 // threads per row
+        const int rpw   = MAX(1, base / tpr);             // rows per workgroup
+        const int lsz   = tpr * rpw;
+        const int nrows = ne1*ne2*ne3;
+        const int nwg   = (nrows + rpw - 1) / rpw;
+        size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
+        size_t local_work_size[]  = {(size_t)lsz, 1, 1};
+        backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
+    } else {
+        size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
 }

 static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            } else if (backend_ctx->gpu_family == ADRENO) {
                nth0 = 64;
                nth1 = 2;
-                ndst = 4;
+                ndst = 16;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }
@@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
                    kernel = backend_ctx->kernel_cpy_f32_f16;
                    break;
                case GGML_TYPE_F32:
-                    kernel = backend_ctx->kernel_cpy_f32_f32;
+                    kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
+                                       : backend_ctx->kernel_cpy_f32_f32;
                    break;
                default:
                    GGML_ASSERT(false && "not implemented");
@@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));

-    const int nth = MIN(64, ne00);
+    if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
+        const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
+        const int base  = MIN(64, maxwg);
+        const int tpr   = MIN(ne00, base);                 // threads per row
+        const int rpw   = MAX(1, base / tpr);              // rows per workgroup
+        const int lsz   = tpr * rpw;                       // <= base <= maxwg
+        const int nrows = ne01*ne02*ne03;
+        const int nwg   = (nrows + rpw - 1) / rpw;

-    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
-    size_t local_work_size[] = {(size_t)nth, 1, 1};
+        size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
+        size_t local_work_size[]  = {(size_t)lsz, 1, 1};

-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
+        backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
+    } else {
+        const int nth = MIN(64, ne00);
+
+        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
+    }
 }

 static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
--- a/ggml/src/ggml-opencl/kernels/concat.cl
+++ b/ggml/src/ggml-opencl/kernels/concat.cl
@@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
        *y = *x;
    }
 }
+
+kernel void kernel_concat_f32_pack(
+    global  const char * src0,
+    ulong                offset0,
+    global  const char * src1,
+    ulong                offset1,
+    global        char * dst,
+    ulong                offsetd,
+    int             ne00,
+    int             ne01,
+    int             ne02,
+    int             ne03,
+    ulong           nb00,
+    ulong           nb01,
+    ulong           nb02,
+    ulong           nb03,
+    ulong           nb10,
+    ulong           nb11,
+    ulong           nb12,
+    ulong           nb13,
+    int             ne0,
+    ulong           nb0,
+    ulong           nb1,
+    ulong           nb2,
+    ulong           nb3,
+    int             dim,
+    int             ne1,
+    int             ne2,
+    int             ne3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst  + offsetd;
+
+    int lsz = get_local_size(0);
+    int tpr = min(ne0, lsz);          // threads per row
+    int rpw = lsz / tpr;              // rows per workgroup
+    int lid = get_local_id(0);
+    int row = get_group_id(0)*rpw + lid / tpr;
+    int lane = lid - (lid / tpr) * tpr;
+
+    int nrows = ne1*ne2*ne3;
+    if (row >= nrows) {
+        return;
+    }
+
+    int i1 = row % ne1;
+    int t  = row / ne1;
+    int i2 = t % ne2;
+    int i3 = t / ne2;
+
+    int o[4] = {0, 0, 0, 0};
+    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
+
+    for (int i0 = lane; i0 < ne0; i0 += tpr) {
+        global const float * x;
+        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (global const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
+        } else {
+            x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+        }
+
+        global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+        *y = *x;
+    }
+}
--- a/ggml/src/ggml-opencl/kernels/cpy.cl
+++ b/ggml/src/ggml-opencl/kernels/cpy.cl
@@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
    }
 }

+kernel void kernel_cpy_f32_f32_pack(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst = (global float*)((global char*)dst + offsetd);
+
+    int lsz = get_local_size(0);
+    int tpr = min(ne00, lsz);          // threads per row
+    int rpw = lsz / tpr;               // rows per workgroup
+    int lid = get_local_id(0);
+    int row = get_group_id(0)*rpw + lid / tpr;
+    int lane = lid - (lid / tpr) * tpr;
+
+    int nrows = ne01*ne02*ne03;
+    if (row >= nrows) {
+        return;
+    }
+
+    int i01 = row % ne01;
+    int t   = row / ne01;
+    int i02 = t % ne02;
+    int i03 = t / ne02;
+
+    // linear index of the first element of this row, unflattened over dst dims
+    long n  = (long)row * ne00;
+    int i3  = (int)(n / ((long)ne2*ne1*ne0));
+    long rm = n - (long)i3*ne2*ne1*ne0;
+    int i2  = (int)(rm / ((long)ne1*ne0));
+    rm     -= (long)i2*ne1*ne0;
+    int i1  = (int)(rm / ne0);
+    int i0  = (int)(rm - (long)i1*ne0);
+
+    global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int i00 = lane; i00 < ne00; i00 += tpr) {
+        global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0];
+    }
+}
+
 kernel void kernel_cpy_i32_i32(
        global int * src0,
        ulong offset0,
--- a/ggml/src/ggml-opencl/kernels/get_rows.cl
+++ b/ggml/src/ggml-opencl/kernels/get_rows.cl
@@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
    src1 = (global int*)((global char*)src1 + offset1);
    dst = (global float*)((global char*)dst + offsetd);

-    int i10 = get_group_id(0);
-    int i11 = get_group_id(1);
-    int i12 = get_group_id(2);
+    int nchunks = get_num_groups(0) / ne10;
+    int g       = get_group_id(0);
+    int i10     = g / nchunks;
+    int chunk   = g - i10 * nchunks;
+    int i11     = get_group_id(1);
+    int i12     = get_group_id(2);

    int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];

    int i02 = i11;
    int i03 = i12;

-    for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
-        if (ind >= ne00) {
-            return;
-        }
-        ((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
-            ((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
+    global float * dst_row = (global float *) ((global char *) dst  + i12*nb3 + i11*nb2 + i10*nb1);
+    global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);
+
+    int span  = (ne00 + nchunks - 1) / nchunks;
+    int start = chunk * span;
+    int end   = min(start + span, ne00);
+
+    for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
+        dst_row[ind] = src_row[ind];
    }
 }

--- a/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl
@@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat(
    global uchar * blk_qh,
    global char  * blk_scales,
    global half  * blk_d,
-    global float * yy,
    int ib,
    int ip,
    int is,
-    int l0
+    int l0,
+    float4 y0,
+    float4 y1,
+    float4 y2,
+    float4 y3
 ) {
-    int y_offset   = 128*ip + l0;
    int q_offset_l =  64*ip + l0;
    int q_offset_h =  32*ip + l0;

@@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat(
    global uchar * qh = blk_qh     + ib*64 + q_offset_h;
    global char  * sc = blk_scales + ib*16 + is;

-    global float * y = yy + ib * QK_K + y_offset;
-
    float dall = blk_d[ib];

-    float  sumf = 0;
-    float4 sums = {0.f, 0.f, 0.f, 0.f};
+    // Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads.
+    // q_offset_l/h are 4-aligned, so these are aligned vector loads.
+    uchar4 q1v = vload4(0, q1);
+    uchar4 q2v = vload4(0, q2);
+    uchar4 qhv = vload4(0, qh);

-    sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[0+64] * ((float)((q1[0]  >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[0+96] * ((float)((q2[0]  >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f);
+    int4 q1i = convert_int4(q1v);
+    int4 q2i = convert_int4(q2v);
+    int4 qhi = convert_int4(qhv);

-    sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[1+64] * ((float)((q1[1]  >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[1+96] * ((float)((q2[1]  >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f);
+    // Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the
+    // matching 2-bit plane of qh), same arithmetic as the scalar version, then dot()
+    // against the cached activation lanes.
+    float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f;
+    float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f;
+    float4 w2 = convert_float4((q1i >> 4)  | ((qhi & Q6_K_MASK3)     )) - 32.f;
+    float4 w3 = convert_float4((q2i >> 4)  | ((qhi & Q6_K_MASK4) >> 2)) - 32.f;

-    sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[2+64] * ((float)((q1[2]  >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[2+96] * ((float)((q2[2]  >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f);
-
-    sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f);
-    sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f);
-    sums.s2 += y[3+64] * ((float)((q1[3]  >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f);
-    sums.s3 += y[3+96] * ((float)((q2[3]  >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f);
-
-    sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
-
-    return sumf;
+    return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] +
+                   dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]);
 }

 #undef N_DST
@@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat(
 #define N_SIMDGROUP 2
 #define N_SIMDWIDTH 16
 #elif defined (ADRENO_GPU)
-#define N_DST 4
+#define N_DST 16
 #define N_SIMDGROUP 2
 #define N_SIMDWIDTH 64
 #endif
@@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat(
    global half  * blk_d      = (global half  *) src0_d  + offset_src0_d;
    global float * yy         = (global float *) src1    + r1*ne10 + im*ne00*ne1;

-    int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
-    int ix  = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
+    int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15
+    int ix  = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1
    int ip  = tid/8;   // first or second half of (super) block (0 or 1)
    int il  = tid%8;   // each half has 8 parts, one per scale
    int n   = 4;       // 4 scales at a time (and 4 sums)
    int l0  = n*il;    // offset into half-block, 0..28
    int is  = 8*ip + l0/16; // 0, 1, 8, 9

-    float4 sumf = 0;
+    float sumf[N_DST];
+    for (int row = 0; row < N_DST; row++) {
+        sumf[row] = 0.f;
+    }

    for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
-        if (first_row + 0 < ne01) {
-            sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0);
-        }
-        if (first_row + 1 < ne01) {
-            sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0);
-        }
-        if (first_row + 2 < ne01) {
-            sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0);
-        }
-        if (first_row + 3 < ne01) {
-            sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0);
+        global float * y = yy + ib * QK_K + 128*ip + l0;
+        float4 y0 = vload4(0, y +  0);
+        float4 y1 = vload4(0, y + 32);
+        float4 y2 = vload4(0, y + 64);
+        float4 y3 = vload4(0, y + 96);
+
+        for (int row = 0; row < N_DST; row++) {
+            if (first_row + row < ne01) {
+                sumf[row] += block_q_6_K_dot_y_flat(
+                    blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb,
+                    ib, ip, is, l0, y0, y1, y2, y3);
+            }
        }
    }

-    float4 tot = (float4)(
-        sub_group_reduce_add(sumf.s0),
-        sub_group_reduce_add(sumf.s1),
-        sub_group_reduce_add(sumf.s2),
-        sub_group_reduce_add(sumf.s3)
-    );
-    if (get_sub_group_local_id() == 0) {
-        if (first_row + 0 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
-        }
-        if (first_row + 1 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
-        }
-        if (first_row + 2 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
-        }
-        if (first_row + 3 < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+    for (int row = 0; row < N_DST; row++) {
+        float tot = sub_group_reduce_add(sumf[row]);
+        if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
        }
    }
 }
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -341,7 +341,7 @@ llama_context::llama_context(
        // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
        bool pipeline_parallel =
            model.n_devices() > 1 &&
-            model.n_gpu_layers() > model.hparams.n_layer() &&
+            model.n_gpu_layers() > model.hparams.n_layer_all &&
            model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
            cparams.offload_kqv &&
            !model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {

        // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
        // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
+        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all;
        if (ubatch.n_tokens < 32 || full_offload) {
            if (il != -1 && strcmp(name, "norm") == 0) {
                const auto & dev_layer = model.dev_layer(il);
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -684,6 +684,20 @@ static common_chat_tool config_tool{
    })",
 };

+static common_chat_tool calendar_create_event_tool{
+    /* .name = */ "Calendar.create_event",
+    /* .description = */ "Create a calendar event",
+    /* .parameters = */ R"({
+        "type": "object",
+        "properties": {
+            "title": { "type": "string" },
+            "participants": { "type": "array", "items": { "type": "string" } },
+            "metadata": { "type": "object" }
+        },
+        "required": ["title", "participants", "metadata"]
+    })",
+};
+
 static common_chat_tool imaginary_number_tool{
    /* .name = */ "imaginary_number",
    /* .description = */ "Imaginary number converter",
@@ -4130,7 +4144,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();
    }

-    // LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens
+    // LFM2.5 tests - format <|tool_call_start|>[name(args)]<|tool_call_end|>
    {
        auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug);

@@ -4138,19 +4152,57 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
        tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();

        // Single tool call without reasoning
-        tst.test("[special_function(arg1=1)]")
+        tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
            .tools({ special_function_tool })
            .expect(message_assist_call)
            .run();

        // Tool call with string argument
-        tst.test("[get_time(city=\"XYZCITY\")]")
+        tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
            .tools({ get_time_tool })
            .expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
            .run();

+        // Python literals become JSON.
+        tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>")
+            .tools({ toggle_tool })
+            .expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
+            .run();
+
+        tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>")
+            .tools({ nullable_tool })
+            .expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
+            .run();
+
+        // Nested Python literal.
+        tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>")
+            .tools({ config_tool })
+            .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})"))
+            .run();
+
+        // JSON literals are accepted too.
+        tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>")
+            .tools({ config_tool })
+            .expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})"))
+            .run();
+
+        // Dotted function name with structured args.
+        tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], "
+                 "metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>")
+            .tools({ calendar_create_event_tool })
+            .expect(message_with_tool_calls(
+                "Calendar.create_event",
+                R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})"))
+            .run();
+
+        // Markdown links stay content.
+        tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")
+            .tools({ get_time_tool })
+            .expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
+            .run();
+
        // Tool call with reasoning (enable_thinking=true)
-        tst.test("<think>I'm\nthinking</think>[special_function(arg1=1)]")
+        tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
            .enable_thinking(true)
            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
            .tools({ special_function_tool })
@@ -4158,7 +4210,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();

        // Multiple tool calls (parallel)
-        tst.test("[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]")
+        tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
            .parallel_tool_calls(true)
            .tools({
                special_function_tool, special_function_tool_with_optional_param
@@ -4170,7 +4222,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();

        // Tool call with content before tool call
-        tst.test("Let me check the time.[get_time(city=\"Paris\")]")
+        tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
            .tools({ get_time_tool })
            .expect(message_with_reasoning_content_and_multiple_tool_calls(
                "", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
@@ -4178,14 +4230,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
            .run();

        // Partial tool call (streaming)
-        tst.test("[special_function(arg1=")
+        tst.test("<|tool_call_start|>[special_function(arg1=")
            .tools({ special_function_tool })
            .is_partial(true)
            .expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
            .run();

        // Tool call with empty arguments
-        tst.test("[empty_args()]")
+        tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
            .tools({ empty_args_tool })
            .expect(simple_assist_msg("", "", "empty_args", "{}"))
            .run();
Author	SHA1	Message	Date
Sigbjørn Skjæret	603300b008	context : fix off-by-one comparisons to n_gpu_layers (#24208 )	2026-06-06 07:06:47 +03:00
lhez	308f61c31f	opencl: improve get_rows, cpy, concat and q6_k flat gemv (#24160 ) * opencl: allow multiple workgroups for large rows * opencl: improve small cpy * opencl: packed concat for small input * opencl: tweak flat q6_K gemv, increase N_DST and remap threads	2026-06-05 13:45:25 -07:00
Tarek Dakhran	da87e9b612	common/chat : unify and fix LFM2/LFM2.5 tool parser (#24178 )	2026-06-05 14:31:56 -05:00