Compare commits

...

3 Commits
b9534 ... b9537

Author SHA1 Message Date
Sigbjørn Skjæret
603300b008 context : fix off-by-one comparisons to n_gpu_layers (#24208) 2026-06-06 07:06:47 +03:00
lhez
308f61c31f opencl: improve get_rows, cpy, concat and q6_k flat gemv (#24160)
* opencl: allow multiple workgroups for large rows

* opencl: improve small cpy

* opencl: packed concat for small input

* opencl: tweak flat q6_K gemv, increase N_DST and remap threads
2026-06-05 13:45:25 -07:00
Tarek Dakhran
da87e9b612 common/chat : unify and fix LFM2/LFM2.5 tool parser (#24178) 2026-06-05 14:31:56 -05:00
10 changed files with 389 additions and 218 deletions

View File

@@ -87,6 +87,8 @@ static std::string normalize_quotes_to_json(const std::string & input) {
bool in_single_quoted = false;
bool in_double_quoted = false;
auto is_word_char = [](char ch) { return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_'; };
for (size_t i = 0; i < input.size(); ++i) {
char c = input[i];
@@ -151,6 +153,29 @@ static std::string normalize_quotes_to_json(const std::string & input) {
in_single_quoted = true;
result += '"';
}
} else if (!in_single_quoted && !in_double_quoted && (c == 'T' || c == 'F' || c == 'N') &&
(i == 0 || !is_word_char(input[i - 1]))) {
// Python literals -> JSON; prefix match keeps streamed partials monotonic.
static constexpr std::pair<std::string_view, std::string_view> literals[] = {
{ "True", "true" }, { "False", "false" }, { "None", "null" },
};
size_t n = 0;
while (i + n < input.size() && is_word_char(input[i + n])) {
++n;
}
std::string_view token(input.data() + i, n);
bool matched = false;
for (const auto & [py, js] : literals) {
if (py.substr(0, n) == token) {
result += js.substr(0, n);
i += n - 1;
matched = true;
break;
}
}
if (!matched) {
result += c;
}
} else {
result += c;
}
@@ -353,12 +378,8 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
}
value_to_add += escape_json_string_inner(value_content);
} else if (!value_content.empty()) {
// For potential containers, normalize Python-style single quotes to JSON double quotes
bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
if (is_potential_container) {
value_content = normalize_container_value(value_content);
}
value_to_add += value_content;
// Pythonic scalars/containers -> JSON.
value_to_add += normalize_container_value(value_content);
}
args_target() += value_to_add;
@@ -466,11 +487,34 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
return force_tool_calls ? section : optional(section);
}
// Like python_value(), but the leaf also accepts JSON-cased true/false/null, used by LFM2/LFM2.5
common_peg_parser common_chat_peg_builder::python_or_json_value() {
return rule("python-or-json-value", [this]() {
auto ws = space();
auto value = python_or_json_value();
auto member = sequence({ python_string(), ws, literal(":"), ws, value });
auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) });
auto dict = rule("python-or-json-dict", [&]() {
return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }), ws });
});
auto elements = sequence({ value, zero_or_more(sequence({ literal(","), ws, value })) });
auto array = rule("python-or-json-array", [&]() {
return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }), ws });
});
return choice({ dict, array, python_string(), python_number(),
python_bool(), python_null(), json_bool(), json_null() });
});
}
// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
const ordered_json & tools,
bool parallel_tool_calls) {
bool parallel_tool_calls,
bool allow_json_literals) {
if (!tools.is_array() || tools.empty()) {
return eps();
}
@@ -504,7 +548,7 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
if (is_string_type) {
arg_value_parser = string_value_parser;
} else {
arg_value_parser = tool_arg_value(python_value());
arg_value_parser = tool_arg_value(allow_json_literals ? python_or_json_value() : python_value());
}
// Full argument: name="value" or name=value

View File

@@ -132,9 +132,13 @@ class common_chat_peg_builder : public common_peg_parser_builder {
// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
bool parallel_tool_calls);
bool parallel_tool_calls,
bool allow_json_literals);
private:
// Python values plus JSON true/false/null.
common_peg_parser python_or_json_value();
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
const std::string & args_key,
@@ -195,4 +199,3 @@ struct tagged_peg_parser {
tagged_peg_parser build_tagged_peg_parser(
const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);

View File

@@ -1608,42 +1608,40 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}
// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
// - Reasoning: <think>{reasoning}</think> (optional)
// - Content: text before a tool call (optional)
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
// Tool calls can appear multiple times (parallel tool calls supported)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
// LFM2/LFM2.5 parser. Tool calls are almost Python-style and parallel-capable
// (except dotted names and JSON literals true/false/null).
// Always wrapped in <|tool_call_start|>[name(args)]<|tool_call_end|> with optional <think> reasoning.
// tool_list_tokens preserves LFM2 system tool-list markers.
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs,
bool tool_list_tokens) {
common_chat_params data;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string TOOL_LIST_START = "<|tool_list_start|>";
const std::string TOOL_LIST_END = "<|tool_list_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_list_start|>",
"<|tool_list_end|>",
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
data.preserved_tokens = { TOOL_CALL_START, TOOL_CALL_END, THINK_START, THINK_END };
if (tool_list_tokens) {
data.preserved_tokens.push_back(TOOL_LIST_START);
data.preserved_tokens.push_back(TOOL_LIST_END);
}
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string TOOL_CALL_START = "<|tool_call_start|>";
const std::string TOOL_CALL_END = "<|tool_call_end|>";
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
if (inputs.has_continuation()) {
const auto & msg = inputs.continue_msg;
@@ -1670,7 +1668,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call",
p.literal(TOOL_CALL_START) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls, /* allow_json_literals = */ true) +
p.literal(TOOL_CALL_END)
)
);
@@ -1697,93 +1695,6 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
};
}
return data;
}
// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens.
// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>.
// - Reasoning: <think>{reasoning}</think> (optional)
// - Content: text before a tool call (optional)
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
// Tool calls can appear multiple times (parallel tool calls supported)
static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
data.preserved_tokens = {
"<|tool_call_start|>",
"<|tool_call_end|>",
"<think>",
"</think>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
const std::string THINK_START = "<think>";
const std::string THINK_END = "</think>";
const std::string GEN_PROMPT = "<|im_start|>assistant\n";
data.thinking_start_tag = THINK_START;
data.thinking_end_tag = THINK_END;
if (inputs.has_continuation()) {
const auto & msg = inputs.continue_msg;
data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
data.generation_prompt += THINK_END + msg.render_content();
}
data.prompt += data.generation_prompt;
}
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto generation_prompt = p.literal(GEN_PROMPT);
auto end = p.end();
auto reasoning = p.eps();
if (extract_reasoning && inputs.enable_thinking) {
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
}
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
return generation_prompt + reasoning + p.content(p.rest()) + end;
}
auto tool_calls = p.rule("tool-calls",
p.trigger_rule("tool-call",
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
)
);
auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["}));
auto maybe_start = p.optional(p.literal("<|tool_call_start|>"));
return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
auto schema = function.at("parameters");
builder.resolve_refs(schema);
});
parser.build_grammar(builder, data.grammar_lazy);
});
foreach_function(inputs.tools, [&](const json & tool) {
const std::string name = tool.at("function").at("name");
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" });
});
}
return data;
}
@@ -2298,14 +2209,14 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
if (is_lfm2_template(src)) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true);
}
// LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
if (src.find("List of tools: [") != std::string::npos &&
src.find("<|tool_list_start|>") == std::string::npos) {
LOG_DBG("Using specialized template: LFM2.5\n");
return common_chat_params_init_lfm2_5(tmpl, params);
return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ false);
}
// GigaChatV3 format detection

View File

@@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
cl_kernel kernel_mul_mat_f32_f32;
cl_kernel kernel_mul_mat_f16_f16;
cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -639,7 +639,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
cl_kernel kernel_upscale;
cl_kernel kernel_upscale_bilinear;
cl_kernel kernel_concat_f32;
cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
cl_kernel kernel_conv_2d_f16;
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
@@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
GGML_LOG_CONT(".");
}
@@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
@@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
nth *= 2;
}
size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
int nchunks = 1;
if (src0->type == GGML_TYPE_F32) {
const int chunk_target = nth * 4;
nchunks = (ne00 + chunk_target - 1) / chunk_target;
nchunks = MAX(1, MIN(nchunks, 64));
}
size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
int nth = MIN(64, ne0);
cl_kernel kernel = backend_ctx->kernel_concat_f32;
const bool concat_pack = (dim == 0 && ne0 < 32);
cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
: backend_ctx->kernel_concat_f32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};
if (concat_pack) {
// packed kernel needs the dst dims to unflatten its 1-D row index.
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
const int base = MIN(64, maxwg);
const int tpr = MIN(ne0, base); // threads per row
const int rpw = MAX(1, base / tpr); // rows per workgroup
const int lsz = tpr * rpw;
const int nrows = ne1*ne2*ne3;
const int nwg = (nrows + rpw - 1) / rpw;
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
size_t local_work_size[] = {(size_t)lsz, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
} else {
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 2;
ndst = 4;
ndst = 16;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
@@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
kernel = backend_ctx->kernel_cpy_f32_f16;
break;
case GGML_TYPE_F32:
kernel = backend_ctx->kernel_cpy_f32_f32;
kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
: backend_ctx->kernel_cpy_f32_f32;
break;
default:
GGML_ASSERT(false && "not implemented");
@@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
const int nth = MIN(64, ne00);
if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
const int base = MIN(64, maxwg);
const int tpr = MIN(ne00, base); // threads per row
const int rpw = MAX(1, base / tpr); // rows per workgroup
const int lsz = tpr * rpw; // <= base <= maxwg
const int nrows = ne01*ne02*ne03;
const int nwg = (nrows + rpw - 1) / rpw;
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
size_t local_work_size[] = {(size_t)lsz, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
} else {
const int nth = MIN(64, ne00);
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
}
}
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

View File

@@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
*y = *x;
}
}
kernel void kernel_concat_f32_pack(
global const char * src0,
ulong offset0,
global const char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int dim,
int ne1,
int ne2,
int ne3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int lsz = get_local_size(0);
int tpr = min(ne0, lsz); // threads per row
int rpw = lsz / tpr; // rows per workgroup
int lid = get_local_id(0);
int row = get_group_id(0)*rpw + lid / tpr;
int lane = lid - (lid / tpr) * tpr;
int nrows = ne1*ne2*ne3;
if (row >= nrows) {
return;
}
int i1 = row % ne1;
int t = row / ne1;
int i2 = t % ne2;
int i3 = t / ne2;
int o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
for (int i0 = lane; i0 < ne0; i0 += tpr) {
global const float * x;
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
} else {
x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
}
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*y = *x;
}
}

View File

@@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
}
}
kernel void kernel_cpy_f32_f32_pack(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
int lsz = get_local_size(0);
int tpr = min(ne00, lsz); // threads per row
int rpw = lsz / tpr; // rows per workgroup
int lid = get_local_id(0);
int row = get_group_id(0)*rpw + lid / tpr;
int lane = lid - (lid / tpr) * tpr;
int nrows = ne01*ne02*ne03;
if (row >= nrows) {
return;
}
int i01 = row % ne01;
int t = row / ne01;
int i02 = t % ne02;
int i03 = t / ne02;
// linear index of the first element of this row, unflattened over dst dims
long n = (long)row * ne00;
int i3 = (int)(n / ((long)ne2*ne1*ne0));
long rm = n - (long)i3*ne2*ne1*ne0;
int i2 = (int)(rm / ((long)ne1*ne0));
rm -= (long)i2*ne1*ne0;
int i1 = (int)(rm / ne0);
int i0 = (int)(rm - (long)i1*ne0);
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int i00 = lane; i00 < ne00; i00 += tpr) {
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0];
}
}
kernel void kernel_cpy_i32_i32(
global int * src0,
ulong offset0,

View File

@@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
src1 = (global int*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int nchunks = get_num_groups(0) / ne10;
int g = get_group_id(0);
int i10 = g / nchunks;
int chunk = g - i10 * nchunks;
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
int i03 = i12;
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
if (ind >= ne00) {
return;
}
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1);
global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);
int span = (ne00 + nchunks - 1) / nchunks;
int start = chunk * span;
int end = min(start + span, ne00);
for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
dst_row[ind] = src_row[ind];
}
}

View File

@@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat(
global uchar * blk_qh,
global char * blk_scales,
global half * blk_d,
global float * yy,
int ib,
int ip,
int is,
int l0
int l0,
float4 y0,
float4 y1,
float4 y2,
float4 y3
) {
int y_offset = 128*ip + l0;
int q_offset_l = 64*ip + l0;
int q_offset_h = 32*ip + l0;
@@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat(
global uchar * qh = blk_qh + ib*64 + q_offset_h;
global char * sc = blk_scales + ib*16 + is;
global float * y = yy + ib * QK_K + y_offset;
float dall = blk_d[ib];
float sumf = 0;
float4 sums = {0.f, 0.f, 0.f, 0.f};
// Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads.
// q_offset_l/h are 4-aligned, so these are aligned vector loads.
uchar4 q1v = vload4(0, q1);
uchar4 q2v = vload4(0, q2);
uchar4 qhv = vload4(0, qh);
sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f);
int4 q1i = convert_int4(q1v);
int4 q2i = convert_int4(q2v);
int4 qhi = convert_int4(qhv);
sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f);
// Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the
// matching 2-bit plane of qh), same arithmetic as the scalar version, then dot()
// against the cached activation lanes.
float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f;
float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f;
float4 w2 = convert_float4((q1i >> 4) | ((qhi & Q6_K_MASK3) )) - 32.f;
float4 w3 = convert_float4((q2i >> 4) | ((qhi & Q6_K_MASK4) >> 2)) - 32.f;
sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f);
sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f);
sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
return sumf;
return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] +
dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]);
}
#undef N_DST
@@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat(
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_DST 16
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 64
#endif
@@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat(
global half * blk_d = (global half *) src0_d + offset_src0_d;
global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15
int ix = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1
int ip = tid/8; // first or second half of (super) block (0 or 1)
int il = tid%8; // each half has 8 parts, one per scale
int n = 4; // 4 scales at a time (and 4 sums)
int l0 = n*il; // offset into half-block, 0..28
int is = 8*ip + l0/16; // 0, 1, 8, 9
float4 sumf = 0;
float sumf[N_DST];
for (int row = 0; row < N_DST; row++) {
sumf[row] = 0.f;
}
for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
if (first_row + 0 < ne01) {
sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0);
}
if (first_row + 1 < ne01) {
sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0);
}
if (first_row + 2 < ne01) {
sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0);
}
if (first_row + 3 < ne01) {
sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0);
global float * y = yy + ib * QK_K + 128*ip + l0;
float4 y0 = vload4(0, y + 0);
float4 y1 = vload4(0, y + 32);
float4 y2 = vload4(0, y + 64);
float4 y3 = vload4(0, y + 96);
for (int row = 0; row < N_DST; row++) {
if (first_row + row < ne01) {
sumf[row] += block_q_6_K_dot_y_flat(
blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb,
ib, ip, is, l0, y0, y1, y2, y3);
}
}
}
float4 tot = (float4)(
sub_group_reduce_add(sumf.s0),
sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2),
sub_group_reduce_add(sumf.s3)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
for (int row = 0; row < N_DST; row++) {
float tot = sub_group_reduce_add(sumf[row]);
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
}
}
}

View File

@@ -341,7 +341,7 @@ llama_context::llama_context(
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
model.n_gpu_layers() > model.hparams.n_layer() &&
model.n_gpu_layers() > model.hparams.n_layer_all &&
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);

View File

@@ -684,6 +684,20 @@ static common_chat_tool config_tool{
})",
};
static common_chat_tool calendar_create_event_tool{
/* .name = */ "Calendar.create_event",
/* .description = */ "Create a calendar event",
/* .parameters = */ R"({
"type": "object",
"properties": {
"title": { "type": "string" },
"participants": { "type": "array", "items": { "type": "string" } },
"metadata": { "type": "object" }
},
"required": ["title", "participants", "metadata"]
})",
};
static common_chat_tool imaginary_number_tool{
/* .name = */ "imaginary_number",
/* .description = */ "Imaginary number converter",
@@ -4130,7 +4144,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
}
// LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens
// LFM2.5 tests - format <|tool_call_start|>[name(args)]<|tool_call_end|>
{
auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug);
@@ -4138,19 +4152,57 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
// Single tool call without reasoning
tst.test("[special_function(arg1=1)]")
tst.test("<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.tools({ special_function_tool })
.expect(message_assist_call)
.run();
// Tool call with string argument
tst.test("[get_time(city=\"XYZCITY\")]")
tst.test("<|tool_call_start|>[get_time(city=\"XYZCITY\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
.run();
// Python literals become JSON.
tst.test("<|tool_call_start|>[toggle(enabled=True)]<|tool_call_end|>")
.tools({ toggle_tool })
.expect(message_with_tool_calls("toggle", R"({"enabled": true})"))
.run();
tst.test("<|tool_call_start|>[set_nullable(value=None)]<|tool_call_end|>")
.tools({ nullable_tool })
.expect(message_with_tool_calls("set_nullable", R"({"value": null})"))
.run();
// Nested Python literal.
tst.test("<|tool_call_start|>[set_config(config={\"enabled\": True, \"count\": 3})]<|tool_call_end|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "count": 3}})"))
.run();
// JSON literals are accepted too.
tst.test("<|tool_call_start|>[set_config(config={\"enabled\": true, \"note\": null})]<|tool_call_end|>")
.tools({ config_tool })
.expect(message_with_tool_calls("set_config", R"({"config": {"enabled": true, "note": null}})"))
.run();
// Dotted function name with structured args.
tst.test("<|tool_call_start|>[Calendar.create_event(title=\"demo\", participants=[\"Alice\", \"Bob\"], "
"metadata={\"priority\": \"high\", \"reminder\": true})]<|tool_call_end|>")
.tools({ calendar_create_event_tool })
.expect(message_with_tool_calls(
"Calendar.create_event",
R"({"title": "demo", "participants": ["Alice", "Bob"], "metadata": {"priority": "high", "reminder": true}})"))
.run();
// Markdown links stay content.
tst.test("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")
.tools({ get_time_tool })
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
.run();
// Tool call with reasoning (enable_thinking=true)
tst.test("<think>I'm\nthinking</think>[special_function(arg1=1)]")
tst.test("<think>I'm\nthinking</think><|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
@@ -4158,7 +4210,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
// Multiple tool calls (parallel)
tst.test("[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]")
tst.test("<|tool_call_start|>[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]<|tool_call_end|>")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
@@ -4170,7 +4222,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
// Tool call with content before tool call
tst.test("Let me check the time.[get_time(city=\"Paris\")]")
tst.test("Let me check the time.<|tool_call_start|>[get_time(city=\"Paris\")]<|tool_call_end|>")
.tools({ get_time_tool })
.expect(message_with_reasoning_content_and_multiple_tool_calls(
"", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
@@ -4178,14 +4230,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.run();
// Partial tool call (streaming)
tst.test("[special_function(arg1=")
tst.test("<|tool_call_start|>[special_function(arg1=")
.tools({ special_function_tool })
.is_partial(true)
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
.run();
// Tool call with empty arguments
tst.test("[empty_args()]")
tst.test("<|tool_call_start|>[empty_args()]<|tool_call_end|>")
.tools({ empty_args_tool })
.expect(simple_assist_msg("", "", "empty_args", "{}"))
.run();