Compare commits

..

1 Commits

Author SHA1 Message Date
Ruben Ortlam 037397792a vulkan: split ggml-vulkan.cpp file 2026-06-22 15:50:01 +02:00
219 changed files with 30523 additions and 32291 deletions
+1 -1
View File
@@ -10,7 +10,7 @@
# ggml-org/ggml-rpc : rgerganov
# ggml-org/ggml-sycl : arthw
# ggml-org/ggml-vulkan : 0cc4m, jeffbolznv
# ggml-org/ggml-webgpu : reeselevine, yomaytk
# ggml-org/ggml-webgpu : reeselevine
# ggml-org/ggml-zdnn : taronaeo
# ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
# ggml-org/llama-mtmd : ngxson
+1 -3
View File
@@ -142,9 +142,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2)
- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25)
- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
- [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
- [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
+2
View File
@@ -80,6 +80,8 @@ add_library(${TARGET}
http.h
imatrix-loader.cpp
imatrix-loader.h
json-partial.cpp
json-partial.h
json-schema-to-grammar.cpp
llguidance.cpp
log.cpp
+5 -14
View File
@@ -301,8 +301,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
const common_download_opts & opts) {
handle_model_result result;
// TODO @ngxson : refactor this into a new common_model_download_context
if (!model.docker_repo.empty()) {
model.path = common_docker_resolve_model(model.docker_repo);
} else if (!model.hf_repo.empty()) {
@@ -398,7 +396,7 @@ static bool parse_bool_value(const std::string & value) {
// CLI argument parsing functions
//
bool common_params_handle_models(common_params & params, llama_example curr_ex, const common_params_handle_models_params & handle_params) {
bool common_params_handle_models(common_params & params, llama_example curr_ex) {
const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
params.speculative.types.end(),
COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
@@ -409,11 +407,6 @@ bool common_params_handle_models(common_params & params, llama_example curr_ex,
opts.skip_download = params.skip_download;
opts.download_mtp = spec_type_draft_mtp;
opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty();
opts.preset_only = handle_params.preset_only;
if (handle_params.callback) {
opts.callback = handle_params.callback;
}
// sub-models (draft, mmproj, vocoder) are explicitly specified by the user,
// so we should not auto-discover mtp/mmproj siblings for them
@@ -591,19 +584,17 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
const bool skip_model_download =
// server will call common_params_handle_models() later, so we skip it here
ctx_arg.ex == LLAMA_EXAMPLE_SERVER ||
// export_graph_ops loads only metadata
ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
// export_graph_ops loads only metadata
const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;
if (!skip_model_download) {
// handle model and download
common_params_handle_models(params, ctx_arg.ex, {});
common_params_handle_models(params, ctx_arg.ex);
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty()
&& ctx_arg.ex != LLAMA_EXAMPLE_SERVER
&& !params.usage
&& !params.completion) {
throw std::invalid_argument("error: --model is required\n");
+1 -10
View File
@@ -1,7 +1,6 @@
#pragma once
#include "common.h"
#include "download.h"
#include <set>
#include <map>
@@ -130,19 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
// see: https://github.com/ggml-org/llama.cpp/issues/18163
void common_params_add_preset_options(std::vector<common_arg> & args);
struct common_params_handle_models_params {
common_download_callback * callback = nullptr;
bool preset_only = false; // if true, only check & download remote preset (for router mode)
};
// populate model paths (main model, mmproj, etc) from -hf if necessary
// return true if the model is ready to use
// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
bool common_params_handle_models(
common_params & params,
llama_example curr_ex,
const common_params_handle_models_params & handle_params);
bool common_params_handle_models(common_params & params, llama_example curr_ex);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+53 -103
View File
@@ -90,93 +90,41 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
return text;
}
common_chat_role common_chat_role_from_string(const std::string & role) {
if (role == "system") { return COMMON_CHAT_ROLE_SYSTEM; }
if (role == "assistant") { return COMMON_CHAT_ROLE_ASSISTANT; }
if (role == "user") { return COMMON_CHAT_ROLE_USER; }
if (role == "tool") { return COMMON_CHAT_ROLE_TOOL; }
return COMMON_CHAT_ROLE_UNKNOWN;
}
const char * common_chat_role_to_string(common_chat_role role) {
switch (role) {
case COMMON_CHAT_ROLE_SYSTEM: return "system";
case COMMON_CHAT_ROLE_ASSISTANT: return "assistant";
case COMMON_CHAT_ROLE_USER: return "user";
case COMMON_CHAT_ROLE_TOOL: return "tool";
case COMMON_CHAT_ROLE_UNKNOWN: return "";
}
return "";
}
json common_chat_msg_delimiters::to_json() const {
json result = json::array();
for (const auto & d : delimiters) {
result.push_back({
{ "role", common_chat_role_to_string(d.role) },
{ "delimiter", d.delimiter },
});
}
return result;
}
common_chat_msg_delimiters common_chat_msg_delimiters_parse(const json & delimiters) {
common_chat_msg_delimiters result;
if (!delimiters.is_array()) {
return result;
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
if (delims.empty() || prompt.empty()) {
return {};
}
result.delimiters.reserve(delimiters.size());
for (const auto & d : delimiters) {
if (!d.is_object()) {
continue;
auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
std::vector<std::string> all_delims;
std::vector<common_peg_parser> tagged_messages;
all_delims.reserve(delims.size());
tagged_messages.reserve(delims.size());
for (const auto & d : delims) {
all_delims.push_back(d.delimiter);
}
result.delimiters.push_back({
common_chat_role_from_string(d.value("role", std::string())),
d.value("delimiter", std::string()),
});
}
return result;
}
void common_chat_msg_delimiters::tokenize(const llama_vocab * vocab) {
for (auto & d : delimiters) {
d.tokens = common_tokenize(vocab, d.delimiter, false, true);
}
}
common_chat_msg_spans common_chat_msg_delimiters::split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips) const {
std::vector<std::pair<common_chat_role, size_t>> matches;
auto skip = skips.begin();
for (size_t i = 0; i < tokens.size();) {
if (skip != skips.end() && i == skip->first) {
i += skip->second;
++skip;
continue;
auto any_delim = p.until_one_of(all_delims);
for (const auto & d : delims) {
tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
}
for (const auto & d : delimiters) {
if (i + d.tokens.size() > tokens.size()) {
continue;
}
if (std::equal(d.tokens.begin(), d.tokens.end(), tokens.begin() + i)) {
matches.emplace_back(d.role, i);
break;
}
return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
});
common_peg_parse_context ctx(prompt);
const auto result = parser.parse(ctx);
if (!result.success()) {
return {};
}
std::vector<common_chat_msg_span> spans;
ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
if (!node.tag.empty()) {
spans.push_back({ node.tag, node.start, node.end - node.start });
}
i++;
}
matches.emplace_back(COMMON_CHAT_ROLE_UNKNOWN, tokens.size());
common_chat_msg_spans spans;
for (size_t i = 0; i + 1 < matches.size(); i++) {
const auto & curr = matches[i];
const auto & next = matches[i + 1];
spans.add(curr.first, curr.second, next.second - curr.second);
}
});
return spans;
}
@@ -1133,13 +1081,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
data.prompt = prompt;
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
data.message_delimiters = {
{ COMMON_CHAT_ROLE_ASSISTANT, "<|start|>assistant" },
{ COMMON_CHAT_ROLE_USER, "<|start|>user" },
{ COMMON_CHAT_ROLE_SYSTEM, "<|start|>developer" },
{ COMMON_CHAT_ROLE_SYSTEM, "<|start|>system" },
{ COMMON_CHAT_ROLE_TOOL, "<|start|>functions" },
};
data.message_spans = common_chat_split_by_role(prompt, {
{ "assistant", "<|start|>assistant" },
{ "user", "<|start|>user" },
{ "system", "<|start|>developer" },
{ "system", "<|start|>system" },
{ "tool", "<|start|>functions" },
});
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.supports_thinking = true;
@@ -1280,10 +1228,10 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
data.prompt += data.generation_prompt;
}
data.message_delimiters = {
{ COMMON_CHAT_ROLE_USER, "<|turn>user" },
{ COMMON_CHAT_ROLE_ASSISTANT, "<|turn>model" },
};
data.message_spans = common_chat_split_by_role(data.prompt, {
{ "user", "<|turn>user\n" },
{ "assistant", "<|turn>model\n" },
});
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
data.supports_thinking = true;
@@ -2082,15 +2030,15 @@ static common_chat_params common_chat_params_init_cohere2moe(const common_chat_t
RESULT_START, RESULT_END,
};
// Declare per-role message delimiters. Tool results are rendered with the
// Split the rendered prompt into per-role message spans. Tool results are rendered with the
// system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before
// the plain "system" one (it is a strict superset, and the role split tries delimiters in order).
data.message_delimiters = {
{ COMMON_CHAT_ROLE_ASSISTANT, GEN_PREFIX },
{ COMMON_CHAT_ROLE_USER, TURN_START + USER },
{ COMMON_CHAT_ROLE_TOOL, TURN_START + SYSTEM + RESULT_START },
{ COMMON_CHAT_ROLE_SYSTEM, TURN_START + SYSTEM },
};
data.message_spans = common_chat_split_by_role(data.prompt, {
{ "assistant", GEN_PREFIX },
{ "user", TURN_START + USER },
{ "tool", TURN_START + SYSTEM + RESULT_START },
{ "system", TURN_START + SYSTEM },
});
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
@@ -2578,15 +2526,17 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
autoparser.analyze_template(tmpl);
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
common_chat_msg_delimiters delimiters;
std::vector<common_chat_msg_delimiter> delimiters;
if (!autoparser.assistant_start.empty()) {
delimiters.add(COMMON_CHAT_ROLE_ASSISTANT, autoparser.assistant_start);
delimiters.push_back({ "assistant", autoparser.assistant_start });
}
if (!autoparser.user_start.empty()) {
delimiters.add(COMMON_CHAT_ROLE_USER, autoparser.user_start);
delimiters.push_back({ "user", autoparser.user_start });
}
auto_params.message_delimiters = std::move(delimiters);
if (!delimiters.empty()) {
auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
}
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
if (auto_params.supports_thinking) {
+6 -65
View File
@@ -143,75 +143,15 @@ struct common_chat_msg_diff {
}
};
enum common_chat_role {
COMMON_CHAT_ROLE_UNKNOWN,
COMMON_CHAT_ROLE_SYSTEM,
COMMON_CHAT_ROLE_ASSISTANT,
COMMON_CHAT_ROLE_USER,
COMMON_CHAT_ROLE_TOOL
};
common_chat_role common_chat_role_from_string(const std::string & role);
const char * common_chat_role_to_string(common_chat_role role);
struct common_chat_msg_span {
common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
std::string role;
std::size_t pos = 0;
std::size_t len = 0;
bool valid() const {
return role != COMMON_CHAT_ROLE_UNKNOWN;
}
};
struct common_chat_msg_spans {
std::vector<common_chat_msg_span> spans;
void add(common_chat_role role, size_t pos, size_t len) {
spans.push_back({ role, pos, len });
}
bool is_user_start(int32_t pos) const {
for (auto it = spans.begin(); it != spans.end(); ++it) {
if (it->role == COMMON_CHAT_ROLE_USER && pos == (int32_t) it->pos) {
return true;
}
}
return false;
}
int32_t last_user_message_pos() const {
for (auto it = spans.rbegin(); it != spans.rend(); ++it) {
if (it->role == COMMON_CHAT_ROLE_USER) {
return (int32_t) it->pos;
}
}
return -1;
}
};
struct common_chat_msg_delimiter {
common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN;
std::string delimiter;
llama_tokens tokens = {};
};
struct common_chat_msg_delimiters {
std::vector<common_chat_msg_delimiter> delimiters;
common_chat_msg_delimiters() = default;
common_chat_msg_delimiters(std::initializer_list<common_chat_msg_delimiter> delims) : delimiters(delims) {}
void add(common_chat_role role, const std::string & delimiter) {
delimiters.push_back({ role, delimiter });
}
void tokenize(const llama_vocab * vocab);
// split tokens into message spans. skips maps a start index to a length of a region to jump over without matching
common_chat_msg_spans split(const llama_tokens & tokens, const std::map<size_t, size_t> & skips = {}) const;
nlohmann::ordered_json to_json() const;
std::string role;
std::string delimiter;
};
struct common_chat_tool {
@@ -279,7 +219,7 @@ struct common_chat_params {
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
std::string parser;
common_chat_msg_delimiters message_delimiters;
std::vector<common_chat_msg_span> message_spans;
};
// per-message parsing syntax
@@ -385,4 +325,5 @@ struct common_chat_prompt_preset {
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
common_chat_msg_delimiters common_chat_msg_delimiters_parse(const nlohmann::ordered_json & delimiters);
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
+1 -1
View File
@@ -609,7 +609,7 @@ struct common_params {
bool cache_prompt = true; // whether to enable prompt caching
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
int32_t checkpoint_min_step = 8192; // minimum spacing between context checkpoints
int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
std::string hostname = "127.0.0.1";
+1 -3
View File
@@ -799,7 +799,6 @@ common_download_model_result common_download_model(const common_params_model &
bool download_mmproj = opts.download_mmproj;
bool download_mtp = opts.download_mtp;
bool preset_only = opts.preset_only;
bool is_hf = !model.hf_repo.empty();
if (is_hf) {
@@ -807,8 +806,7 @@ common_download_model_result common_download_model(const common_params_model &
if (!hf.preset.path.empty()) {
// if preset.ini exists, only download that file alone
tasks.push_back({hf.preset.url, hf.preset.local_path});
} else if (!preset_only) {
// only add other files if we're NOT in preset-only mode (normal run, non-router)
} else {
for (const auto & f : hf.model_files) {
tasks.push_back({f.url, f.local_path});
}
-1
View File
@@ -55,7 +55,6 @@ struct common_download_opts {
bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
bool download_mmproj = false;
bool download_mtp = false;
bool preset_only = false; // if true, only check & download remote preset (for router mode)
common_download_callback * callback = nullptr;
};
+324
View File
@@ -0,0 +1,324 @@
#include "json-partial.h"
#include "log.h"
#include <nlohmann/json.hpp>
#include <string>
#include <regex>
using json = nlohmann::ordered_json;
enum common_json_stack_element_type {
COMMON_JSON_STACK_ELEMENT_OBJECT,
COMMON_JSON_STACK_ELEMENT_KEY,
COMMON_JSON_STACK_ELEMENT_ARRAY,
};
struct common_json_stack_element {
common_json_stack_element_type type;
std::string key;
};
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out)
{
std::string::const_iterator it = input.begin();
const auto end = input.end();
return common_json_parse(it, end, healing_marker, out);
}
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out)
{
// // https://json.nlohmann.me/features/parsing/sax_interface/
struct json_error_locator : public nlohmann::json_sax<json> {
std::size_t position;
bool found_error;
std::string last_token;
std::string exception_message;
std::vector<common_json_stack_element> stack;
json_error_locator() : position(0), found_error(false) {}
bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT
this->position = position - 1;
this->found_error = true;
this->last_token = last_token;
this->exception_message = ex.what();
return false;
}
void close_value() {
if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) {
stack.pop_back();
}
}
bool null() override { // NOLINT
close_value();
return true;
}
bool boolean(bool) override { // NOLINT
close_value();
return true;
}
bool number_integer(number_integer_t) override { // NOLINT
close_value();
return true;
}
bool number_unsigned(number_unsigned_t) override { // NOLINT
close_value();
return true;
}
bool number_float(number_float_t, const string_t &) override { // NOLINT
close_value();
return true;
}
bool string(string_t &) override { // NOLINT
close_value();
return true;
}
bool binary(binary_t &) override { // NOLINT
close_value();
return true;
}
bool start_object(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""});
return true;
}
bool end_object() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT);
stack.pop_back();
close_value();
return true;
}
bool key(string_t & key) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key});
return true;
}
bool start_array(std::size_t) override { // NOLINT
stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""});
return true;
}
bool end_array() override {
GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY);
stack.pop_back();
close_value();
return true;
}
};
json_error_locator err_loc;
auto start = it;
json::sax_parse(it, end, &err_loc);
if (err_loc.found_error) {
it = start;
auto temptative_end = it + err_loc.position;
// LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str());
auto input = std::string(it, temptative_end);
try {
out.json = json::parse(input);
// out.json = json::parse(it, temptative_end);
it = temptative_end;
return true;
} catch (const std::exception & ex) {
// No, needs healing.
LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str());
}
auto can_parse = [](const std::string & str) {
try {
auto _ = json::parse(str); // NOLINT
return true;
} catch (const std::exception &) {
return false;
}
};
if (!healing_marker.empty() && !err_loc.stack.empty()) {
std::string str(it, temptative_end);
auto last_non_sp_pos = str.find_last_not_of(" \n\r\t");
if (last_non_sp_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
auto last_non_sp_char = str[last_non_sp_pos];
// Used to detect stops on a number, which may not be complete.
auto was_maybe_number = [&]() {
if (!str.empty() && std::isspace(str.back())) {
return false;
}
return std::isdigit(last_non_sp_char) ||
last_non_sp_char == '.' ||
last_non_sp_char == 'e' ||
last_non_sp_char == 'E' ||
last_non_sp_char == '-';
};
std::string closing;
for (size_t i = err_loc.stack.size(); i > 0; i--) {
auto & el = err_loc.stack[i - 1];
if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
closing += "}";
} else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
closing += "]";
} else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) {
throw std::runtime_error("Unexpected stack element type");
}
}
// Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
auto is_high_surrogate = [&](const std::string & s) {
// Check if a partial of a high surrogate (U+D800-U+DBFF)
return s.length() >= 4 &&
s[0] == '\\' && s[1] == 'u' &&
std::tolower(s[2]) == 'd' &&
(s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
};
// Initialize the unicode marker to a low surrogate to handle the edge case
// where a high surrogate (U+D800-U+DBFF) is immediately followed by a
// backslash (\)
std::string unicode_marker_padding = "udc00";
std::smatch last_unicode_seq;
if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
std::smatch second_last_seq;
std::string prelude = str.substr(0, last_unicode_seq.position());
// Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
if (is_high_surrogate(last_unicode_seq.str())) {
// If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
unicode_marker_padding += "\\udc00";
} else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
if (is_high_surrogate(second_last_seq.str())) {
// If this follows a high surrogate, pad it to be a low surrogate
if (last_unicode_seq.length() == 2) {
unicode_marker_padding = "dc00";
} else if (last_unicode_seq.length() == 3) {
unicode_marker_padding = "c00";
} else {
// The original unicode_marker_padding is already padded with 0s
}
}
}
}
const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
// We're inside an object value
if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) {
// Was about to create an object value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + ": 1" + closing)) {
str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing;
} else if (last_non_sp_char == '{' && can_parse(str + closing)) {
// Was about to create an object
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an object value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an object value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
// Was inside an object value string after a partial unicode escape
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
} else {
// find last :
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location");
}
// Cutting back to opening : for object value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) {
if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) {
// Was about to create an array value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
} else if (can_parse(str + "\"" + closing)) {
// Was inside an array value string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
// Was inside an array value string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
} else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
// Was inside an array value string after a partial unicode escape
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
} else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
// Had just finished a value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
} else {
auto last_pos = str.find_last_of("[,");
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location");
}
// Cutting back to last [ or , for array value
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) {
if ((last_non_sp_char == '{' && can_parse(str + closing)) ||
(last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing;
} else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) {
// Was about to create an object key+value
str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + "\": 1" + closing)) {
// Was inside an object key string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing;
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
// Was inside an object key string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
} else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
// Was inside an object key string after a partial unicode escape
str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
} else {
auto last_pos = str.find_last_of(':');
if (last_pos == std::string::npos) {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "Cutting back to last : for object key+value\n");
str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing;
}
} else {
throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location");
}
// fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str());
out.json = json::parse(str);
it = temptative_end;
return true;
}
// handle unclosed top-level primitive
if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
std::string str(it, temptative_end);
const auto & magic_seed = out.healing_marker.marker = healing_marker;
if (can_parse(str + "\"")) {
// Was inside an string
str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
} else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
// Was inside an string after an escape
str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
} else {
// TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
// fprintf(stderr, "Closing: TODO\n");
return false;
}
out.json = json::parse(str);
it = temptative_end;
return true;
}
return false;
}
out.json = json::parse(it, end);
it = end;
return true;
}
+39
View File
@@ -0,0 +1,39 @@
#pragma once
// TODO: use json_fwd.hpp when possible
#include <nlohmann/json.hpp>
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
struct common_healing_marker {
// Raw marker.
std::string marker;
// Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
std::string json_dump_marker;
};
// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
struct common_json {
nlohmann::ordered_json json;
common_healing_marker healing_marker;
};
// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
//
// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
//
// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
bool common_json_parse(
const std::string & input,
const std::string & healing_marker,
common_json & out);
// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
bool common_json_parse(
std::string::const_iterator & it,
const std::string::const_iterator & end,
const std::string & healing_marker,
common_json & out);
-6
View File
@@ -46,7 +46,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"DbrxForCausalLM": "dbrx",
"DeciLMForCausalLM": "deci",
"DeepseekForCausalLM": "deepseek",
"DeepseekOCRForCausalLM": "deepseek",
"DeepseekV2ForCausalLM": "deepseek",
"DeepseekV3ForCausalLM": "deepseek",
"DeepseekV32ForCausalLM": "deepseek",
@@ -97,7 +96,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"GraniteMoeHybridForCausalLM": "granite",
"GraniteMoeSharedForCausalLM": "granite",
"GraniteSpeechForConditionalGeneration": "granite",
"GraniteSpeechPlusForConditionalGeneration": "granite",
"Grok1ForCausalLM": "grok",
"GrokForCausalLM": "grok",
"GroveMoeForCausalLM": "grovemoe",
@@ -125,7 +123,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"LLaDAModelLM": "llada",
"LLaMAForCausalLM": "llama",
"Lfm25AudioTokenizer": "lfm2",
"Lfm2BidirectionalModel": "lfm2",
"Lfm2ForCausalLM": "lfm2",
"Lfm2Model": "lfm2",
"Lfm2MoeForCausalLM": "lfm2",
@@ -234,7 +231,6 @@ TEXT_MODEL_MAP: dict[str, str] = {
"UMT5ForConditionalGeneration": "t5",
"UMT5Model": "t5",
"UltravoxModel": "ultravox",
"UnlimitedOCRForCausalLM": "deepseek",
"VLlama3ForCausalLM": "llama",
"VoxtralForConditionalGeneration": "llama",
"WavTokenizerDec": "wavtokenizer",
@@ -265,7 +261,6 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
"GlmasrModel": "ultravox",
"Granite4VisionForConditionalGeneration": "granite",
"GraniteSpeechForConditionalGeneration": "granite",
"GraniteSpeechPlusForConditionalGeneration": "granite",
"HunYuanVLForConditionalGeneration": "hunyuan",
"Idefics3ForConditionalGeneration": "smolvlm",
"InternVisionModel": "internvl",
@@ -301,7 +296,6 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
"StepVLForConditionalGeneration": "step3",
"Step3p7ForConditionalGeneration": "step3",
"UltravoxModel": "ultravox",
"UnlimitedOCRForCausalLM": "deepseek",
"VoxtralForConditionalGeneration": "ultravox",
"YoutuVLForConditionalGeneration": "youtuvl",
}
+2 -10
View File
@@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger
from .qwen import QwenModel
@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -205,8 +205,6 @@ class DeepseekModel(TextModel):
@ModelBase.register(
"DeepseekV2ForCausalLM",
"DeepseekV3ForCausalLM",
"DeepseekOCRForCausalLM",
"UnlimitedOCRForCausalLM",
"KimiVLForConditionalGeneration",
"KimiK25ForConditionalGeneration",
"YoutuForCausalLM",
@@ -226,7 +224,7 @@ class DeepseekV2Model(TextModel):
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# special handling for Deepseek OCR
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
@@ -352,12 +350,6 @@ class DeepseekV2Model(TextModel):
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
# Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
if is_ocr:
sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
if sliding_window:
self.gguf_writer.add_sliding_window(sliding_window)
if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
# [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
# note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
-28
View File
@@ -348,34 +348,6 @@ class GraniteSpeechMmprojModel(MmprojModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("GraniteSpeechPlusForConditionalGeneration")
class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel):
"""Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation"""
has_vision_encoder = False
has_audio_encoder = True
def set_gguf_parameters(self):
assert self.hparams_audio is not None
super().set_gguf_parameters()
# Add feature_layer if present in encoder config
if feature_layers := self.hparams_audio.get("cat_hidden_layers"):
self.gguf_writer.add_audio_feature_layers(feature_layers)
logger.info(f"gguf: audio feature_layers = {feature_layers}")
# Validate projector dimension matches concatenated encoder output
hidden_dim = self.hparams_audio["hidden_dim"]
expected_dim = hidden_dim * (len(feature_layers) + 1)
projector_dim = self.global_config["projector_config"]["encoder_hidden_size"]
if projector_dim != expected_dim:
raise ValueError(
f"Projector encoder_hidden_size ({projector_dim}) does not match "
f"expected concatenated dimension ({expected_dim}). "
f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}"
)
@ModelBase.register("Granite4VisionForConditionalGeneration")
class Granite4VisionMmprojModel(MmprojModel):
has_vision_encoder = True
+3 -10
View File
@@ -64,17 +64,11 @@ class LFM2Model(TextModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel")
@ModelBase.register("Lfm2Model")
class LFM2ColBertModel(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2
dense_tensor_name = "dense_2"
def set_gguf_parameters(self):
super().set_gguf_parameters()
if self.hf_arch == "Lfm2BidirectionalModel":
self.gguf_writer.add_causal_attention(False)
self._try_set_pooling_type()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith(self.dense_tensor_name):
name = "model." + name
@@ -82,11 +76,10 @@ class LFM2ColBertModel(LFM2Model):
yield from super().modify_tensors(data_torch, name, bid)
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# optional dense tensor is stored in a separate safetensors file
# dense tensor is stored in a separate safetensors file
from safetensors.torch import load_file
tensors_file = self.dir_model / "1_Dense" / "model.safetensors"
if not tensors_file.is_file():
return
assert tensors_file.is_file()
tensor = load_file(tensors_file)["linear.weight"]
self.gguf_writer.add_embedding_length_out(tensor.shape[0])
yield f"{self.dense_tensor_name}.weight", tensor.clone()
@@ -24,6 +24,7 @@
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_OPENSSL": "OFF"
}
},
@@ -46,6 +47,7 @@
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_OPENSSL": "OFF"
}
},
@@ -71,6 +73,7 @@
"GGML_LLAMAFILE": "OFF",
"GGML_OPENCL": "OFF",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_OPENSSL": "OFF"
}
},
+1
View File
@@ -266,6 +266,7 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
"ggml: OpenCL API version to target")
option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
+23 -50
View File
@@ -3688,6 +3688,8 @@ static void ggml_compute_forward_norm_f32(
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
@@ -3701,49 +3703,25 @@ static void ggml_compute_forward_norm_f32(
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
const float * xf = (const float *) x;
float sum = 0.0;
ggml_vec_sum_f32(ne00, &sum, x);
float mean = sum/ne00;
float sum = 0.0;
ggml_vec_sum_f32(ne00, &sum, xf);
float mean = sum/ne00;
float * yf = (float *) y;
float variance = 0;
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
float variance = 0;
#ifdef GGML_USE_ACCELERATE
mean = -mean;
vDSP_vsadd(xf, 1, &mean, yf, 1, ne00);
vDSP_measqv(yf, 1, &variance, ne00);
mean = -mean;
vDSP_vsadd(x, 1, &mean, y, 1, ne00);
vDSP_measqv(y, 1, &variance, ne00);
#else
variance = ggml_vec_cvar_f32(ne00, yf, xf, mean);
variance = ggml_vec_cvar_f32(ne00, y, x, mean);
#endif //GGML_USE_ACCELERATE
const float scale = 1.0f/sqrtf(variance + eps);
ggml_vec_scale_f32(ne00, yf, scale);
} else {
float sum = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
sum += *(const float *) (x + i00*nb00);
}
const float mean = sum/ne00;
float variance = 0.0f;
for (int64_t i00 = 0; i00 < ne00; i00++) {
const float v = *(const float *) (x + i00*nb00) - mean;
*(float *) (y + i00*nb0) = v;
variance += v * v;
}
variance /= ne00;
const float scale = 1.0f/sqrtf(variance + eps);
for (int64_t i00 = 0; i00 < ne00; i00++) {
*(float *) (y + i00*nb0) *= scale;
}
}
const float scale = 1.0f/sqrtf(variance + eps);
ggml_vec_scale_f32(ne00, y, scale);
}
}
}
@@ -4164,6 +4142,8 @@ static void ggml_compute_forward_l2_norm_f32(
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
@@ -4178,27 +4158,20 @@ static void ggml_compute_forward_l2_norm_f32(
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
ggml_float sum = 0.0;
for (int64_t i00 = 0; i00 < ne00; i00++) {
const float xi = *(const float *) (x + i00*nb00);
sum += (ggml_float)(xi * xi);
sum += (ggml_float)(x[i00] * x[i00]);
}
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
memcpy(y, x, ne00 * sizeof(float));
const float scale = 1.0f/fmaxf(sqrtf(sum), eps);
char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3;
if (nb00 == sizeof(float) && nb0 == sizeof(float)) {
memcpy(y, x, ne00 * sizeof(float));
ggml_vec_scale_f32(ne00, (float *) y, scale);
} else {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const float xi = *(const float *) (x + i00*nb00);
*(float *) (y + i00*nb0) = xi * scale;
}
}
ggml_vec_scale_f32(ne00, y, scale);
}
}
}
+1 -1
View File
@@ -5334,7 +5334,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_L2_NORM:
return ggml_is_contiguous_rows(op->src[0]);
return true;
case GGML_OP_RMS_NORM_BACK:
return ggml_is_contiguous(op->src[0]);
break;
+4
View File
@@ -25,6 +25,7 @@ include(ExternalProject)
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF)
set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
add_library(htp_iface OBJECT
${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)
@@ -71,12 +72,15 @@ function(build_htp_skel V)
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
-DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
-DDSP_VERSION=${V}
-DPREBUILT_LIB_DIR="toolv19_${V}")
list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
endfunction()
build_htp_skel(v68)
build_htp_skel(v69)
build_htp_skel(v73)
build_htp_skel(v75)
build_htp_skel(v79)
File diff suppressed because it is too large Load Diff
+50 -156
View File
@@ -5,12 +5,10 @@
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include <algorithm>
#include <string>
#include <vector>
#include <stdio.h>
#include "htp-ops.h"
#include "htp/matmul-ops.h"
struct htp_opnode {
ggml_tensor * node = nullptr;
@@ -19,13 +17,6 @@ struct htp_opnode {
htp_op_code opcode = HTP_OP_INVALID;
std::vector<ggml_tensor *> extra_dsts;
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS] = {0};
htp_opnode(ggml_tensor * node = nullptr, std::vector<ggml_tensor *> fused = {}, htp_op_code opcode = HTP_OP_INVALID, std::vector<ggml_tensor *> extra_dsts = {})
: node(node), fused(std::move(fused)), opcode(opcode), extra_dsts(std::move(extra_dsts)) {}
ggml_op op() const {
return node->op;
}
@@ -34,26 +25,6 @@ struct htp_opnode {
return fused.empty() ? node : fused.back();
}
void add_fused(ggml_tensor * t, bool extra_dst = false) {
fused.push_back(t);
if (extra_dst) {
extra_dsts.push_back(t);
}
}
std::vector<const ggml_tensor *> get_outputs() const {
std::vector<const ggml_tensor *> res;
if (extra_dsts.empty()) {
res.push_back(dst());
} else {
res.push_back(node);
for (const auto * x : extra_dsts) {
res.push_back(x);
}
}
return res;
}
const ggml_tensor * src0() const {
return node->src[0];
}
@@ -66,6 +37,10 @@ struct htp_opnode {
return ggml_op_is_empty(node->op);
}
void add_fused(ggml_tensor * t) {
fused.push_back(t);
}
bool stackable() const {
switch (this->op()) {
case GGML_OP_MUL_MAT:
@@ -156,117 +131,87 @@ struct htp_opformat {
char types[16 * GGML_MAX_SRC];
char buffs[64 * GGML_MAX_SRC];
char names[64 * GGML_MAX_SRC];
char kparams[128];
int format_tensor_dims(char * str, size_t max_size, const struct ggml_tensor * t) {
int format_tensor_dims(char * str, const struct ggml_tensor * t) {
if (!t) {
return snprintf(str, max_size, "NONE");
return sprintf(str, "NONE");
}
if (t->ne[2] == 1 && t->ne[3] == 1) {
return snprintf(str, max_size, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
} else {
return snprintf(str, max_size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
}
}
void format_op_dims(char * str, size_t max_size, const htp_opnode & node) {
void format_op_dims(char * str, const htp_opnode & node) {
char * p = str;
char * p_end = str + max_size;
auto inputs = node.get_inputs();
if (!inputs.empty()) {
p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[0]), (size_t)(p_end - p));
p += format_tensor_dims(p, inputs[0]);
for (size_t i = 1; i < inputs.size(); i++) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
}
if (p < p_end) {
p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[i]), (size_t)(p_end - p));
}
p += sprintf(p, " x ");
p += format_tensor_dims(p, inputs[i]);
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
}
p += sprintf(p, " -> ");
}
char self[64];
format_tensor_dims(self, sizeof(self), node.dst());
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
}
format_tensor_dims(self, node.dst());
p += sprintf(p, "%s", self);
}
int format_tensor_strides(char * str, size_t max_size, const struct ggml_tensor * t) {
int format_tensor_strides(char * str, const struct ggml_tensor * t) {
if (!t) {
return snprintf(str, max_size, "NONE");
return sprintf(str, "NONE");
}
const char * c = ggml_is_contiguous(t) ? "" : "!";
if (t->ne[2] == 1 && t->ne[3] == 1) {
return snprintf(str, max_size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
} else {
return snprintf(str, max_size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
}
}
void format_op_strides(char * str, size_t max_size, const htp_opnode & node) {
void format_op_strides(char * str, const htp_opnode & node) {
char * p = str;
char * p_end = str + max_size;
auto inputs = node.get_inputs();
if (!inputs.empty()) {
p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[0]), (size_t)(p_end - p));
p += format_tensor_strides(p, inputs[0]);
for (size_t i = 1; i < inputs.size(); i++) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
}
if (p < p_end) {
p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[i]), (size_t)(p_end - p));
}
p += sprintf(p, " x ");
p += format_tensor_strides(p, inputs[i]);
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
}
p += sprintf(p, " -> ");
}
char self[64];
format_tensor_strides(self, sizeof(self), node.dst());
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p));
}
format_tensor_strides(self, node.dst());
p += sprintf(p, "%s", self);
}
void format_op_types(char * str, size_t max_size, const htp_opnode & node) {
void format_op_types(char * str, const htp_opnode & node) {
char * p = str;
char * p_end = str + max_size;
auto inputs = node.get_inputs();
if (!inputs.empty()) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE"), (size_t)(p_end - p));
}
p += sprintf(p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE");
for (size_t i = 1; i < inputs.size(); i++) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE"), (size_t)(p_end - p));
}
p += sprintf(p, " x ");
p += sprintf(p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE");
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
}
p += sprintf(p, " -> ");
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", ggml_type_name(node.dst()->type)), (size_t)(p_end - p));
}
p += sprintf(p, "%s", ggml_type_name(node.dst()->type));
}
const char * tensor_buff_name(const struct ggml_tensor * t) {
@@ -276,102 +221,51 @@ struct htp_opformat {
return "NONE";
}
void format_op_buffs(char * str, size_t max_size, const htp_opnode & node) {
void format_op_buffs(char * str, const htp_opnode & node) {
char * p = str;
char * p_end = str + max_size;
auto inputs = node.get_inputs();
if (!inputs.empty()) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[0])), (size_t)(p_end - p));
}
p += sprintf(p, "%s", tensor_buff_name(inputs[0]));
for (size_t i = 1; i < inputs.size(); i++) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[i])), (size_t)(p_end - p));
}
p += sprintf(p, " x ");
p += sprintf(p, "%s", tensor_buff_name(inputs[i]));
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
}
p += sprintf(p, " -> ");
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(node.dst())), (size_t)(p_end - p));
}
p += sprintf(p, "%s", tensor_buff_name(node.dst()));
}
void format_op_names(char * str, size_t max_size, const htp_opnode & node) {
void format_op_names(char * str, const htp_opnode & node) {
char * p = str;
char * p_end = str + max_size;
auto inputs = node.get_inputs();
if (!inputs.empty()) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? inputs[0]->name : "NONE"), (size_t)(p_end - p));
}
p += sprintf(p, "%s", inputs[0] ? inputs[0]->name : "NONE");
for (size_t i = 1; i < inputs.size(); i++) {
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p));
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? inputs[i]->name : "NONE"), (size_t)(p_end - p));
}
p += sprintf(p, " x ");
p += sprintf(p, "%s", inputs[i] ? inputs[i]->name : "NONE");
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p));
}
p += sprintf(p, " -> ");
}
if (p < p_end) {
p += std::min((size_t)snprintf(p, p_end - p, "%s", node.dst()->name), (size_t)(p_end - p));
}
}
void format_kernel_params(char * str, size_t max_size, const htp_opnode & node) {
if (node.opcode == HTP_OP_MUL_MAT || node.opcode == HTP_OP_MUL_MAT_ID ||
node.opcode == HTP_OP_MUL_MAT_QKV || node.opcode == HTP_OP_MUL_MAT_FFN) {
const auto * kparams = (const struct htp_mm_kernel_params *) node.kernel_params;
const char * path = "unknown";
int32_t type = kparams->kernel_type;
if (type == HTP_MM_KERNEL_HMX_2D || type == HTP_MM_KERNEL_HMX_F16_BATCHED) {
path = "hmx-tiled";
} else if (type == HTP_MM_KERNEL_HVX_F16_F16_VTCM || type == HTP_MM_KERNEL_HVX_F32_F32_VTCM ||
type == HTP_MM_KERNEL_HVX_QUANT_ROW || type == HTP_MM_KERNEL_HVX_QUANT_BLOCK) {
path = "hvx-tiled";
} else if (type == HTP_MM_KERNEL_HVX_F16_F16_DDR || type == HTP_MM_KERNEL_HVX_F16_F32_DDR ||
type == HTP_MM_KERNEL_HVX_F32_F32_DDR || type == HTP_MM_KERNEL_HVX_F32_F16_DDR ||
type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) {
path = "hvx-flat";
}
snprintf(str, max_size, "%s vtcm %d", path, (int) kparams->vtcm_size);
} else {
snprintf(str, max_size, "----");
}
p += sprintf(p, "%s", node.dst()->name);
}
void format(const htp_opnode & node) {
format_op_dims(dims, sizeof(dims), node);
format_op_strides(strides, sizeof(strides), node);
format_op_types(types, sizeof(types), node);
format_op_buffs(buffs, sizeof(buffs), node);
format_op_names(names, sizeof(names), node);
format_kernel_params(kparams, sizeof(kparams), node);
format_op_dims(dims, node);
format_op_strides(strides, node);
format_op_types(types, node);
format_op_buffs(buffs, node);
format_op_names(names, node);
}
htp_opformat() {
strides[0] = '\0';
dims[0] = '\0';
types[0] = '\0';
buffs[0] = '\0';
names[0] = '\0';
kparams[0] = '\0';
}
htp_opformat() {}
htp_opformat(const htp_opnode & node) { format(node); }
};
+38 -14
View File
@@ -19,9 +19,43 @@ add_library(${HTP_LIB} SHARED
htp_iface_skel.c
worker-pool.c
hex-dma.c
hmx-queue.c
flash-attn-ops.c
hmx-flash-attn-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
if (GGML_HEXAGON_FA_EXP2_HF)
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
endif()
# HMX acceleration: available on v73+ architectures
set(HTP_HMX_VERSIONS v73 v75 v79 v81)
list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
if (_hmx_idx GREATER_EQUAL 0)
target_sources(${HTP_LIB} PRIVATE
hmx-flash-attn-ops.c
hmx-matmul-ops.c
hmx-queue.c
)
# -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
set_source_files_properties(
hmx-flash-attn-ops.c
hmx-matmul-ops.c
hmx-queue.c
PROPERTIES COMPILE_OPTIONS "-mhmx"
)
target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1)
endif()
build_idl(htp_iface.idl ${HTP_LIB})
target_sources(${HTP_LIB} PRIVATE
matmul-ops.c
binary-ops.c
unary-ops.c
@@ -29,6 +63,7 @@ add_library(${HTP_LIB} SHARED
softmax-ops.c
act-ops.c
rope-ops.c
flash-attn-ops.c
set-rows-ops.c
get-rows-ops.c
cpy-ops.c
@@ -44,17 +79,6 @@ add_library(${HTP_LIB} SHARED
pad-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>)
if (GGML_HEXAGON_FA_EXP2_HF)
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
endif()
build_idl(htp_iface.idl ${HTP_LIB})
set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)
install(TARGETS ${HTP_LIB})
+15 -13
View File
@@ -3,7 +3,7 @@ if (HEXAGON_TOOLCHAIN_INCLUDED)
endif()
set(HEXAGON_TOOLCHAIN_INCLUDED true)
# Cross Compiling for Hexagon
#Cross Compiling for Hexagon
set(HEXAGON TRUE)
set(CMAKE_SYSTEM_NAME QURT)
set(CMAKE_SYSTEM_PROCESSOR Hexagon)
@@ -14,6 +14,7 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CUSTOM_RUNELF_PATH "")
#To fix backward compatibility with EAI addon.
if (NOT HEXAGON_SDK_ROOT)
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
endif()
@@ -30,6 +31,7 @@ endif()
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
#Get the Binary extension of the Hexagon Toolchain
if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
endif()
@@ -46,12 +48,12 @@ set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
HEXAGON_TOOLS_ROOT
)
# QURT Related includes and linker flags
#QURT Related includes and linker flags
set(V_ARCH ${HEXAGON_ARCH})
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}")
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")
if (${TREE} MATCHES PAKMAN)
if( ${TREE} MATCHES PAKMAN )
set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
endif()
message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")
@@ -81,9 +83,11 @@ set(QURT_START_LINK_LIBS
)
STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")
set(QURT_END_LINK_LIBS ${TARGET_DIR}/fini.o)
set(QURT_END_LINK_LIBS
${TARGET_DIR}/fini.o
)
# Non QURT related includes and linker flags
#Non QURT related includes and linker flags
set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")
@@ -95,10 +99,8 @@ if (NOT NO_WRAP_MEM_API)
set(WRAP_MEMALIGN -Wl,--wrap=memalign)
endif()
set(ARCH_FLAGS "-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -mhmx")
set(PIC_SHARED_LD_FLAGS
${ARCH_FLAGS}
-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH}
-G0
-fpic
-Wl,-Bsymbolic
@@ -118,13 +120,13 @@ STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")
set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")
# System include paths
#System include paths
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)
# LLVM toolchain setup
# Compiler paths, options and architecture
#LLVM toolchain setup
#Compiler paths, options and architecture
set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})
@@ -135,8 +137,8 @@ set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)
set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,")
set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")
# Compiler Options
set(COMMON_FLAGS "${ARCH_FLAGS} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
#Compiler Options
set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
+3 -2
View File
@@ -18,8 +18,7 @@
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-ops.h"
int hmx_flash_attn_ext(struct htp_ops_context * octx);
#include "hmx-ops.h"
// Must be multiple of 32
#define FLASH_ATTN_BLOCK_SIZE (32 * 2)
@@ -634,6 +633,7 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
return HTP_STATUS_NO_SUPPORT;
}
#ifdef HTP_HAS_HMX
// HMX path: head_dim multiple of 64, F16 KV, and no sinks
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 64 == 0 && v->ne[0] % 64 == 0 && octx->src[4] == NULL) {
int ret = hmx_flash_attn_ext(octx);
@@ -642,6 +642,7 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
}
// VTCM too small or other failure -> fall through to HVX path
}
#endif
struct htp_fa_context factx;
factx.octx = octx;
-80
View File
@@ -1,80 +0,0 @@
#ifndef HEX_COMMON_H
#define HEX_COMMON_H
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#ifndef SIZE_MAX
#define SIZE_MAX ((size_t)-1)
#endif
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
static inline uint32_t hex_ceil_pow2(uint32_t x) {
if (x <= 1) { return 1; }
int p = 2;
x--;
while (x >>= 1) { p <<= 1; }
return p;
}
static inline size_t hmx_ceil_div(size_t num, size_t den) {
return (num + den - 1) / den;
}
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
return ((size_t) addr & (align - 1)) == 0;
}
static inline size_t hex_align_up(size_t v, size_t align) {
return hmx_ceil_div(v, align) * align;
}
static inline size_t hex_align_down(size_t v, size_t align) {
return (v / align) * align;
}
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
uint32_t left_off = (size_t) addr & (chunk_size - 1);
uint32_t right_off = left_off + n;
return right_off <= chunk_size;
}
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
return m * ((n + m - 1) / m);
}
static inline size_t hex_smin(size_t a, size_t b) {
return a < b ? a : b;
}
static inline size_t hex_smax(size_t a, size_t b) {
return a > b ? a : b;
}
static inline void hex_swap_ptr(void ** p1, void ** p2) {
void * t = *p1;
*p1 = *p2;
*p2 = t;
}
static inline bool hex_mul_overflow(size_t a, size_t b, size_t *out) {
if (a != 0 && b > SIZE_MAX / a) return true;
*out = a * b;
return false;
}
static inline bool hex_add_overflow(size_t a, size_t b, size_t *out) {
if (a > SIZE_MAX - b) return true;
*out = a + b;
return false;
}
#endif // HEX_COMMON_H
+5 -1
View File
@@ -5,7 +5,6 @@
#include <hexagon_types.h>
#include <stdbool.h>
#include <stdint.h>
#include "hex-utils.h"
#include "hex-profile.h"
@@ -128,8 +127,13 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src)
return p;
}
#if __HVX_ARCH__ < 73
static const uint32_t dma_src_l2_bypass_on = 1;
static const uint32_t dma_dst_l2_bypass_on = 0;
#else
static const uint32_t dma_src_l2_bypass_on = 1;
static const uint32_t dma_dst_l2_bypass_on = 1;
#endif
static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
+56 -1
View File
@@ -11,7 +11,14 @@
#include "hex-fastdiv.h"
#include "hex-dump.h"
#include "hex-common.h"
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
static inline uint64_t hex_get_cycles() {
uint64_t cycles = 0;
@@ -25,6 +32,54 @@ static inline uint64_t hex_get_pktcnt() {
return pktcnt;
}
static inline uint32_t hex_ceil_pow2(uint32_t x) {
if (x <= 1) { return 1; }
int p = 2;
x--;
while (x >>= 1) { p <<= 1; }
return p;
}
static inline size_t hmx_ceil_div(size_t num, size_t den) {
return (num + den - 1) / den;
}
static inline int32_t hex_is_aligned(const void * addr, uint32_t align) {
return ((size_t) addr & (align - 1)) == 0;
}
static inline size_t hex_align_up(size_t v, size_t align) {
return hmx_ceil_div(v, align) * align;
}
static inline size_t hex_align_down(size_t v, size_t align) {
return (v / align) * align;
}
static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
uint32_t left_off = (size_t) addr & (chunk_size - 1);
uint32_t right_off = left_off + n;
return right_off <= chunk_size;
}
static inline uint32_t hex_round_up(uint32_t n, uint32_t m) {
return m * ((n + m - 1) / m);
}
static inline size_t hex_smin(size_t a, size_t b) {
return a < b ? a : b;
}
static inline size_t hex_smax(size_t a, size_t b) {
return a > b ? a : b;
}
static inline void hex_swap_ptr(void ** p1, void ** p2) {
void * t = *p1;
*p1 = *p2;
*p2 = t;
}
static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
Q6_l2fetch_AP((void *) p, control);
+13 -13
View File
@@ -49,7 +49,7 @@
// g_br = hex_align_up(gqa_factor * Br, 32) replaces Br for all Q/O/S/P/D dimensions.
// Layout: Q + O_ping + O_pong + K_dma*2 + V_dma*2 + K_tile + V_tile + S + P + D + vectors + scales
// Mask is DMA'd into a VTCM buffer (Br rows per KV block) to avoid DDR reads in softmax.
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool pipeline) {
static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool use_pipeline) {
const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS);
const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK]
const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong
@@ -70,7 +70,7 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV,
+ k_dma_size * 2 // K DMA x2
+ v_dma_size * 2 // V DMA x2
+ k_tile_size * 1 // K tiles
+ v_tile_size * (pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
+ v_tile_size * (use_pipeline ? 2 : 1) // V tiles (double-buffered if pipelining)
+ s_tile_size * 2 // S + P
+ d_tile_size * 1 // D (diagonal matrix)
+ col_vec_size * 4 // m_vec, l_vec, s_rowmax, p_rowsum
@@ -290,7 +290,7 @@ static const int16_t d_tile_scatter_offsets[64] __attribute__((aligned(128))) =
struct hmx_fa_context {
const struct htp_ops_context * octx;
bool pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
bool use_pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2
uint32_t n_threads;
// Op parameters
@@ -409,7 +409,7 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data)
return;
}
__fp16 * v_tiles_dest = factx->pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
__fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0];
struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL;
htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start);
@@ -1312,13 +1312,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
const size_t g_br = hex_align_up(G * Br, HMX_FP16_TILE_N_ROWS);
const uint32_t n_kv_blocks = (nek1 + Bc - 1) / Bc;
const bool pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
const bool use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2);
// Bypass thread pool dispatch for small prompts/non-pipelined prefill by setting n_threads = 1
const uint32_t n_threads = pipeline ? n_threads_init : 1;
const uint32_t n_threads = use_pipeline ? n_threads_init : 1;
FARF(HIGH, "hmx-fa: neq1=%u nek1=%u DK=%u DV=%u G=%u Br=%zu Bc=%zu g_br=%zu n_kv_blocks=%u pipeline=%d vtcm=%zu",
neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, pipeline, vtcm_budget);
neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, use_pipeline, vtcm_budget);
// ======== Build context ========
struct hmx_fa_context factx;
@@ -1339,7 +1339,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
factx.n_kv_blocks = n_kv_blocks;
factx.is_q_fp32 = (q->type == HTP_TYPE_F32);
factx.is_dst_fp32 = (dst->type == HTP_TYPE_F32);
factx.pipeline = pipeline;
factx.use_pipeline = use_pipeline;
factx.mask_broadcast = (mask != NULL && mask->ne[2] == 1);
// Extract op parameters (mutable during softcap adjustment, then stored as const in factx)
@@ -1405,7 +1405,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
factx.vtcm_v_fp16[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes);
factx.vtcm_k_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, k_tile_bytes);
factx.vtcm_v_tiles[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
if (pipeline) {
if (use_pipeline) {
factx.vtcm_v_tiles[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes);
} else {
factx.vtcm_v_tiles[1] = NULL;
@@ -1456,7 +1456,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
// ======== HMX lock strategy ========
// Pipeline: queue thread auto-acquires HMX lock on first push; released by suspend.
// Fallback: main thread holds the lock (original behavior).
if (!factx.pipeline) {
if (!factx.use_pipeline) {
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
}
@@ -1550,7 +1550,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
const size_t k_src_stride = size_k_row_padded / sizeof(__fp16);
const size_t v_src_stride = size_v_row_padded / sizeof(__fp16);
if (factx.pipeline) {
if (factx.use_pipeline) {
// ==================================================================
// Pipeline path: HVX phases ‖ HMX queue worker
// ==================================================================
@@ -1780,7 +1780,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br);
// HMX: O_final = diag(1/l) @ O_prev
if (factx.pipeline) {
if (factx.use_pipeline) {
on_job.o_curr = o_tile_curr;
on_job.o_prev = o_tile_prev;
on_job.d_tiles = factx.vtcm_d_tiles;
@@ -1826,7 +1826,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) {
} // end KV head loop
} // end batch loop
if (factx.pipeline) {
if (factx.use_pipeline) {
hmx_queue_suspend(ctx->hmx_queue);
} else {
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+6
View File
@@ -0,0 +1,6 @@
// HMX operations compiled as a single translation unit.
// This allows interprocedural optimizations within HMX ops without requiring global HTP LTO.
#include "hmx-queue.c"
#include "hmx-matmul-ops.c"
#include "hmx-flash-attn-ops.c"
+88
View File
@@ -0,0 +1,88 @@
// HMX operation entry-point declarations.
// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib)
#ifndef HMX_OPS_H
#define HMX_OPS_H
#include <stddef.h>
#include <stdint.h>
#include "htp-ops.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
float *dst;
const float *activation;
const __fp16 *permuted_weight;
int m;
int k;
int n;
int act_stride;
int weight_stride;
int dst_stride;
int ne02;
int ne03;
int ne12;
int ne13;
size_t src0_nb2;
size_t src0_nb3;
size_t src1_nb2;
size_t src1_nb3;
size_t dst_nb2;
size_t dst_nb3;
} hmx_matmul_f16_f32_batched_params_t;
// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output
// act_stride: activation row stride in elements (= k for contiguous, or
// nb[1]/sizeof(float) for permuted tensors like attention Q).
// weight_stride: weight row stride in elements (= k for compact weights, or
// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK).
int hmx_matmul_f16_f32(struct htp_context *ctx,
float *restrict dst,
const float *activation,
const __fp16 *permuted_weight,
int m, int k, int n,
int act_stride,
int weight_stride);
// Batched F16 wrapper over hmx_mat_mul_f16_f32.
// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3.
int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params);
// HMX matrix multiplication — all supported weight types (F16/F32/Q4_0/Q4_1/Q8_0/IQ4_NL/MXFP4)
int hmx_matmul_2d_f32(struct htp_context *ctx,
float *restrict dst,
const float *activation,
const uint8_t *permuted_weight,
int m, int k, int n,
int act_stride,
int weight_stride,
int weight_type);
struct mmid_row_mapping;
int hmx_matmul_id_2d_f32(struct htp_context *ctx,
float *restrict dst,
const float *activation,
const uint8_t *permuted_weight,
int m, int k, int n,
int ne11,
size_t act_nb1, size_t act_nb2,
size_t dst_nb1, size_t dst_nb2,
int weight_stride,
int weight_type,
const struct mmid_row_mapping *matrix_rows,
int cur_a,
int mapping_stride);
// HMX flash attention
int hmx_flash_attn_ext(struct htp_ops_context * octx);
#ifdef __cplusplus
}
#endif
#endif // HMX_OPS_H
+3 -9
View File
@@ -13,9 +13,7 @@
#include <stdint.h>
#include <stdbool.h>
#ifndef HTP_MAX_NTHREADS
#define HTP_MAX_NTHREADS 10
#endif
#define HTP_MAX_MMAPS 16
// Memory mapping
@@ -44,13 +42,9 @@ struct htp_ops_context {
enum htp_op_code op; // FIXME: rename to opcode
int32_t op_params[HTP_OP_MAX_PARAMS];
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS];
const struct htp_tensor * src[HTP_OP_MAX_INPUTS];
union {
const struct htp_tensor * dst;
const struct htp_tensor * dsts[HTP_OP_MAX_OUTPUTS];
};
const struct htp_tensor * dst;
// TODO convert these to an array
struct htp_spad src0_spad;
@@ -93,13 +87,13 @@ struct htp_context {
struct htp_ops_context octx;
#ifdef HTP_HAS_HMX
struct hmx_queue * hmx_queue; // Async HMX queue for pipeline overlap
#endif
};
int op_matmul(struct htp_ops_context * octx);
int op_matmul_id(struct htp_ops_context * octx);
int op_matmul_qkv(struct htp_ops_context * octx);
int op_matmul_ffn(struct htp_ops_context * octx);
int op_binary(struct htp_ops_context * octx);
int op_unary(struct htp_ops_context * octx);
int op_sum_rows(struct htp_ops_context * octx);
+8 -15
View File
@@ -28,19 +28,18 @@ enum htp_data_type {
HTP_TYPE_MXFP4 = 39,
// types used internally for repack, dyn.quant, etc
HTP_TYPE_Q4_0_TILED = 200,
HTP_TYPE_Q4_1_TILED,
HTP_TYPE_Q8_0_TILED,
HTP_TYPE_MXFP4_TILED,
HTP_TYPE_Q4_0x4x2 = 200,
HTP_TYPE_Q4_1x4x2,
HTP_TYPE_Q8_0x4x2,
HTP_TYPE_MXFP4x4x2,
HTP_TYPE_INVALID
};
// Constats for internal types
#define QK_Q4_0_TILED 256 // 32x32 Q4_0 tiled layout
#define QK_Q8_0_TILED 128 // 32x32 Q8_0 tiled layout
#define QK_MXFP4_TILED 256 // 32x32 MXFP4 tiled layout
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks
// Mask to enable various stages of the Ops.
@@ -58,8 +57,6 @@ enum htp_op_code {
HTP_OP_DIV = 3,
HTP_OP_MUL_MAT,
HTP_OP_MUL_MAT_ID,
HTP_OP_MUL_MAT_QKV,
HTP_OP_MUL_MAT_FFN,
HTP_OP_RMS_NORM,
HTP_OP_RMS_NORM_MUL,
HTP_OP_UNARY_SILU,
@@ -102,9 +99,7 @@ enum htp_op_code {
#define HTP_OP_MAX_DIMS 4 // aka GGML_MAX_DIMS
#define HTP_OP_MAX_INPUTS 6 // aka GGML_MAX_SRCS
#define HTP_OP_MAX_OUTPUTS 4
#define HTP_OP_MAX_PARAMS 16 // aka GGML_MAX_OP_PARAMS
#define HTP_OP_MAX_KERN_PARAMS 32
#define HTP_OP_MAX_BUFS 16
#define HTP_OP_MAX_REQS 256
@@ -147,10 +142,8 @@ struct htp_op_desc {
uint32_t opcode; // GGML/HTP Op
uint32_t flags; // Op flags
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS]; // generic blob for host-precomputed parameters
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
uint16_t dst[HTP_OP_MAX_OUTPUTS]; // Output tensor indices
uint16_t pad[2]; // padding to align to 64 bits
uint16_t dst; // Output tensor index
};
#ifndef HTP_MAX_NTHREADS
+1 -2
View File
@@ -11,13 +11,12 @@ struct htp_iface_pmu_conf {
};
interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 n_hmx, in uint64 max_vmem);
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx, in uint64 max_vmem);
AEEResult stop();
AEEResult mmap(in uint32 fd, in uint32 size);
AEEResult munmap(in uint32 fd);
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
AEEResult etm(in uint32 enable);
AEEResult hwinfo(rout uint32 n_threads, rout uint32 n_hvx, rout uint32 n_hmx, rout uint64 vtcm_size);
};
#endif /* HTP_IDL */
+18 -13
View File
@@ -170,7 +170,25 @@ static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) {
}
#endif
/* Q6_Vsf_equals_Vw is only available on v73+.*/
#if __HVX_ARCH__ < 73
static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in)
{
HVX_Vector const vzero = Q6_V_vzero();
HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero);
HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in);
HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift);
HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift);
HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized);
HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp));
return ret;
}
static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in)
{
return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in));
}
#endif
static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
// This looks complicated.
@@ -287,17 +305,4 @@ static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
#endif // __HVX_ARCH__ < 79
static inline HVX_Vector hvx_vec_load_act_tile(const uint8_t * y_q, uint32_t kt, HVX_Vector * v_act_all) {
if (kt % 4 == 0) {
*v_act_all = hvx_vmem(y_q + kt * 32);
return *v_act_all;
} else if (kt % 4 == 1) {
return Q6_V_vror_VR(*v_act_all, 32);
} else if (kt % 4 == 2) {
return Q6_V_vror_VR(*v_act_all, 64);
} else {
return Q6_V_vror_VR(*v_act_all, 96);
}
}
#endif /* HVX_BASE_H */
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+23 -81
View File
@@ -361,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) {
static void htp_packet_callback(dspqueue_t queue, int error, void * context);
static void htp_error_callback(dspqueue_t queue, int error, void * context);
AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp_queue_id, uint32_t n_hvx, uint32_t n_hmx, uint64_t max_vmem) {
AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
@@ -395,9 +395,10 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp
return AEE_ENOMEMORY;
}
ctx->hmx_enabled = n_hmx;
#ifdef HTP_HAS_HMX
ctx->hmx_enabled = use_hmx;
ctx->hmx_queue = NULL;
if (n_hmx) {
if (use_hmx) {
ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx);
if (ctx->hmx_queue) {
ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS];
@@ -406,7 +407,8 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp
ctx->hmx_enabled = false;
}
}
FARF(HIGH, "HMX %s (n_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", n_hmx);
FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
#endif
qurt_sysenv_max_hthreads_t hw_threads;
qurt_sysenv_get_max_hw_threads(&hw_threads);
@@ -479,11 +481,13 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
dma_queue_delete(ctx->dma[i]);
}
#ifdef HTP_HAS_HMX
if (ctx->hmx_queue) {
hmx_queue_delete(ctx->hmx_queue);
ctx->hmx_queue = NULL;
}
ctx->hmx_enabled = false;
#endif
vtcm_free(ctx);
@@ -496,36 +500,6 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
return AEE_SUCCESS;
}
AEEResult htp_iface_hwinfo(remote_handle64 handle, uint32_t * n_threads, uint32_t * n_hvx, uint32_t * n_hmx, uint64_t * vtcm_size) {
(void)handle;
if (!n_threads || !n_hvx || !n_hmx || !vtcm_size) {
return AEE_EBADPARM;
}
qurt_sysenv_max_hthreads_t hw_threads;
qurt_sysenv_get_max_hw_threads(&hw_threads);
uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;
uint32_t n_hvx_val = hw_nhvx;
if (n_hvx_val > hw_threads.max_hthreads) {
n_hvx_val = hw_threads.max_hthreads;
}
if (n_hvx_val > HTP_MAX_NTHREADS) {
n_hvx_val = HTP_MAX_NTHREADS;
}
// for now we force n_threads == n_hvx
*n_threads = n_hvx_val;
*n_hvx = n_hvx_val;
*n_hmx = 1;
uint32_t vtcm_sz = 8 * 1024 * 1024; // 8MB default fallback
HAP_compute_res_query_VTCM(0, (unsigned int *)&vtcm_sz, NULL, NULL, NULL);
*vtcm_size = vtcm_sz;
return AEE_SUCCESS;
}
static void htp_error_callback(dspqueue_t queue, int error, void * context) {
// No errors expected on the DSP.
FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
@@ -580,12 +554,6 @@ static int execute_op(struct htp_ops_context * octx) {
case HTP_OP_MUL_MAT_ID:
return op_matmul_id(octx);
case HTP_OP_MUL_MAT_QKV:
return op_matmul_qkv(octx);
case HTP_OP_MUL_MAT_FFN:
return op_matmul_ffn(octx);
case HTP_OP_MUL:
case HTP_OP_ADD:
case HTP_OP_SUB:
@@ -794,9 +762,8 @@ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, str
}
}
static int proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) {
memcpy(octx->op_params, op->params, sizeof(octx->op_params));
memcpy(octx->kernel_params, op->kernel_params, sizeof(octx->kernel_params));
octx->flags = op->flags;
octx->op = op->opcode;
@@ -818,41 +785,22 @@ static int proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, u
src->ne[0], src->ne[1], src->ne[3], src->ne[3]);
}
// Prep output tensors
for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
uint16_t dst_idx = op->dst[i];
if (dst_idx == 0xffff) {
octx->dsts[i] = NULL;
continue;
}
struct htp_tensor *dst = tens + dst_idx;
octx->dsts[i] = dst;
// Prep output tensor
struct htp_tensor *dst = tens + op->dst;
FARF(HIGH, "prep-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, dst_idx, (void*) dst->data, dst->size,
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
}
octx->dst = dst;
int status = execute_op(octx);
FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
octx->src0_spad.src = NULL;
octx->src1_spad.src = NULL;
octx->src2_spad.src = NULL;
octx->src3_spad.src = NULL;
octx->dst_spad.src = NULL;
(void) execute_op(octx);
// flush buffers on output
for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) {
if (octx->dsts[i]) {
struct htp_tensor *dst = (struct htp_tensor *)octx->dsts[i];
hex_l2flush((void *) dst->data, dst->size);
dst->flags |= HTP_TENSOR_FLUSHED;
hex_l2flush((void *) dst->data, dst->size);
dst->flags |= HTP_TENSOR_FLUSHED;
FARF(HIGH, "post-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, op->dst[i], (void*) dst->data, dst->size,
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
}
}
return status;
FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size,
dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]);
}
#define DSPQUEUE_POLL_TIMEOUT_USEC 100
@@ -944,26 +892,20 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
}
}
int op_status = HTP_STATUS_OK;
uint32_t op_wakeup = n_ops / 2; // half-way throgh the batch
for (uint32_t i=0; i < n_ops; i++) {
struct profile_data prof;
if (i == op_wakeup) {
if (i == (n_ops-1)) {
// wake up the host before starting the last op
dspqueue_write_early_wakeup_noblock(queue, 0, 0);
}
profile_start(ctx->profiler, &prof);
op_status = proc_op_req(octx, tens, i, &ops[i]);
proc_op_req(octx, tens, i, &ops[i]);
profile_stop(ctx->profiler, &prof);
if (op_status != HTP_STATUS_OK) {
break;
}
if (ctx->profiler) {
pds[i].opcode = ops[i].opcode;
pds[i].usecs = prof.usecs;
@@ -977,7 +919,7 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
struct htp_opbatch_rsp rsp;
rsp.id = req.id;
rsp.status = op_status;
rsp.status = HTP_STATUS_OK;
rsp.n_bufs = n_bufs;
rsp.n_tensors = n_tens;
rsp.n_ops = n_ops;
File diff suppressed because it is too large Load Diff
-508
View File
@@ -1,508 +0,0 @@
#ifndef HTP_MATMUL_OPS_H
#define HTP_MATMUL_OPS_H
#include <stdint.h>
#include <stddef.h>
#include "htp-ops.h"
#include "hex-fastdiv.h"
#include "hex-common.h"
#ifdef __cplusplus
extern "C" {
#endif
// --- HMX Tile Constraints ---
#define HTP_MM_HMX_TILE_N_COLS 32
#define HTP_MM_HMX_TILE_N_ROWS 32
#define HTP_MM_HMX_TILE_SIZE (32 * 32 * sizeof(__fp16)) // 2048 bytes
#define HTP_MM_HMX_TILE_N_ELMS 1024
#define HTP_MM_HMX_MIN_NROWS 4
// --- Weight Repacked Tile Sizes ---
#define HTP_MM_WEIGHT_TILE_SIZE_Q4_0 576
#define HTP_MM_WEIGHT_TILE_SIZE_Q4_1 640
#define HTP_MM_WEIGHT_TILE_SIZE_Q8_0 1088
#define HTP_MM_WEIGHT_TILE_SIZE_IQ4_NL 576
#define HTP_MM_WEIGHT_TILE_SIZE_MXFP4 544
// --- Weight Repacked Aligned Tile Sizes ---
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0 640
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1 640
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0 1152
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_IQ4_NL 640
#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4 640
// --- Activation Tiled Block Sizes (including padding) ---
#define HTP_MM_ACT_TILE_SIZE_Q8_0 1152
#define HTP_MM_ACT_TILE_SIZE_Q8_1 1280
#define HTP_MM_MAX_PREFETCH 16
// --- Solver Cost Model Penalty Weights (HMX-specific) ---
#define HTP_MM_HMX_COST_W_DEQUANT 3 // cost penalty for quantized weight loading/dequantization
#define HTP_MM_HMX_COST_A_CONVERT 2 // cost penalty for activation loading/conversion
// --- DMA Activation Transfer Configuration ---
#define HTP_MM_DMA_ACT_ROWS_PER_STEP 2
#define HTP_MM_DMA_ACT_MULTIPLIER 4
enum htp_mm_kernel_type {
HTP_MM_KERNEL_UNSUPPORTED = 0,
// HMX paths
HTP_MM_KERNEL_HMX_2D,
HTP_MM_KERNEL_HMX_F16_BATCHED,
// HVX floating-point paths
HTP_MM_KERNEL_HVX_F16_F16_VTCM,
HTP_MM_KERNEL_HVX_F16_F16_DDR,
HTP_MM_KERNEL_HVX_F16_F32_DDR,
HTP_MM_KERNEL_HVX_F32_F32_VTCM,
HTP_MM_KERNEL_HVX_F32_F32_DDR,
HTP_MM_KERNEL_HVX_F32_F16_DDR,
// HVX quantized paths
HTP_MM_KERNEL_HVX_QUANT_ROW, // standard row-wise parallel quantization
HTP_MM_KERNEL_HVX_QUANT_BLOCK, // parallel block-wise quantization
HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT, // row-wise fallback flat quantization
};
// Op-specific struct for precomputed matmul params
struct htp_mm_kernel_params {
int32_t kernel_type; // enum htp_mm_kernel_type
int32_t pipeline; // 1 = pipelined execution, 0 = standard
int32_t m_chunk; // Row chunk size (M chunk)
int32_t n_chunk; // Col chunk size (N chunk)
int32_t n_threads; // Number of threads to spawn
int32_t n_act_threads; // Number of threads for activation preparation
int32_t n_hmx; // 1 = use HMX, 0 = use HVX
int32_t n_prefetch; // Prefetch lookahead buffers/rows in VTCM
int32_t tile_size; // Weight tile size
int32_t aligned_tile_size; // Aligned weight tile size (padded to 128)
int32_t src1_row_size; // Row size for quantized activation
int32_t vtcm_size; // Total required scratchpad size in VTCM
int32_t vtcm_src0_size; // src0 scratchpad size in VTCM
int32_t vtcm_src1_size; // src1 scratchpad size in VTCM
int32_t vtcm_src2_size; // src2 scratchpad size in VTCM (fused only)
int32_t vtcm_src3_size; // src3 scratchpad size in VTCM (fused only)
int32_t vtcm_dst_size; // dst scratchpad size in VTCM
// Precomputed division values
struct fastdiv_values div_ne12_ne1;
struct fastdiv_values div_ne1;
struct fastdiv_values div_r2;
struct fastdiv_values div_r3;
struct fastdiv_values div_ne11;
};
#if defined(__cplusplus)
static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
#else
_Static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob");
#endif
struct mmid_row_mapping {
uint32_t i1;
uint32_t i2;
};
// Search for optimal (mc, nc) chunk sizes within VTCM budget.
static inline int htp_mm_hmx_compute_chunks(size_t vtcm_total,
size_t overhead,
size_t per_n_cost,
size_t per_m_cost,
size_t per_mn_cost,
size_t m,
size_t n,
size_t m_block_cost,
size_t n_block_cost,
size_t * m_chunk_out,
size_t * n_chunk_out,
size_t * total_out) {
if (m == 0 || n == 0) return -1;
if (vtcm_total <= overhead) return -1;
if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
const size_t usable = vtcm_total - overhead;
size_t best_cost = SIZE_MAX;
size_t best_mn = 0;
size_t best_m = 0, best_n = 0;
const size_t n_max = hex_align_down((size_t)n, HTP_MM_HMX_TILE_N_COLS);
for (size_t nc = n_max; nc >= HTP_MM_HMX_TILE_N_COLS; nc -= HTP_MM_HMX_TILE_N_COLS) {
size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
if (hex_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
if (n_fixed >= usable) goto next_nc;
if (hex_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc;
if (hex_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc;
{
size_t remain = usable - n_fixed;
size_t mc = remain / mc_denom;
mc = hex_align_down(mc, HTP_MM_HMX_TILE_N_ROWS);
mc = hex_smin(mc, m);
if (mc == 0) {
goto next_nc;
}
size_t mblocks = ((size_t) m + mc - 1) / mc;
size_t nblocks = ((size_t) n + nc - 1) / nc;
size_t cost = mblocks * m_block_cost + nblocks * n_block_cost;
size_t mn = mc * nc;
if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
best_cost = cost;
best_mn = mn;
best_m = mc;
best_n = nc;
}
}
next_nc:
if (nc == HTP_MM_HMX_TILE_N_COLS) break; // avoid size_t underflow
}
if (best_m == 0 || best_n == 0) return -1;
// Compute exact total (with overflow checks)
size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0;
if (hex_mul_overflow(best_n, per_n_cost, &t0)) return -1;
if (hex_mul_overflow(best_m, per_m_cost, &t1)) return -1;
if (hex_mul_overflow(best_m, best_n, &mn)) return -1;
if (hex_mul_overflow(mn, per_mn_cost, &t2)) return -1;
if (hex_add_overflow(t0, t1, &total)) return -1;
if (hex_add_overflow(total, t2, &total)) return -1;
if (hex_add_overflow(total, overhead, &total)) return -1;
*m_chunk_out = best_m;
*n_chunk_out = best_n;
*total_out = total;
return 0;
}
// --- Tile Size Helpers ---
static inline uint32_t htp_mm_get_weight_tile_size(int weight_type) {
switch (weight_type) {
case HTP_TYPE_Q4_0:
case HTP_TYPE_IQ4_NL:
return HTP_MM_WEIGHT_TILE_SIZE_Q4_0;
case HTP_TYPE_Q4_1:
return HTP_MM_WEIGHT_TILE_SIZE_Q4_1;
case HTP_TYPE_Q8_0:
return HTP_MM_WEIGHT_TILE_SIZE_Q8_0;
case HTP_TYPE_MXFP4:
return HTP_MM_WEIGHT_TILE_SIZE_MXFP4;
default:
return 0;
}
}
static inline uint32_t htp_mm_get_weight_aligned_tile_size(int weight_type) {
switch (weight_type) {
case HTP_TYPE_Q4_0:
case HTP_TYPE_IQ4_NL:
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0;
case HTP_TYPE_Q4_1:
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1;
case HTP_TYPE_Q8_0:
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0;
case HTP_TYPE_MXFP4:
return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4;
default:
return 0;
}
}
// --- Activation/Row Size Helpers ---
static inline size_t htp_mm_q8_0_tiled_row_size(uint32_t ne) {
const uint32_t ne_padded = ((ne + 127) / 128) * 128;
const uint32_t nb_32 = ne_padded / 32;
return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_0;
}
static inline size_t htp_mm_q8_1_tiled_row_size(uint32_t ne) {
const uint32_t ne_padded = ((ne + 127) / 128) * 128;
const uint32_t nb_32 = ne_padded / 32;
return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_1;
}
static inline size_t htp_mm_q8_0_flat_row_size(uint32_t ne) {
const uint32_t quants_size = hex_align_up(ne, 128);
const uint32_t num_scales = (ne + 31) / 32;
const uint32_t scales_size = hex_align_up(num_scales * 2, 128);
return quants_size + scales_size;
}
static inline size_t htp_mm_q8_1_flat_row_size(uint32_t ne) {
const uint32_t quants_size = hex_align_up(ne, 128);
const uint32_t num_scales = (ne + 31) / 32;
const uint32_t scales_size = hex_align_up(num_scales * 4, 128);
return quants_size + scales_size;
}
static inline size_t htp_mm_get_tiled_row_stride(int weight_type, uint32_t k) {
uint32_t nb = (k + QK_Q4_0_TILED - 1) / QK_Q4_0_TILED;
switch (weight_type) {
case HTP_TYPE_Q4_0:
case HTP_TYPE_IQ4_NL:
case HTP_TYPE_Q4_1:
case HTP_TYPE_Q8_0:
case HTP_TYPE_MXFP4:
return (size_t) nb * htp_mm_get_weight_tile_size(weight_type);
case HTP_TYPE_F16:
return (size_t) k * sizeof(__fp16);
case HTP_TYPE_F32:
return (size_t) k * sizeof(float);
default:
return 0;
}
}
static inline size_t htp_mm_round_up(size_t n, size_t m) {
return ((n + m - 1) / m) * m;
}
static inline bool htp_mm_hmx_pipeline(uint32_t m) {
return m > 32;
}
static inline void htp_mm_hmx_get_2d_chunk_costs(
int wtype, uint32_t k, bool pipeline, uint32_t aligned_tile_size,
size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
) {
const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
const size_t vec_dot_size = k * sizeof(uint16_t);
const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0;
*size_per_n_out = (pipeline ? 2 : 1) * (is_quant ? qweight_row_stride : row_stride) +
(pipeline ? 2 * vec_dot_size : vec_dot_size);
*size_per_m_out = vec_dot_size;
*size_per_mn_out = (pipeline ? 2 : 1) * sizeof(uint16_t);
}
static inline void htp_mm_hmx_get_batched_chunk_costs(
uint32_t k, uint32_t group_size,
size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out
) {
const size_t vec_dot_size = k * sizeof(uint16_t);
*size_per_n_out = 3 * vec_dot_size;
*size_per_m_out = group_size * vec_dot_size;
*size_per_mn_out = sizeof(uint16_t);
}
static inline size_t htp_mm_hmx_get_2d_vtcm_size(
int wtype, uint32_t k, size_t mc, size_t nc, bool pipeline, uint32_t act_threads, uint32_t aligned_tile_size
) {
const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS;
const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32);
const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k);
const size_t vec_dot_size = k * sizeof(uint16_t);
const size_t act_f32_size = htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE);
size_t weight_area_size = is_quant
? htp_mm_round_up((nc / 32) * n_k_tiles * aligned_tile_size, HTP_MM_HMX_TILE_SIZE)
: htp_mm_round_up(nc * row_stride, HTP_MM_HMX_TILE_SIZE);
if (pipeline) {
weight_area_size *= 2;
}
const size_t act_area_size = htp_mm_round_up(mc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
const size_t output_area_size = htp_mm_round_up(mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
size_t scratch0_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
size_t scratch1_size = pipeline ? scratch0_size : 0;
size_t scratch2_size = pipeline ? output_area_size : 0;
return weight_area_size + act_area_size + act_f32_size + output_area_size +
scratch0_size + scratch1_size + scratch2_size + 256;
}
static inline size_t htp_mm_hmx_get_batched_vtcm_size(
int wtype, uint32_t k, size_t mc, size_t nc, uint32_t group_size, bool use_dma_activation, bool pipeline, uint32_t act_threads) {
(void)wtype;
(void)pipeline;
const size_t vec_dot_size = k * sizeof(uint16_t);
const size_t f32_scratch_size = use_dma_activation
? htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0;
const size_t act_head_stride = mc * k;
const size_t weight_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
const size_t act_area_size = htp_mm_round_up(group_size * act_head_stride * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
const size_t output_area_size = htp_mm_round_up(group_size * mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE);
const size_t scratch_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE);
return weight_area_size + act_area_size + output_area_size +
2 * scratch_area_size + 256 + f32_scratch_size;
}
static inline size_t htp_mm_hvx_get_vtcm_sizes(
int kernel_type,
int wtype,
uint32_t ne10, // k
uint32_t src1_nrows, // m_total (or act_nrows)
uint32_t n_threads,
size_t dst_row_size,
size_t src0_row_size,
size_t src1_row_size,
uint32_t n_prefetch,
size_t * vtcm_src0_size_out,
size_t * vtcm_src1_size_out,
size_t * vtcm_dst_size_out
) {
size_t vtcm_src0_size = 0;
size_t vtcm_src1_size = 0;
size_t vtcm_dst_size = 0;
const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
wtype == HTP_TYPE_MXFP4);
const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
const size_t dst_nrows = (src1_nrows > 1) ? 0 : 1;
switch (kernel_type) {
case HTP_MM_KERNEL_HVX_F16_F16_VTCM: {
size_t f16_src1_row_size = htp_mm_round_up(ne10 * 2, 128);
vtcm_src1_size = htp_mm_round_up(f16_src1_row_size * src1_nrows, 256);
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
break;
}
case HTP_MM_KERNEL_HVX_F16_F32_DDR:
case HTP_MM_KERNEL_HVX_F16_F16_DDR:
case HTP_MM_KERNEL_HVX_F32_F32_DDR:
case HTP_MM_KERNEL_HVX_F32_F16_DDR: {
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size, 256) * n_threads;
vtcm_src1_size = htp_mm_round_up(n_prefetch * src1_row_size, 256) * n_threads;
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
break;
}
case HTP_MM_KERNEL_HVX_F32_F32_VTCM: {
size_t f32_src1_row_size = htp_mm_round_up(ne10 * 4, 128);
vtcm_src1_size = htp_mm_round_up(f32_src1_row_size * src1_nrows, 256);
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads;
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0;
break;
}
case HTP_MM_KERNEL_HVX_QUANT_BLOCK:
case HTP_MM_KERNEL_HVX_QUANT_ROW: {
size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10);
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
// src0 spad is also used in dynamic quantizer to store padded src1 rows
size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, QK_Q8_0_TILED * sizeof(float));
if (vtcm_src0_size < src1_row_size_padded) {
vtcm_src0_size = src1_row_size_padded;
}
vtcm_src0_size = vtcm_src0_size * n_threads;
vtcm_dst_size = vtcm_dst_size * n_threads;
if (is_repack) {
uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
uint32_t n_k_tiles = ne10 / 32;
uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
if (repacked_vtcm_size < src1_row_size_padded) {
repacked_vtcm_size = src1_row_size_padded;
}
vtcm_src0_size = repacked_vtcm_size * n_threads;
}
break;
}
case HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT: {
size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10);
vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0;
vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256);
size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, 256);
if (vtcm_src0_size < src1_row_size_padded) {
vtcm_src0_size = src1_row_size_padded;
}
vtcm_src0_size = vtcm_src0_size * n_threads;
vtcm_dst_size = vtcm_dst_size * n_threads;
if (is_repack) {
uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
uint32_t n_k_tiles = ne10 / 32;
uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
if (repacked_vtcm_size < src1_row_size_padded) {
repacked_vtcm_size = src1_row_size_padded;
}
vtcm_src0_size = repacked_vtcm_size * n_threads;
}
break;
}
default:
break;
}
*vtcm_src0_size_out = vtcm_src0_size;
*vtcm_src1_size_out = vtcm_src1_size;
*vtcm_dst_size_out = vtcm_dst_size;
return vtcm_src0_size + vtcm_src1_size + vtcm_dst_size;
}
static inline size_t htp_mm_hvx_id_get_vtcm_sizes(
int wtype,
uint32_t ne10, // k
uint32_t src1_nrows,
uint32_t n_threads,
size_t src0_row_size, // nb01
uint32_t n_prefetch,
size_t * vtcm_src0_size_out,
size_t * vtcm_src1_size_out
) {
const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 ||
wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL ||
wtype == HTP_TYPE_MXFP4);
const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128);
const size_t src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10)
: htp_mm_q8_0_tiled_row_size(ne10);
size_t src0_sz_per_thread = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256);
size_t src1_sz = htp_mm_round_up(src1_row_size * src1_nrows, 256);
// src0 spad also holds temporary transposed src1 columns during dynamic quantization.
const size_t src1_row_size_padded = htp_mm_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float));
if (src0_sz_per_thread < src1_row_size_padded) {
src0_sz_per_thread = src1_row_size_padded;
}
if (is_repack) {
const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype);
const uint32_t n_k_tiles = ne10 / 32;
const uint32_t tile_row_size = n_k_tiles * aligned_tile_size;
size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256);
if (repacked_vtcm_size < src1_row_size_padded) {
repacked_vtcm_size = src1_row_size_padded;
}
src0_sz_per_thread = repacked_vtcm_size;
}
const size_t vtcm_src0_size = src0_sz_per_thread * n_threads;
*vtcm_src0_size_out = vtcm_src0_size;
*vtcm_src1_size_out = src1_sz;
return vtcm_src0_size + src1_sz;
}
#ifdef __cplusplus
}
#endif
#endif // HTP_MATMUL_OPS_H
+4
View File
@@ -14,6 +14,8 @@ Drivers_Dir = 13
1 = %DiskId%
[SourceDisksFiles]
libggml-htp-v68.so = 1
libggml-htp-v69.so = 1
libggml-htp-v73.so = 1
libggml-htp-v75.so = 1
libggml-htp-v79.so = 1
@@ -26,6 +28,8 @@ ExcludeFromSelect = *
CopyFiles=Drivers_Dir
[Drivers_Dir]
libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE
@@ -174,7 +174,7 @@ __kernel void kernel_gemv_noshuffle_q8_0_f32(
regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, convert_float(regS), regB);
dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB);
}
// reduction in local memory, assumes #wave=4
+13 -6
View File
@@ -60,7 +60,19 @@ if (Vulkan_FOUND)
message(STATUS "Vulkan found")
ggml_add_backend_library(ggml-vulkan
ggml-vulkan.cpp
ggml-vulkan-instance.cpp
ggml-vulkan-shaders.cpp
ggml-vulkan-buffers.cpp
ggml-vulkan-pipelines.cpp
ggml-vulkan-matmul.cpp
ggml-vulkan-flash-attn.cpp
ggml-vulkan-operators.cpp
ggml-vulkan-graph.cpp
ggml-vulkan-backend.cpp
ggml-vulkan-debug.cpp
ggml-vulkan-types.h
ggml-vulkan-push-constants.h
ggml-vulkan-common.h
../../include/ggml-vulkan.h
)
@@ -108,9 +120,6 @@ if (Vulkan_FOUND)
if (GGML_VULKAN_CHECK_RESULTS)
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
# the result-checking path computes a CPU reference graph via
# ggml_graph_compute_with_ctx(), which is defined in ggml-cpu
target_link_libraries(ggml-vulkan PRIVATE ggml-cpu)
endif()
if (GGML_VULKAN_DEBUG)
@@ -132,8 +141,6 @@ if (Vulkan_FOUND)
if (GGML_VULKAN_RUN_TESTS)
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
# the test path also calls ggml_graph_compute_with_ctx() (ggml-cpu)
target_link_libraries(ggml-vulkan PRIVATE ggml-cpu)
endif()
# Set up toolchain for host compilation whether cross-compiling or not
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,783 @@
#include "ggml-vulkan-common.h"
ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
/* .get_name = */ ggml_backend_vk_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
};
static std::vector<uint32_t> ggml_vk_find_memory_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
std::vector<uint32_t> indices;
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
vk::MemoryType memory_type = mem_props->memoryTypes[i];
if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
(flags & memory_type.propertyFlags) == flags &&
mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
indices.push_back(i);
}
}
return indices;
}
static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list,
void *import_ptr = nullptr) {
VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
if (size > device->max_buffer_size) {
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
}
vk_buffer buf = std::make_shared<vk_buffer_struct>();
if (size == 0) {
buf->size = 0;
return buf;
}
vk::BufferUsageFlags usage_flags = vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst;
vk::MemoryAllocateFlags mem_flags {};
if (device->buffer_device_address) {
usage_flags |= vk::BufferUsageFlagBits::eShaderDeviceAddress;
mem_flags |= vk::MemoryAllocateFlagBits::eDeviceAddress;
}
vk::BufferCreateInfo buffer_create_info{
vk::BufferCreateFlags(),
size,
usage_flags,
vk::SharingMode::eExclusive,
0,
nullptr,
};
vk::ExternalMemoryBufferCreateInfo external_memory_bci;
if (import_ptr) {
external_memory_bci.handleTypes = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
buffer_create_info.setPNext(&external_memory_bci);
}
buf->buffer = device->device.createBuffer(buffer_create_info);
vk::MemoryRequirements mem_req = device->device.getBufferMemoryRequirements(buf->buffer);
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
const vk::MemoryPriorityAllocateInfoEXT mem_priority_info { 1.0f };
vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags };
if (device->memory_priority) {
mem_flags_info.setPNext(&mem_priority_info);
}
if (import_ptr) {
vk::MemoryHostPointerPropertiesEXT host_pointer_props;
try {
host_pointer_props = device->device.getMemoryHostPointerPropertiesEXT(vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT, import_ptr);
} catch (vk::SystemError& e) {
GGML_LOG_WARN("ggml_vulkan: Failed getMemoryHostPointerPropertiesEXT (%s)\n", e.what());
device->device.destroyBuffer(buf->buffer);
return {};
}
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
uint32_t memory_type_idx;
vk::MemoryPropertyFlags property_flags = *req_flags_list.begin();
for (memory_type_idx = 0; memory_type_idx < 32; ++memory_type_idx) {
if (!(host_pointer_props.memoryTypeBits & (1u << memory_type_idx))) {
continue;
}
if (!(mem_req.memoryTypeBits & (1u << memory_type_idx))) {
continue;
}
vk::MemoryType memory_type = mem_props.memoryTypes[memory_type_idx];
// check for visible+coherent+cached. Other flags (e.g. devicelocal) are allowed
if ((memory_type.propertyFlags & property_flags) == property_flags) {
property_flags = memory_type.propertyFlags;
break;
}
}
if (memory_type_idx == 32) {
GGML_LOG_WARN("ggml_vulkan: Memory type for host allocation not found\n");
device->device.destroyBuffer(buf->buffer);
return {};
}
buf->memory_property_flags = mem_props.memoryTypes[memory_type_idx].propertyFlags;
try {
vk::ImportMemoryHostPointerInfoEXT import_info;
import_info.handleType = vk::ExternalMemoryHandleTypeFlagBits::eHostAllocationEXT;
import_info.pHostPointer = import_ptr;
import_info.setPNext(&mem_flags_info);
buf->device_memory = device->device.allocateMemory({ size, memory_type_idx, &import_info });
} catch (const vk::SystemError& e) {
}
} else {
for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
const auto & req_flags = *it;
const std::vector<uint32_t> memory_type_indices = ggml_vk_find_memory_properties(&mem_props, &mem_req, req_flags);
if (memory_type_indices.empty()) {
continue;
}
bool done = false;
for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
try {
buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
buf->memory_property_flags = mem_props.memoryTypes[*mtype_it].propertyFlags;
done = true;
break;
} catch (const vk::SystemError& e) {
// loop and retry
// during last attempt throw the exception
if (it + 1 == req_flags_list.end() && mtype_it + 1 == memory_type_indices.end()) {
device->device.destroyBuffer(buf->buffer);
throw e;
}
}
}
if (done) {
break;
}
}
}
if (!buf->device_memory) {
device->device.destroyBuffer(buf->buffer);
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
}
buf->ptr = nullptr;
if (import_ptr) {
buf->ptr = import_ptr;
} else {
if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
buf->ptr = device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
}
}
device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
buf->device = device;
buf->size = size;
if (device->buffer_device_address) {
const vk::BufferDeviceAddressInfo addressInfo(buf->buffer);
buf->bda_addr = device->device.getBufferAddress(addressInfo);
}
device->memory_logger->log_allocation(buf, size);
return buf;
}
vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags) {
try {
return ggml_vk_create_buffer(device, size, {req_flags, fallback_flags});
} catch (const vk::SystemError& e) {
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
throw e;
}
}
vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
vk_buffer buf;
try {
if (device->prefer_host_memory) {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal});
} else if (device->uma) {
// On UMA, prefer host-visible memory so direct tensor borrowing works.
// If unavailable, fall back to device-local memory.
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
} else if (device->disable_host_visible_vidmem) {
if (device->allow_sysmem_fallback) {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
} else {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
}
} else {
// use rebar if available, otherwise fallback to device only visible memory
if (device->allow_sysmem_fallback) {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
} else {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal});
}
}
} catch (const vk::SystemError& e) {
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
throw e;
}
return buf;
}
void ggml_vk_destroy_buffer(vk_buffer& buf) {
if (buf == nullptr) {
return;
}
if (buf->device != nullptr) {
buf->device->memory_logger->log_deallocation(buf);
}
buf.reset();
}
void * ggml_vk_host_malloc(vk_device& device, size_t size) {
VK_LOG_MEMORY("ggml_vk_host_malloc(" << size << ")");
vk_buffer buf = ggml_vk_create_buffer(device, size,
{vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
size/1024.0/1024.0);
device->device.freeMemory(buf->device_memory);
device->device.destroyBuffer(buf->buffer);
return nullptr;
}
std::lock_guard<std::shared_mutex> guard(device->pinned_memory_mutex);
device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf));
return buf->ptr;
}
void ggml_vk_host_free(vk_device& device, void* ptr) {
if (ptr == nullptr) {
return;
}
VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")");
std::lock_guard<std::shared_mutex> guard(device->pinned_memory_mutex);
vk_buffer buf;
size_t index;
for (size_t i = 0; i < device->pinned_memory.size(); i++) {
const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]);
const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]);
if (ptr >= addr && ptr < endr) {
buf = std::get<2>(device->pinned_memory[i]);
index = i;
break;
}
}
if (buf == nullptr) {
fprintf(stderr, "WARNING: failed to free pinned memory: memory not in map\n");
return;
}
ggml_vk_destroy_buffer(buf);
device->pinned_memory.erase(device->pinned_memory.begin() + index);
}
void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) {
std::shared_lock<std::shared_mutex> guard(device->pinned_memory_mutex);
buf = nullptr;
buf_offset = 0;
for (size_t i = 0; i < device->pinned_memory.size(); i++) {
const uint8_t* addr = (const uint8_t*) std::get<0>(device->pinned_memory[i]);
const uint8_t* endr = addr + std::get<1>(device->pinned_memory[i]);
if (ptr >= addr && ptr < endr) {
buf = std::get<2>(device->pinned_memory[i]);
buf_offset = ((const uint8_t *)ptr) - addr;
break;
}
}
}
void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size) {
if (device->sync_staging == nullptr || device->sync_staging->size < size) {
VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
ggml_vk_destroy_buffer(device->sync_staging);
device->sync_staging = ggml_vk_create_buffer_check(device, size,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
}
}
void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
VK_LOG_MEMORY("ggml_vk_ensure_sync_staging_buffer(" << size << ")");
ggml_vk_destroy_buffer(ctx->sync_staging);
ctx->sync_staging = ggml_vk_create_buffer_check(ctx->device, size,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
}
}
static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context& subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
VK_LOG_DEBUG("ggml_vk_buffer_write_nc_async(" << tensor << ")");
GGML_ASSERT(!ggml_is_contiguous(tensor));
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ABORT("fatal error");
}
// Check if src is pinned memory
vk_buffer buf = nullptr;
size_t buf_offset = 0;
ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset);
const uint64_t ne0 = tensor->ne[0];
const uint64_t ne1 = tensor->ne[1];
const uint64_t ne2 = tensor->ne[2];
const uint64_t ne3 = tensor->ne[3];
const uint64_t nb0 = tensor->nb[0];
const uint64_t nb1 = tensor->nb[1];
const uint64_t nb2 = tensor->nb[2];
const uint64_t nb3 = tensor->nb[3];
const ggml_type type = tensor->type;
const uint64_t ts = ggml_type_size(type);
const uint64_t bs = ggml_blck_size(type);
const uint64_t dstnb0 = ts;
const uint64_t dstnb1 = dstnb0*(ne0/bs);
const uint64_t dstnb2 = dstnb1*ne1;
const uint64_t dstnb3 = dstnb2*ne2;
const uint64_t ne = ggml_nelements(tensor);
if (buf != nullptr) {
// Memory is pinned, use as staging buffer
std::vector<vk::BufferCopy> slices;
for (uint64_t i3 = 0; i3 < ne3; i3++) {
for (uint64_t i2 = 0; i2 < ne2; i2++) {
// Find longest contiguous slice
if (ne1*nb1 == dstnb2) {
slices.push_back({ buf_offset + i3*nb3 + i2*nb2, offset + i3*dstnb3 + i2*dstnb2, dstnb2 });
} else {
for (uint64_t i1 = 0; i1 < ne1; i1++) {
if (ne0*nb0/bs == dstnb1) {
slices.push_back({ buf_offset + i3*nb3 + i2*nb2 + i1*nb1, offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, dstnb1 });
} else {
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
const uint64_t d_off = offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
for (uint64_t i0 = 0; i0 < ne0; i0++) {
slices.push_back({ s_off + i0*nb0, d_off + i0*dstnb0, dstnb0 });
}
}
}
}
}
}
ggml_vk_sync_buffers(ctx, subctx);
subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
return;
}
if (!sync_staging) {
GGML_ABORT("Asynchronous write to non-pinned memory not supported");
}
// Staging buffer required
vk_buffer& staging = ctx->device->sync_staging;
const uint64_t copy_size = ts*ne/bs;
ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size);
VkBufferCopy buf_copy{ 0, offset, copy_size };
ggml_vk_sync_buffers(ctx, subctx);
vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
for (uint64_t i3 = 0; i3 < ne3; i3++) {
for (uint64_t i2 = 0; i2 < ne2; i2++) {
// Find longest contiguous slice
if (ne1*nb1 == dstnb2) {
deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys);
} else {
for (uint64_t i1 = 0; i1 < ne1; i1++) {
if (ne0*nb0/bs == dstnb1) {
deferred_memcpy((uint8_t *)staging->ptr + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys);
} else {
const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1;
const uint64_t d_off = i3*dstnb3 + i2*dstnb2 + i1*dstnb1;
for (uint64_t i0 = 0; i0 < ne0; i0++) {
deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys);
}
}
}
}
}
}
}
bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging) {
VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
// Check if src is pinned memory
vk_buffer buf = nullptr;
size_t buf_offset = 0;
ggml_vk_host_get(dst->device, src, buf, buf_offset);
if (buf != nullptr) {
// Memory is pinned, use as staging buffer
std::vector<vk::BufferCopy> slices(1);
if (width == spitch && width == dpitch) {
// Only do single write if stride is equal
slices[0].srcOffset = buf_offset;
slices[0].dstOffset = offset;
slices[0].size = width * height;
} else {
slices.resize(height);
for (size_t i = 0; i < height; i++) {
slices[i].srcOffset = buf_offset + i * spitch;
slices[i].dstOffset = offset + i * dpitch;
slices[i].size = width;
}
}
ggml_vk_sync_buffers(nullptr, subctx);
subctx->s->buffer->buf.copyBuffer(buf->buffer, dst->buffer, slices);
return true;
}
VK_LOG_DEBUG("STAGING");
if (!sync_staging) {
// copy was not handled caller needs to fall back
return false;
}
// Staging buffer required
const size_t staging_size = width * height;
ggml_vk_ensure_sync_staging_buffer(dst->device, staging_size);
vk_buffer& staging_buffer = dst->device->sync_staging;
std::vector<vk::BufferCopy> slices(1);
if (width == dpitch) {
slices[0].srcOffset = 0;
slices[0].dstOffset = offset;
slices[0].size = staging_size;
} else {
slices.resize(height);
for (size_t i = 0; i < height; i++) {
slices[i].srcOffset = i * width;
slices[i].dstOffset = offset + i * dpitch;
slices[i].size = width;
}
}
ggml_vk_sync_buffers(nullptr, subctx);
subctx->s->buffer->buf.copyBuffer((VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, slices);
if (width == spitch) {
deferred_memcpy((uint8_t *)staging_buffer->ptr, src, staging_size, &subctx->in_memcpys);
} else {
for (size_t i = 0; i < height; i++) {
deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
}
}
return true;
}
bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging) {
VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, size, 1, sync_staging);
}
void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height) {
VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
// Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
if (width == spitch && width == dpitch) {
memcpy((uint8_t *)dst->ptr + offset, src, width * height);
} else {
for (size_t i = 0; i < height; i++) {
memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width);
}
}
} else {
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, dpitch, width, height, true);
GGML_ASSERT(ret);
ggml_vk_ctx_end(subctx);
for (auto& cpy : subctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
for (auto& mset : subctx->memsets) {
memset(mset.dst, mset.val, mset.n);
}
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
dst->device->device.resetFences({ dst->device->fence });
ggml_vk_queue_command_pools_cleanup(dst->device);
}
}
void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
ggml_vk_buffer_write_2d(dst, offset, src, size, size, size, 1);
}
bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging) {
VK_LOG_DEBUG("ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")");
GGML_ASSERT(width > 0);
GGML_ASSERT(height > 0);
GGML_ASSERT(src != nullptr);
// TODO: staging_offset is not used
// Check if dst is pinned memory
vk_buffer buf = nullptr;
size_t buf_offset = 0;
ggml_vk_host_get(src->device, dst, buf, buf_offset);
std::vector<vk::BufferCopy> slices(1);
if (width == spitch && width == dpitch) {
// Only do single write if stride is equal
slices[0].srcOffset = offset;
slices[0].dstOffset = buf_offset;
slices[0].size = width * height;
} else {
slices.resize(height);
for (size_t i = 0; i < height; i++) {
slices[i].srcOffset = offset + i * spitch;
slices[i].dstOffset = buf_offset + i * dpitch;
slices[i].size = width;
}
}
if (buf != nullptr) {
// Memory is pinned, use as staging buffer
ggml_vk_sync_buffers(nullptr, subctx);
subctx->s->buffer->buf.copyBuffer(src->buffer, buf->buffer, slices);
return true;
}
VK_LOG_DEBUG("STAGING");
if (!sync_staging) {
// copy was not handled caller needs to fall back
return false;
}
// Fall back to staging buffer
const size_t staging_size = width * height;
ggml_vk_ensure_sync_staging_buffer(src->device, staging_size);
vk_buffer& staging_buffer = src->device->sync_staging;
std::vector<vk::BufferCopy> staging_slices(1);
if (width == spitch) {
staging_slices[0].srcOffset = offset;
staging_slices[0].dstOffset = 0;
staging_slices[0].size = staging_size;
} else {
staging_slices.resize(height);
for (size_t i = 0; i < height; i++) {
staging_slices[i].srcOffset = offset + i * spitch;
staging_slices[i].dstOffset = i * width;
staging_slices[i].size = width;
}
}
ggml_vk_sync_buffers(nullptr, subctx);
subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, staging_slices);
if (width == dpitch) {
deferred_memcpy(dst, staging_buffer->ptr, staging_size, &subctx->out_memcpys);
} else {
for (size_t i = 0; i < height; i++) {
deferred_memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) staging_buffer->ptr + i * width, width, &subctx->out_memcpys);
}
}
return true;
}
static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) {
return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging);
}
void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height) {
VK_LOG_DEBUG("ggml_vk_buffer_read_2d(" << src->buffer << ", " << offset << ", " << width << ", " << height << ")");
// If the device is not an UMA device the memory is host-accessible through rebar. While writing
// through PCIe is sufficient fast reading back data from PCIe is slower than going through
// the HW device to host copy path.
if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
subctx->s->buffer->buf.pipelineBarrier(
vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer,
vk::PipelineStageFlagBits::eHost,
{},
{ { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite,
vk::AccessFlagBits::eHostRead } },
{}, {});
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX),
"vk_buffer_read_2d uma waitForFences");
src->device->device.resetFences({ src->device->fence });
ggml_vk_queue_command_pools_cleanup(src->device);
if (width == spitch && width == dpitch) {
memcpy(dst, (const uint8_t *) src->ptr + offset, width * height);
} else {
for (size_t i = 0; i < height; i++) {
memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width);
}
}
} else {
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
bool ret = ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, spitch, dpitch, width, height, true);
GGML_ASSERT(ret);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read_2d waitForFences");
src->device->device.resetFences({ src->device->fence });
ggml_vk_queue_command_pools_cleanup(src->device);
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
}
}
void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
ggml_vk_buffer_read_2d(src, offset, dst, size, size, size, 1);
}
void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
// Make sure both buffers are on same device
GGML_ASSERT(src->device == dst->device);
VkBufferCopy bc{ src_offset, dst_offset, size };
vkCmdCopyBuffer(ctx->s->buffer->buf, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc);
}
void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
if (src->device == dst->device) {
std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
// Copy within the device
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
src->device->device.resetFences({ src->device->fence });
ggml_vk_queue_command_pools_cleanup(src->device);
} else {
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
// Copy device to device
ggml_vk_ensure_sync_staging_buffer(src->device, size);
// Copy to src staging buffer
ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
// Copy to dst buffer
ggml_vk_buffer_write(dst, dst_offset, src->device->sync_staging->ptr, size);
}
}
void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset_async(" << offset << ", " << c << ", " << size << ")");
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
dst->device->uma) {
deferred_memset((uint8_t*)dst->ptr + offset, c, size, &ctx->memsets);
return;
}
// Fall back to GPU fillBuffer for non-UMA or non-host-visible buffers
ctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
}
void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
if (dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible &&
dst->device->uma) {
memset((uint8_t*)dst->ptr + offset, c, size);
return;
}
std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
subctx->s->buffer->buf.fillBuffer(dst->buffer, offset, size, c);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
dst->device->device.resetFences({ dst->device->fence });
ggml_vk_queue_command_pools_cleanup(dst->device);
}
ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
/* .get_base = */ ggml_backend_vk_buffer_get_base,
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_vk_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_vk_buffer_get_tensor,
/* .set_tensor_2d = */ ggml_backend_vk_buffer_set_tensor_2d,
/* .get_tensor_2d = */ ggml_backend_vk_buffer_get_tensor_2d,
/* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor,
/* .clear = */ ggml_backend_vk_buffer_clear,
/* .reset = */ NULL,
};
vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size) {
if (!device->external_memory_host) {
return {};
}
uintptr_t uptr = reinterpret_cast<uintptr_t>(ptr);
if (uptr & (device->min_imported_host_pointer_alignment - 1)) {
return {};
}
if (size & (device->min_imported_host_pointer_alignment - 1)) {
return {};
}
const vk::MemoryPropertyFlags property_flags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached;
vk_buffer buf {};
try {
buf = ggml_vk_create_buffer(device, size, { property_flags }, ptr);
} catch (vk::SystemError& e) {
GGML_LOG_WARN("ggml_vulkan: Failed ggml_vk_create_buffer (%s)\n", e.what());
}
return buf;
}
+228
View File
@@ -0,0 +1,228 @@
#pragma once
#include "ggml-vulkan-push-constants.h"
extern std::mutex queue_mutex;
extern ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface;
extern bool vk_memory_logger_enabled;
extern bool vk_perf_logger_enabled;
extern bool vk_perf_logger_concurrent;
extern bool vk_enable_sync_logger;
extern uint32_t vk_perf_logger_frequency;
extern std::string vk_pipeline_stats_filter;
extern void * const vk_ptr_base;
extern vk_instance_t vk_instance;
extern ggml_backend_buffer_i ggml_backend_vk_buffer_interface;
uint64_t vk_tensor_offset(const ggml_tensor * tensor);
uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t);
void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx);
void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n);
void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
void ggml_vk_submit(vk_context& ctx, vk::Fence fence);
uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues);
void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only);
vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p);
vk_context ggml_vk_create_temporary_context(vk_command_pool& p);
void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p);
void ggml_vk_queue_command_pools_cleanup(vk_device& device);
vk_buffer ggml_vk_create_buffer_check(vk_device& device, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0));
vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size);
void ggml_vk_destroy_buffer(vk_buffer& buf);
vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0);
void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx);
void ggml_vk_set_event(vk_context& ctx, vk::Event& event);
void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events);
vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_t hsk, uint32_t hsv, uint32_t n_rows, uint32_t n_kv, ggml_type k_type, ggml_type v_type, bool f32acc);
vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, bool use_mask, bool use_mask_opt, bool use_logit_softcap, ggml_type k_type, ggml_type v_type);
uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch);
void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested = nullptr);
vk_device ggml_vk_get_device(size_t idx);
DispatchLoaderDynamic & ggml_vk_default_dispatcher();
void ggml_vk_instance_init();
void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx);
vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type);
void * ggml_vk_host_malloc(vk_device& device, size_t size);
void ggml_vk_host_free(vk_device& device, void* ptr);
void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset);
vk_subbuffer ggml_vk_tensor_subbuffer( const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false);
void ggml_vk_ctx_end(vk_context& ctx);
void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx);
vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx);
bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx);
size_t ggml_vk_align_size(size_t width, size_t align);
void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys = nullptr);
void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets = nullptr);
void ggml_vk_ensure_sync_staging_buffer(vk_device& device, size_t size);
void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size);
bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false);
bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false);
void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height);
void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size);
bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false);
void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height);
void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size);
void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size);
void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size);
void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size);
void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size);
void ggml_vk_matmul( ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t padded_n);
bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor);
vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to);
void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out);
vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type);
void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, const vk_subbuffer & in, const vk_subbuffer & out, uint32_t ne);
bool ggml_vk_can_use_fwht(const ggml_backend_vk_context * ctx, const ggml_tensor * src1, const ggml_tensor * dst);
void ggml_vk_fwht(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src, ggml_tensor * dst);
void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx);
bool ggml_vk_use_mul_mat_vec_id(const struct ggml_cgraph * cgraph, int node_idx);
void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx);
bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type, ggml_type v_type);
bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool f32acc, ggml_type k_type = GGML_TYPE_F16);
void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst);
void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx);
void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst);
void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
void ggml_vk_gated_delta_net(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx);
void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst);
void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_add1(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_arange(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
void ggml_vk_fill(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst);
void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_log(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node);
void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params);
void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst);
void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx);
void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop);
void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_cumsum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_solve_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx);
void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst);
void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool last_node, bool almost_ready, bool submit);
void ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready);
void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx);
void ggml_vk_cleanup(ggml_backend_vk_context * ctx);
int ggml_vk_get_device_count();
void ggml_vk_get_device_description(int device, char * description, size_t description_size);
bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer);
void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer);
void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer);
enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor);
void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size);
void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data);
bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst);
void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value);
const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft);
size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor);
void ggml_backend_vk_free(ggml_backend_t backend);
void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
bool ggml_vk_is_empty(ggml_tensor * node);
bool ggml_vk_can_fuse(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops);
bool ggml_vk_can_fuse_ssm_conv(const ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, int num_extra);
bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx, topk_moe_mode mode);
bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
bool ggml_vk_tensors_overlap(const ggml_tensor * a, const ggml_tensor * b, bool elementwise);
bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx);
void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * graph);
int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op);
vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
ggml_backend_reg_t ggml_backend_vk_reg();
bool ggml_vk_instance_layer_settings_available();
bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);
bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev);
template <typename T>
static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
for (auto& buffer : descriptor_buffer_infos) {
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
}
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
GGML_ASSERT(wg0 <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] &&
wg1 <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
wg2 <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size());
GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants));
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
subctx->s->buffer->buf.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
subctx->s->buffer->buf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
subctx->s->buffer->buf.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
pipeline->layout,
0,
{ descriptor_set },
{});
subctx->s->buffer->buf.dispatch(wg0, wg1, wg2);
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,293 @@
#include "ggml-vulkan-common.h"
void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, const ggml_tensor * sinks, ggml_tensor * dst) {
VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3];
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
if (sinks) {
std::cerr << "), (" << sinks << ", name=" << sinks->name << ", type=" << sinks->type << ", ne0=" << sinks->ne[0] << ", ne1=" << sinks->ne[1] << ", ne2=" << sinks->ne[2] << ", ne3=" << sinks->ne[3] << ", nb0=" << sinks->nb[0] << ", nb1=" << sinks->nb[1] << ", nb2=" << sinks->nb[2] << ", nb3=" << sinks->nb[3];
}
std::cerr << "))");
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
const uint32_t nem0 = mask ? mask->ne[0] : 0;
const uint32_t nem1 = mask ? mask->ne[1] : 0;
const uint32_t nem2 = mask ? mask->ne[2] : 0;
const uint32_t nem3 = mask ? mask->ne[3] : 0;
const uint32_t HSK = nek0;
const uint32_t HSV = nev0;
uint32_t N = neq1;
const uint32_t KV = nek1;
GGML_ASSERT(ne0 == HSV);
GGML_ASSERT(ne2 == N);
// input tensor rows must be contiguous
GGML_ASSERT(nbq0 == ggml_type_size(q->type));
GGML_ASSERT(nbk0 == ggml_type_size(k->type));
GGML_ASSERT(nbv0 == ggml_type_size(v->type));
GGML_ASSERT(neq0 == HSK);
GGML_ASSERT(neq1 == N);
GGML_ASSERT(nev1 == nek1);
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
assert(dst->type == GGML_TYPE_F32);
assert(q->type == GGML_TYPE_F32);
uint32_t gqa_ratio = 1;
uint32_t qk_ratio = neq2 / nek2;
uint32_t workgroups_x = (uint32_t)neq1;
uint32_t workgroups_y = (uint32_t)neq2;
uint32_t workgroups_z = (uint32_t)neq3;
const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32 || k->type == GGML_TYPE_BF16;
// For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
// For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
vk_fa_tuning_params tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, 512, KV, k->type, v->type, f32acc);
const uint32_t max_gqa = std::min(tuning_params.block_rows, 32u);
if (N <= 8 && qk_ratio > 1 && qk_ratio <= max_gqa &&
qk_ratio * nek2 == neq2 && nek2 == nev2 && nem2 <= 1) {
// grouped query attention - make the N dimension equal to gqa_ratio, reduce
// workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
// and change addressing calculations to index Q's dimension 2.
gqa_ratio = qk_ratio;
N = gqa_ratio;
workgroups_y /= gqa_ratio;
}
tuning_params = get_fa_tuning_params(ctx->device, HSK, HSV, N, KV, k->type, v->type, f32acc);
const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));
// For F32, the shader treats it as a block of size 4 (for vec4 loads)
if (k->type == GGML_TYPE_F32) {
k_stride /= 4;
}
if (v->type == GGML_TYPE_F32) {
v_stride /= 4;
}
const uint32_t alignment = tuning_params.block_cols;
bool aligned = (KV % alignment) == 0 &&
// the "aligned" shader variant will forcibly align strides, for performance
(q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;
// Need to use the coopmat2 variant that clamps loads when HSK/HSV aren't sufficiently aligned.
if (((HSK | HSV) % 16) != 0 && tuning_params.path == FA_COOPMAT2) {
aligned = false;
}
float scale = 1.0f;
float max_bias = 0.0f;
float logit_softcap = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float));
if (logit_softcap != 0) {
scale /= logit_softcap;
}
// Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768 && nem0 >= tuning_params.block_cols * 16;
vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc,
mask != nullptr, use_mask_opt, logit_softcap != 0, k->type, v->type);
vk_pipeline pipeline = nullptr;
{
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16;
auto it = pipelines.find(fa_pipeline_state);
if (it != pipelines.end()) {
pipeline = it->second;
} else {
pipelines[fa_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
}
}
assert(pipeline);
// Compile early to initialize wg_denoms.
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
uint32_t split_kv = KV;
uint32_t split_k = 1;
// Intel Alchemist prefers more workgroups
const uint32_t shader_core_count_multiplier = (ctx->device->vendor_id == VK_VENDOR_ID_INTEL && ctx->device->architecture != INTEL_XE2) ? 2 : 1;
// Use a placeholder core count if one isn't available. split_k is a big help for perf.
const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count * shader_core_count_multiplier : 16;
const uint32_t Br = fa_pipeline_state.Br;
const uint32_t Bc = fa_pipeline_state.Bc;
GGML_ASSERT(Br == pipeline->wg_denoms[0]);
const uint32_t Tr = CEIL_DIV(N, Br);
// Try to use split_k when KV is large enough to be worth the overhead.
if (gqa_ratio > 1 && workgroups_x <= Br) {
split_k = shader_core_count * 2 / (workgroups_x * workgroups_y * workgroups_z);
} else if (gqa_ratio <= 1) {
uint32_t total_wgs_no_split = Tr * workgroups_y * workgroups_z;
if (total_wgs_no_split < shader_core_count * 2) {
split_k = shader_core_count * 2 / total_wgs_no_split;
}
}
if (split_k > 1) {
// Try to evenly split KV into split_k chunks, but it needs to be a multiple
// of "align", so recompute split_k based on that.
split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), alignment);
split_k = CEIL_DIV(KV, split_kv);
}
// Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
// and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
// For matrices, the order is (inner to outer) [HSV, ne1, k, ne2, ne3].
// For L/M, the order is (inner to outer) [ne1, k, ne2, ne3].
const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne2 * ne3 : 0;
if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) {
GGML_ABORT("Requested preallocation size is too large");
}
if (ctx->prealloc_size_split_k < split_k_size) {
ctx->prealloc_size_split_k = split_k_size;
ggml_vk_preallocate_buffers(ctx, subctx);
}
const uint32_t mask_opt_num_dwords = CEIL_DIV(nem0, 16 * Bc);
const uint64_t mask_opt_size = sizeof(uint32_t) * mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2 * nem3;
vk_pipeline pipeline_fa_mask_opt = nullptr;
if (use_mask_opt) {
{
std::lock_guard<std::mutex> guard(ctx->device->compile_mutex);
auto &pipelines = ctx->device->pipeline_fa_mask_opt;
auto it = pipelines.find({Br, Bc});
if (it != pipelines.end()) {
pipeline_fa_mask_opt = it->second;
} else {
pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared<vk_pipeline_struct>();
}
}
assert(pipeline_fa_mask_opt);
ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1);
if (ctx->prealloc_size_y < mask_opt_size) {
ctx->prealloc_size_y = mask_opt_size;
ggml_vk_preallocate_buffers(ctx, subctx);
}
if (ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
}
const uint32_t n_head_kv = neq2;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
vk_subbuffer q_buf = ggml_vk_tensor_subbuffer(ctx, q);
vk_subbuffer k_buf = ggml_vk_tensor_subbuffer(ctx, k);
vk_subbuffer v_buf = ggml_vk_tensor_subbuffer(ctx, v);
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst);
vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf;
vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf;
vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf;
uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | n_head_log2;
if (use_mask_opt)
{
const vk_op_flash_attn_mask_opt_push_constants opt_pc = {
nem0,
nem1,
nem2,
(uint32_t)(mask->nb[1] / sizeof(ggml_fp16_t)),
(uint32_t)(mask->nb[2] / sizeof(ggml_fp16_t)),
(uint32_t)(mask->nb[3] / sizeof(ggml_fp16_t)),
mask_opt_num_dwords,
mask_opt_num_dwords * CEIL_DIV(nem1, Br),
mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2,
};
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline_fa_mask_opt,
{ mask_buf, mask_opt_buf }, opt_pc,
{ mask_opt_num_dwords, CEIL_DIV(nem1, Br), nem2 * nem3 });
ggml_vk_sync_buffers(ctx, subctx);
}
const vk_flash_attn_push_constants pc = { N, KV,
(uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
(uint32_t)neq2, (uint32_t)neq3,
(uint32_t)nek2, (uint32_t)nek3,
(uint32_t)nev2, (uint32_t)nev3,
nem1, nem2, nem3,
q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
scale, max_bias, logit_softcap,
mask_n_head_log2, m0, m1,
gqa_ratio, split_kv, split_k };
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
if (ctx->prealloc_split_k_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
// We reuse workgroups_x to mean the number of splits, so we need to
// cancel out the divide by wg_denoms[0].
uint32_t dispatch_x;
if (gqa_ratio > 1) {
workgroups_x *= pipeline->wg_denoms[0];
dispatch_x = split_k * workgroups_x;
} else {
dispatch_x = Tr * split_k * pipeline->wg_denoms[0];
}
vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf, mask_opt_buf},
pc, { dispatch_x, workgroups_y, workgroups_z });
ggml_vk_sync_buffers(ctx, subctx);
const vk_op_flash_attn_split_k_reduce_push_constants pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, split_k, (sinks != nullptr) };
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
{split_k_buf, sinks_buf, dst_buf},
pc2, { (uint32_t)ne1, HSV, (uint32_t)(ne2 * ne3) });
ctx->prealloc_split_k_need_sync = true;
} else {
if (gqa_ratio > 1) {
// When using gqa, we want one actual workgroup per batch, so cancel out wg_denoms
workgroups_x *= pipeline->wg_denoms[0];
}
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf, mask_opt_buf},
pc, { workgroups_x, workgroups_y, workgroups_z });
}
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,512 @@
#include "ggml-vulkan-common.h"
std::mutex queue_mutex;
void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
if (tensor->view_src) {
return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
}
return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
}
uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t)
{
return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));;
}
static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) {
const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset},
VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
return range;
}
void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
// Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
// during this wait.
if (ctx->almost_ready_fence_pending) {
VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
ctx->device->device.resetFences({ ctx->almost_ready_fence });
ctx->almost_ready_fence_pending = false;
}
// Spin (w/pause) waiting for the graph to finish executing.
vk::Result result;
while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
if (result != vk::Result::eNotReady) {
fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
exit(1);
}
for (uint32_t i = 0; i < 100; ++i) {
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
YIELD();
}
}
ctx->device->device.resetFences({ ctx->fence });
}
void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
ctx->pipeline_descriptor_set_requirements += n;
if (!pipeline->compiled) {
ggml_vk_load_shaders(ctx->device, pipeline);
}
ggml_pipeline_allocate_descriptor_sets(ctx);
}
void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
// Enough descriptors are available
return;
}
vk_device& device = ctx->device;
// Grow by 50% to avoid frequent allocations
uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements});
uint32_t to_alloc = needed - ctx->descriptor_sets.size();
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
while (to_alloc > 0) {
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
to_alloc -= alloc_count;
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
if (pool_idx >= ctx->descriptor_pools.size()) {
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
}
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
for (uint32_t i = 0; i < alloc_count; i++) {
layouts[i] = device->dsl;
}
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
pool_idx++;
}
}
static vk_command_buffer* ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
p.pool,
vk::CommandBufferLevel::ePrimary,
1);
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
p.cmd_buffers.push_back({ cmd_buffers.front(), 0, true });
return &p.cmd_buffers[p.cmd_buffers.size()-1];
}
void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
if (fence) {
std::lock_guard<std::mutex> guard(queue_mutex);
ctx->p->q->queue.submit({}, fence);
}
return;
}
VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
std::vector<std::vector<uint64_t>> tl_wait_vals;
std::vector<std::vector<uint64_t>> tl_signal_vals;
std::vector<std::vector<vk::Semaphore>> tl_wait_semaphores;
std::vector<std::vector<vk::Semaphore>> tl_signal_semaphores;
std::vector<vk::TimelineSemaphoreSubmitInfo> tl_submit_infos;
std::vector<vk::SubmitInfo> submit_infos;
int idx = -1;
std::vector<std::vector<vk::PipelineStageFlags>> stage_flags;
size_t reserve = 0;
for (const auto& sequence : ctx->seqs) {
reserve += sequence.size();
}
// Pre-reserve vectors to prevent reallocation, which invalidates pointers
tl_wait_semaphores.reserve(reserve);
tl_wait_vals.reserve(reserve);
tl_signal_semaphores.reserve(reserve);
tl_signal_vals.reserve(reserve);
tl_submit_infos.reserve(reserve);
submit_infos.reserve(reserve);
stage_flags.reserve(reserve);
for (const auto& sequence : ctx->seqs) {
for (const auto& submission : sequence) {
stage_flags.push_back({});
idx++;
tl_wait_vals.push_back({});
tl_wait_semaphores.push_back({});
tl_signal_vals.push_back({});
tl_signal_semaphores.push_back({});
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
stage_flags[idx].push_back(ctx->p->q->stage_flags);
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
}
for (size_t i = 0; i < submission.signal_semaphores.size(); i++) {
tl_signal_vals[idx].push_back(submission.signal_semaphores[i].value);
tl_signal_semaphores[idx].push_back(submission.signal_semaphores[i].s);
}
tl_submit_infos.push_back({
(uint32_t) submission.wait_semaphores.size(),
tl_wait_vals[idx].data(),
(uint32_t) submission.signal_semaphores.size(),
tl_signal_vals[idx].data(),
});
tl_submit_infos[idx].sType = vk::StructureType::eTimelineSemaphoreSubmitInfo;
tl_submit_infos[idx].pNext = nullptr;
vk::SubmitInfo si{
(uint32_t) submission.wait_semaphores.size(),
tl_wait_semaphores[idx].data(),
stage_flags[idx].data(),
1,
&submission.buffer->buf,
(uint32_t) submission.signal_semaphores.size(),
tl_signal_semaphores[idx].data(),
};
si.setPNext(&tl_submit_infos[idx]);
submit_infos.push_back(si);
}
}
std::lock_guard<std::mutex> guard(queue_mutex);
ctx->p->q->queue.submit(submit_infos, fence);
ctx->seqs.clear();
}
uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyProperties>& queue_family_props, const vk::QueueFlags& required, const vk::QueueFlags& avoid, int32_t compute_index, uint32_t min_num_queues) {
VK_LOG_DEBUG("ggml_vk_find_queue_family_index()");
const uint32_t qfsize = queue_family_props.size();
// Try with avoid preferences first
for (uint32_t i = 0; i < qfsize; i++) {
if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required && !(queue_family_props[i].queueFlags & avoid)) {
return i;
}
}
// Fall back to only required
for (size_t i = 0; i < qfsize; i++) {
if (queue_family_props[i].queueCount >= min_num_queues && (compute_index < 0 || i != (uint32_t) compute_index) && queue_family_props[i].queueFlags & required) {
return i;
}
}
// Fall back to reusing compute queue
for (size_t i = 0; i < qfsize; i++) {
if (queue_family_props[i].queueCount >= min_num_queues && queue_family_props[i].queueFlags & required) {
return i;
}
}
// Fall back to ignoring min_num_queries
for (size_t i = 0; i < qfsize; i++) {
if (queue_family_props[i].queueFlags & required) {
return i;
}
}
// All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
// Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
if (compute_index >= 0) {
return compute_index;
}
std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
for(auto &q_family : queue_family_props) {
std::cerr << "Queue number: " + std::to_string(q_family.queueCount) << " flags: " + to_string(q_family.queueFlags) << std::endl;
}
abort();
}
void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_family_index, uint32_t queue_index, vk::PipelineStageFlags&& stage_flags, bool transfer_only) {
VK_LOG_DEBUG("ggml_vk_create_queue()");
std::lock_guard<std::recursive_mutex> guard(device->mutex);
q.queue_family_index = queue_family_index;
q.transfer_only = transfer_only;
q.cmd_pool.init(device, &q);
q.queue = device->device.getQueue(queue_family_index, queue_index);
q.stage_flags = stage_flags;
}
vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
vk_context result = std::make_shared<vk_context_struct>();
VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
ctx->gc.contexts.emplace_back(result);
result->p = &p;
return result;
}
vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
vk_context result = std::make_shared<vk_context_struct>();
VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
result->p = &p;
return result;
}
static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) {
VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
vk::SemaphoreCreateInfo ci{};
ci.setPNext(&tci);
vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
ctx->gc.semaphores.push_back({ semaphore, 0 });
return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
}
static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) {
VK_LOG_DEBUG("ggml_vk_create_timeline_semaphore()");
if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) {
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
vk::SemaphoreCreateInfo ci{};
ci.setPNext(&tci);
vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
}
return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
}
static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
if (ctx->event_idx >= ctx->gc.events.size()) {
ctx->gc.events.push_back(ctx->device->device.createEvent({}));
}
return ctx->gc.events[ctx->event_idx++];
}
void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
// Requires command buffers to be done
device->device.resetCommandPool(p.pool);
// Don't clear the command buffers and mark them as not in use.
// This allows us to reuse them
for (auto& cmd_buffer : p.cmd_buffers) {
cmd_buffer.in_use = false;
}
}
void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
// Arbitrary frequency to cleanup/reuse command buffers
static constexpr uint32_t cleanup_frequency = 10;
if (device->compute_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
}
if (device->transfer_queue.cmd_pool.buffers_in_use() >= cleanup_frequency) {
ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
}
}
vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset) {
return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) };
}
void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
const bool transfer_queue = subctx->p->q->transfer_only;
if (ctx) {
ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
}
subctx->s->buffer->buf.pipelineBarrier(
subctx->p->q->stage_flags,
subctx->p->q->stage_flags,
{},
{ {
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }
} },
{},
{}
);
}
static void ggml_vk_reset_event(vk_context& ctx, vk::Event& event) {
VK_LOG_DEBUG("ggml_vk_set_event()");
ctx->s->buffer->buf.resetEvent(
event,
ctx->p->q->stage_flags
);
}
void ggml_vk_set_event(vk_context& ctx, vk::Event& event) {
VK_LOG_DEBUG("ggml_vk_set_event()");
ctx->s->buffer->buf.setEvent(
event,
ctx->p->q->stage_flags
);
}
void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events) {
VK_LOG_DEBUG("ggml_vk_wait_events()");
if (events.empty()) {
return;
}
ctx->s->buffer->buf.waitEvents(
events,
ctx->p->q->stage_flags,
ctx->p->q->stage_flags,
{},
{},
{}
);
}
vk_subbuffer ggml_vk_tensor_subbuffer(
const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign) {
vk_buffer buffer = nullptr;
size_t offset = 0;
if (ctx->device->uma) {
ggml_vk_host_get(ctx->device, tensor->data, buffer, offset);
}
if (!buffer) {
auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
buffer = buf_ctx->dev_buffer;
offset = vk_tensor_offset(tensor) + tensor->view_offs;
}
GGML_ASSERT(buffer != nullptr);
size_t size = ggml_nbytes(tensor);
size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
// The shader must support misaligned offsets when indexing into the buffer
GGML_ASSERT(allow_misalign || misalign_bytes == 0);
offset &= ~misalign_bytes;
size += misalign_bytes;
return vk_subbuffer{buffer, offset, size};
}
static vk_command_buffer* ggml_vk_get_or_create_cmd_buffer(vk_device& device, vk_command_pool& pool) {
for (auto& cmd_buffer : pool.cmd_buffers) {
if (!cmd_buffer.in_use) {
cmd_buffer.use_counter++;
cmd_buffer.in_use = true;
return &cmd_buffer;
}
}
return ggml_vk_create_cmd_buffer(device, pool);
}
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
vk_submission s;
s.buffer = ggml_vk_get_or_create_cmd_buffer(device, p);
if (one_time) {
s.buffer->buf.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
} else {
s.buffer->buf.begin({ vk::CommandBufferUsageFlags{} });
}
return s;
}
void ggml_vk_ctx_end(vk_context& ctx) {
VK_LOG_DEBUG("ggml_vk_ctx_end(" << ctx << ", " << ctx->seqs.size() << ")");
if (ctx->s == nullptr) {
return;
}
ctx->s->buffer->buf.end();
ctx->s = nullptr;
}
void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
if (subctx->s != nullptr) {
ggml_vk_ctx_end(subctx);
}
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
}
vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
vk_context result;
if (!ctx->compute_ctx.expired()) {
result = ctx->compute_ctx.lock();
} else {
result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = result;
ggml_vk_ctx_begin(ctx->device, result);
}
if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
result->s->wait_semaphores.push_back(ctx->transfer_semaphore);
ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value;
}
return result;
}
bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) {
if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) {
return false;
}
vk_context cpy_ctx = ctx->transfer_ctx.lock();
ggml_vk_ctx_end(cpy_ctx);
for (auto& cpy : cpy_ctx->in_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
}
ctx->transfer_semaphore.value++;
cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore);
ggml_vk_submit(cpy_ctx, {});
ctx->transfer_ctx.reset();
return true;
}
size_t ggml_vk_align_size(size_t width, size_t align) {
VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")");
return CEIL_DIV(width, align) * align;
}
void deferred_memcpy(void * dst, const void * src, size_t size, std::vector<vk_staging_memcpy>* memcpys) {
if (memcpys == nullptr) {
memcpy(dst, src, size);
} else {
memcpys->emplace_back(dst, src, size);
}
}
void deferred_memset(void * dst, uint32_t val, size_t size, std::vector<vk_staging_memset>* memsets) {
if (memsets == nullptr) {
memset(dst, val, size);
} else {
memsets->emplace_back(dst, val, size);
}
}
@@ -0,0 +1,872 @@
#pragma once
#include "ggml-vulkan-types.h"
uint32_t get_misalign_bytes(const ggml_backend_vk_context * ctx, const ggml_tensor * t);
struct vk_mat_mat_push_constants {
uint32_t M; uint32_t N; uint32_t K;
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
uint32_t base_work_group_z; uint32_t num_batches;
uint32_t k_split;
uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3;
uint32_t padded_N;
};
struct vk_mat_vec_push_constants {
uint32_t ncols;
uint32_t stride_a;
uint32_t stride_b;
uint32_t stride_d;
uint32_t batch_stride_a;
uint32_t batch_stride_b;
uint32_t batch_stride_d;
uint32_t fusion_flags;
uint32_t base_work_group_y;
uint32_t ne02;
uint32_t ne12;
uint32_t broadcast2;
uint32_t broadcast3;
};
struct vk_mat_vec_p021_push_constants {
uint32_t ncols_x;
uint32_t nrows_x;
uint32_t nchannels_x;
uint32_t nchannels_y;
uint32_t b_offset;
uint32_t d_offset;
uint32_t fusion_flags;
};
struct vk_mat_vec_nc_push_constants {
uint32_t ncols_x;
uint32_t nrows_x;
uint32_t row_stride_x;
uint32_t channel_stride_x;
uint32_t channel_stride_y;
uint32_t channel_x_divisor;
uint32_t ne12;
uint32_t b_offset;
uint32_t d_offset;
uint32_t nb03;
uint32_t nb13;
uint32_t nb23;
uint32_t fusion_flags;
};
struct vk_mat_mat_id_push_constants {
uint32_t M; uint32_t N; uint32_t K;
uint32_t stride_a; uint32_t stride_b; uint32_t stride_d;
uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d;
uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11;
uint32_t padded_N;
};
struct vk_mat_vec_id_push_constants {
uint32_t ncols;
uint32_t stride_a;
uint32_t stride_b;
uint32_t stride_d;
uint32_t batch_stride_a;
uint32_t batch_stride_b;
uint32_t batch_stride_d;
uint32_t fusion_flags;
uint32_t nei0;
uint32_t ne11;
uint32_t expert_i1;
uint32_t nbi1;
};
struct vk_flash_attn_push_constants {
uint32_t N;
uint32_t KV;
uint32_t ne1;
uint32_t ne2;
uint32_t ne3;
uint32_t neq2;
uint32_t neq3;
uint32_t nek2;
uint32_t nek3;
uint32_t nev2;
uint32_t nev3;
uint32_t nem1;
uint32_t nem2;
uint32_t nem3;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb21;
uint32_t nb22;
uint32_t nb23;
float scale;
float max_bias;
float logit_softcap;
uint32_t mask_n_head_log2;
float m0;
float m1;
uint32_t gqa_ratio;
uint32_t split_kv;
uint32_t k_num;
};
static_assert(sizeof(vk_flash_attn_push_constants) <= 128, "sizeof(vk_flash_attn_push_constants) must be <= 128");
struct vk_op_push_constants {
uint32_t KX;
uint32_t KY;
float param1;
float param2;
float param3;
float param4;
};
struct vk_op_fwht_push_constants {
uint32_t n_rows;
uint32_t src_offset;
uint32_t dst_offset;
float scale;
};
struct vk_op_count_experts_push_constants {
uint32_t ne00;
uint32_t ne01;
uint32_t nb00;
uint32_t nb01;
uint32_t a_offset;
};
struct vk_op_glu_push_constants {
uint32_t N;
uint32_t ne00;
uint32_t ne20;
uint32_t mode; // 0: default, 1: swapped, 2: split
float alpha; // for swiglu_oai
float limit;
uint32_t nb00;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb10;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb20;
uint32_t nb21;
uint32_t nb22;
uint32_t nb23;
uint32_t ne21;
uint32_t ne22;
uint32_t misalign_offsets;
uint32_t ne2_012mp; uint32_t ne2_012L;
uint32_t ne2_01mp; uint32_t ne2_01L;
uint32_t ne2_0mp; uint32_t ne2_0L;
};
static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128");
struct vk_op_unary_push_constants {
uint32_t ne;
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t misalign_offsets;
float param1; float param2; float param3; float param4;
uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls;
uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls;
};
static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) {
GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst)));
ne = ne != 0 ? ne : ggml_nelements(dst);
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
vk_op_unary_push_constants p{};
p.ne = (uint32_t)ne;
size_t src0_tsize = ggml_type_size(src0->type);
p.ne00 = (uint32_t)src0->ne[0];
p.ne01 = (uint32_t)src0->ne[1];
p.ne02 = (uint32_t)src0->ne[2];
p.ne03 = (uint32_t)src0->ne[3];
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
size_t dst_tsize = ggml_type_size(dst->type);
p.ne10 = (uint32_t)dst->ne[0];
p.ne11 = (uint32_t)dst->ne[1];
p.ne12 = (uint32_t)dst->ne[2];
p.ne13 = (uint32_t)dst->ne[3];
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
return p; // offsets are initialized later in ggml_vk_op
}
struct vk_op_pad_push_constants {
uint32_t ne;
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t misalign_offsets;
uint32_t circular;
uint32_t lp0; uint32_t rp0;
uint32_t lp1; uint32_t rp1;
uint32_t lp2; uint32_t rp2;
uint32_t lp3; uint32_t rp3;
};
static vk_op_pad_push_constants vk_op_pad_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst) {
int64_t ne = ggml_nelements(dst);
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
vk_op_pad_push_constants p{};
p.ne = (uint32_t)ne;
size_t src0_tsize = ggml_type_size(src0->type);
p.ne00 = (uint32_t)src0->ne[0];
p.ne01 = (uint32_t)src0->ne[1];
p.ne02 = (uint32_t)src0->ne[2];
p.ne03 = (uint32_t)src0->ne[3];
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
size_t dst_tsize = ggml_type_size(dst->type);
p.ne10 = (uint32_t)dst->ne[0];
p.ne11 = (uint32_t)dst->ne[1];
p.ne12 = (uint32_t)dst->ne[2];
p.ne13 = (uint32_t)dst->ne[3];
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
p.lp0 = dst->op_params[0];
p.rp0 = dst->op_params[1];
p.lp1 = dst->op_params[2];
p.rp1 = dst->op_params[3];
p.lp2 = dst->op_params[4];
p.rp2 = dst->op_params[5];
p.lp3 = dst->op_params[6];
p.rp3 = dst->op_params[7];
p.circular = dst->op_params[8];
return p; // fastdiv values and offsets are initialized later in ggml_vk_op
}
static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L)
{
// compute L = ceil(log2(d));
L = 0;
while (L < 32 && (uint32_t{1} << L) < d) {
L++;
}
mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1);
}
static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) {
return L0 | (L1 << 8) | (L2 << 16);
}
template <typename T> void init_pushconst_fastdiv(T &p) {
GGML_UNUSED(p);
static_assert(!std::is_const<T>::value, "unexpected type");
}
template <> inline void init_pushconst_fastdiv(vk_op_unary_push_constants &p) {
// Compute magic values to divide by these six numbers.
uint32_t ne0_012L;
uint32_t ne0_01L;
uint32_t ne0_0L;
uint32_t ne1_012L;
uint32_t ne1_01L;
uint32_t ne1_0L;
init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L);
init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L);
init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L);
init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L);
init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L);
init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L);
p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L);
p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L);
}
template <> inline void init_pushconst_fastdiv(vk_op_glu_push_constants &p) {
// GLU linearizes over dst, then uses dst coordinates for src0/src1.
init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L);
init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L);
init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L);
}
struct vk_op_binary_push_constants {
uint32_t ne;
uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13;
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23;
uint32_t misalign_offsets;
float param1; float param2; int32_t param3;
};
struct vk_op_multi_add_push_constants {
// shape for dst
uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23;
// strides for srcs+dst
uint32_t nb[MAX_PARAMETER_COUNT][4];
uint32_t rms_partials;
};
static_assert(MAX_PARAMETER_COUNT == 12);
static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);
struct vk_op_topk_moe_push_constants {
uint32_t n_rows;
uint32_t n_experts_push;
uint32_t n_expert_used;
float clamp_min;
float clamp_max;
uint32_t gating_func;
uint32_t has_bias;
uint32_t with_norm;
float output_scale;
float output_bias;
};
struct vk_op_add_id_push_constants {
uint32_t ne0;
uint32_t ne1;
uint32_t s01;
uint32_t s02;
uint32_t s11;
uint32_t s21;
};
struct vk_op_diag_mask_push_constants {
uint32_t ncols;
uint32_t rows_per_channel;
int32_t n_past;
};
struct vk_op_rope_push_constants {
uint32_t rope_mode;
uint32_t nrows;
uint32_t n_dims;
float freq_scale;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float theta_scale;
uint32_t has_ff;
int32_t sections[4];
uint32_t is_imrope;
uint32_t is_back;
uint32_t set_rows_stride;
uint32_t ne00;
uint32_t ne01;
uint32_t ne02;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t a_offset;
uint32_t d_offset;
};
static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128");
struct vk_op_rms_norm_mul_rope_push_constants {
vk_op_binary_push_constants bin;
vk_op_rope_push_constants rope;
};
struct vk_op_soft_max_push_constants {
uint32_t KX;
uint32_t KY;
uint32_t ne00;
uint32_t ne01;
uint32_t ne02;
uint32_t ne12;
uint32_t ne13;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
float scale;
float max_bias;
float m0;
float m1;
uint32_t n_head_log2;
uint32_t nrows_x;
uint32_t has_sinks;
};
struct vk_op_argsort_push_constants {
uint32_t ncols;
uint32_t ncols_padded;
uint32_t ncols_padded_log2;
uint32_t nrows;
uint32_t order;
uint32_t outer_start;
uint32_t outer_end;
uint32_t inner_start;
uint32_t inner_end;
};
struct vk_op_topk_push_constants {
uint32_t orig_ncols;
uint32_t ncols_input;
uint32_t ncols_output;
uint32_t k;
uint32_t nrows;
uint32_t first_pass;
uint32_t last_pass;
};
struct vk_op_im2col_push_constants {
uint64_t dst_addr;
uint32_t batch_offset; uint32_t offset_delta;
uint32_t IC;
uint32_t IW; uint32_t IH;
uint32_t OW; uint32_t OH;
uint32_t KW; uint32_t KH;
uint32_t OH_batch;
uint32_t CHW;
int32_t s0; int32_t s1;
int32_t p0; int32_t p1;
int32_t d0; int32_t d1;
uint32_t batch_IC;
};
struct vk_op_im2col_3d_push_constants {
uint64_t dst_addr;
uint32_t nb10;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t s0;
uint32_t s1;
uint32_t s2;
uint32_t p0;
uint32_t p1;
uint32_t p2;
uint32_t d0;
uint32_t d1;
uint32_t d2;
uint32_t IW;
uint32_t IH;
uint32_t ID;
uint32_t IC;
uint32_t KW;
uint32_t OH;
uint32_t KD_KH_KW;
uint32_t KH_KW;
uint32_t IC_KD_KH_KW;
uint32_t N_OD_OH;
uint32_t OD_OH;
uint32_t OD_OH_OW_IC_KD_KH_KW;
uint32_t OH_OW_IC_KD_KH_KW;
uint32_t OW_IC_KD_KH_KW;
uint32_t misalign_offsets;
};
struct vk_op_timestep_embedding_push_constants {
uint32_t nb1;
uint32_t dim;
uint32_t max_period;
};
struct vk_op_col2im_1d_push_constants {
uint32_t T_out;
uint32_t OC;
uint32_t K_OC;
uint32_t T_in;
uint32_t K;
int32_t stride;
int32_t p0;
};
struct vk_op_conv_transpose_1d_push_constants {
uint32_t Cout;
uint32_t Cin;
uint32_t K;
uint32_t L;
uint32_t KL;
uint32_t nb01;
uint32_t nb02;
uint32_t nb11;
uint32_t nb1;
int32_t s0;
};
struct vk_op_snake_push_constants {
uint32_t ne0;
uint32_t ne1;
};
struct vk_op_pool2d_push_constants {
uint32_t IW; uint32_t IH;
uint32_t OW; uint32_t OH;
uint32_t OC;
uint32_t pelements;
uint32_t op;
int32_t k0; int32_t k1;
int32_t s0; int32_t s1;
int32_t p0; int32_t p1;
};
struct vk_op_rwkv_wkv6_push_constants {
uint32_t B;
uint32_t T;
uint32_t C;
uint32_t H;
};
struct vk_op_rwkv_wkv7_push_constants {
uint32_t B;
uint32_t T;
uint32_t C;
uint32_t H;
};
struct vk_op_gated_delta_net_push_constants {
uint32_t H;
uint32_t n_tokens;
uint32_t n_seqs;
uint32_t s_off;
uint32_t sq1, sq2, sq3;
uint32_t sv1, sv2, sv3;
uint32_t sb1, sb2, sb3;
uint32_t neq1, rq3;
float scale;
uint32_t K;
};
struct vk_op_ssm_scan_push_constants {
uint32_t nb02, nb03, nb12, nb13;
uint32_t nb21, nb22, nb31;
uint32_t nb42, nb43, nb52, nb53;
uint32_t s_off;
uint32_t n_head, d_head, n_group, n_tok;
};
struct vk_op_ssm_conv_push_constants {
uint32_t nb01, nb02;
uint32_t nb11;
uint32_t dst_nb0, dst_nb1, dst_nb2;
uint32_t nc, ncs, nr, n_t, n_s;
};
struct vk_op_conv2d_push_constants {
uint32_t Cout;
uint32_t Cin;
uint32_t N;
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
// init_fastdiv_values constants for dividing by OW, OW*OH
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
};
template <> inline void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
// Compute magic values to divide by OW, OW*OH
init_fastdiv_values(p.OW, p.OWmp, p.OWL);
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
}
struct vk_op_conv2d_dw_push_constants {
uint32_t ne;
uint32_t batches;
uint32_t channels;
uint32_t dst_w;
uint32_t dst_h;
uint32_t src_w;
uint32_t src_h;
uint32_t knl_w;
uint32_t knl_h;
int32_t stride_x;
int32_t stride_y;
int32_t pad_x;
int32_t pad_y;
int32_t dilation_x;
int32_t dilation_y;
};
struct vk_op_upscale_push_constants {
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
uint32_t ne00; uint32_t ne01;
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
float sf0; float sf1; float sf2; float sf3;
float pixel_offset;
};
struct vk_op_sum_rows_push_constants
{
uint32_t n_cols;
uint32_t ne01, ne02;
uint32_t nb01, nb02, nb03;
uint32_t nb11, nb12, nb13;
float weight;
uint32_t misalign_offsets;
uint32_t ne0_12mp, ne0_12L;
uint32_t ne0_1mp, ne0_1L;
};
static vk_op_sum_rows_push_constants vk_op_sum_rows_push_constants_init(const ggml_tensor * src, const ggml_tensor * dst, int64_t n_cols) {
uint32_t type_size = (uint32_t)ggml_type_size(src->type);
vk_op_sum_rows_push_constants p = {};
p.n_cols = (uint32_t)n_cols;
p.ne01 = (uint32_t)src->ne[1];
p.ne02 = (uint32_t)src->ne[2];
p.nb01 = (uint32_t)src->nb[1] / type_size;
p.nb02 = (uint32_t)src->nb[2] / type_size;
p.nb03 = (uint32_t)src->nb[3] / type_size;
p.nb11 = (uint32_t)dst->nb[1] / type_size;
p.nb12 = (uint32_t)dst->nb[2] / type_size;
p.nb13 = (uint32_t)dst->nb[3] / type_size;
p.weight = 1.0f;
return p;
}
template <> inline void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) {
init_fastdiv_values(p.ne01*p.ne02, p.ne0_12mp, p.ne0_12L);
init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L);
}
struct vk_quantize_q8_1_push_constants {
uint32_t ne;
uint32_t num_blocks;
};
struct vk_op_flash_attn_split_k_reduce_push_constants {
uint32_t D;
uint32_t ne1;
uint32_t ne2;
uint32_t ne3;
uint32_t k_num;
uint32_t sinks;
};
struct vk_op_flash_attn_mask_opt_push_constants {
uint32_t nem0;
uint32_t nem1;
uint32_t nem2;
uint32_t nbm1;
uint32_t nbm2;
uint32_t nbm3;
uint32_t nbd1;
uint32_t nbd2;
uint32_t nbd3;
};
template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
GGML_UNUSED(p);
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
GGML_UNUSED(dst);
static_assert(!std::is_const<T>::value, "unexpected type");
GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
GGML_ASSERT(!src3 || get_misalign_bytes(ctx, src3) == 0);
GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_p021_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.b_offset = b_offset;
p.d_offset = d_offset;
GGML_UNUSED(src0);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_mat_vec_nc_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.b_offset = b_offset;
p.d_offset = d_offset;
GGML_UNUSED(src0);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_fwht_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
p.src_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
p.dst_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <typename T> size_t push_constant_size(const T &t) {
static_assert(std::is_class<T>::value, "T must be a struct/class");
GGML_UNUSED(t);
return sizeof(T);
}
template <typename T> size_t push_constant_size(const std::vector<T> &t) {
GGML_UNUSED(t);
return sizeof(T) * t.size();
}
template <typename T, uint32_t N> size_t push_constant_size(const std::array<T, N> &t) {
GGML_UNUSED(t);
return sizeof(T) * N;
}
template <typename T> const T *push_constant_data(const T &t) {
static_assert(std::is_class<T>::value, "T must be a struct/class");
return &t;
}
template <typename T> const T *push_constant_data(const std::vector<T> &t) {
return t.data();
}
template <typename T, uint32_t N> const T *push_constant_data(const std::array<T, N> &t) {
return t.data();
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.misalign_offsets = (a_offset << 16) | d_offset;
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset;
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
GGML_ASSERT(a_offset < (1u << 8));
GGML_ASSERT(b_offset < (1u << 8));
GGML_ASSERT(d_offset < (1u << 8));
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.misalign_offsets = (a_offset << 16) | d_offset;
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.misalign_offsets = (a_offset << 16) | d_offset;
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.misalign_offsets = (a_offset << 16) | d_offset;
GGML_UNUSED(src0);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
GGML_ASSERT(dst->op != GGML_OP_GET_ROWS || (a_offset == 0 && b_offset == 0 && d_offset == 0));
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
p.a_offset = a_offset;
p.d_offset = d_offset;
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
template <> inline void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_rope_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
p.a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
p.d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
GGML_UNUSED(src1);
GGML_UNUSED(src2);
GGML_UNUSED(src3);
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,17 @@
#version 450
#include "types.glsl"
#include "generic_unary_head.glsl"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
}
@@ -1,431 +0,0 @@
#version 450
#extension GL_EXT_control_flow_attributes : enable
#ifdef COOPMAT2
#extension GL_NV_cooperative_matrix2 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_KHR_memory_scope_semantics : enable
#endif
#ifdef COOPMAT
#extension GL_KHR_cooperative_matrix : enable
#extension GL_KHR_shader_subgroup_basic : enable
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_KHR_memory_scope_semantics : enable
#endif
#include "types.glsl"
// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j
layout(binding = 0) readonly buffer A {
A_TYPE knl_data[];
}; // src0 - kernel: [KW, KH, KD, IC*OC]
layout(binding = 1) readonly buffer B {
B_TYPE src_data[];
}; // src1 - input: [IW, IH, ID, IC*N] -- channel_first format
layout(binding = 2) writeonly buffer D {
D_TYPE dst_data[];
}; // dst - result: [OW, OH, OD, OC*N]
layout(push_constant) uniform parameter {
// I/O channels, batch size
uint32_t OC;
uint32_t IC;
uint32_t N;
// Tensor spatial sizes: input, output
uint32_t IW;
uint32_t IH;
uint32_t ID;
uint32_t OW;
uint32_t OH;
uint32_t OD;
// Strides in elements
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
// fastdiv helper values
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
uint32_t OWOHODmp; uint32_t OWOHODL;
}
p;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Blocktile sizes
layout(constant_id = 1) const uint BS_K = 128;
layout(constant_id = 2) const uint BS_CRS = 16;
layout(constant_id = 3) const uint BS_NPQ = 128;
// Thread-tile sizes
layout(constant_id = 4) const uint TS_K = 8;
layout(constant_id = 5) const uint SHMEM_PAD = 4;
// Stride, padding, dilation
layout(constant_id = 6) const uint s0 = 1;
layout(constant_id = 7) const uint s1 = 1;
layout(constant_id = 8) const uint s2 = 1;
layout(constant_id = 9) const uint p0 = 0;
layout(constant_id = 10) const uint p1 = 0;
layout(constant_id = 11) const uint p2 = 0;
layout(constant_id = 12) const uint d0 = 1;
layout(constant_id = 13) const uint d1 = 1;
layout(constant_id = 14) const uint d2 = 1;
// Kernel spatial sizes
layout(constant_id = 15) const uint KW = 1;
layout(constant_id = 16) const uint KH = 1;
layout(constant_id = 17) const uint KD = 1;
// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned)
layout(constant_id = 18) const uint aligned = 0;
// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this.
layout(constant_id = 19) const uint csh_store = 0;
#ifdef COOPMAT
// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of
// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN ==
// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size.
layout(constant_id = 20) const uint WM = 32;
layout(constant_id = 21) const uint WN = 32;
const uint TM = 16;
const uint TN = 16;
const uint TK = 16;
const uint cms_per_row = WM / TM;
const uint cms_per_col = WN / TN;
const uint warps_M = BS_K / WM;
const uint warps_N = BS_NPQ / WN;
#endif
// without padding, ID_idx/IH_idx/IW_idx are in bounds by construction
const bool dhw_in_bounds = (p0 == 0) && (p1 == 0) && (p2 == 0);
uint32_t tid = gl_LocalInvocationID.x;
const uint32_t WG_SIZE = gl_WorkGroupSize.x;
uint splitWork(uint work_size, uint block_size) {
return (block_size + work_size - 1) / block_size;
}
uint32_t K = p.OC;
uint32_t CRS = p.IC * KD * KH * KW;
uint32_t NPQ = p.N * p.OD * p.OH * p.OW;
// Number of blocktiles per input
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
#if defined(COOPMAT2) || defined(COOPMAT)
#define SHMEM_TYPE float16_t
#else
#define SHMEM_TYPE float
#endif
const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
const uint32_t Ash_len = BS_K * Ash_stride;
const uint32_t Bsh_len = BS_CRS * Bsh_stride;
shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
#if defined(COOPMAT2) || defined(COOPMAT)
// stage matC through shmem so global stores are row-major (NPQ-contiguous)
const uint32_t Csh_stride = BS_NPQ;
#ifdef COOPMAT
const uint32_t Csh_len = BS_K * Csh_stride;
#else
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
#endif
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
#endif
// Threadtile sizes
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
// Number of threadtiles per blocktile
const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
/*
Compute
KxCRS @ CRSxNPQ = K x NPQ
K=OC
C=IC
D,R,S=KD,KH,KW
Z,P,Q=OD,OH,OW
*/
uint32_t B_idx_K = gl_WorkGroupID.x;
uint32_t B_idx_NPQ = gl_WorkGroupID.y + gl_WorkGroupID.z * 512;
uint32_t T_y = tid / NT_NPQ;
uint32_t T_x = tid % NT_NPQ;
uint32_t Ar = tid / BS_CRS;
uint32_t Ac = tid % BS_CRS;
const uint32_t ArpWg = WG_SIZE / BS_CRS;
uint32_t Br = tid / BS_NPQ;
uint32_t Bc = tid % BS_NPQ;
const uint32_t BrpWg = WG_SIZE / BS_NPQ;
// see init_fastdiv_values in ggml-vulkan.cpp
uint fastdiv(uint n, uint mp, uint L) {
uint msbs, lsbs;
// msbs = mulhi(n, mp)
umulExtended(n, mp, msbs, lsbs);
return (msbs + n) >> L;
}
void split_crs(uint32_t crs_idx, out uint32_t ic, out uint32_t kd, out uint32_t kh, out uint32_t kw) {
const uint32_t KHKW = KH * KW;
const uint32_t KDKHKW = KD * KHKW;
ic = crs_idx / KDKHKW;
uint32_t rem = crs_idx - ic * KDKHKW;
kd = rem / KHKW;
rem = rem - kd * KHKW;
kh = rem / KW;
kw = rem - kh * KW;
}
void split_npq(uint32_t npq_idx, out uint32_t n, out uint32_t od, out uint32_t oh, out uint32_t ow) {
const uint32_t OWOH = p.OW * p.OH;
n = fastdiv(npq_idx, p.OWOHODmp, p.OWOHODL);
uint32_t rem = npq_idx - n * p.OD * OWOH;
od = fastdiv(rem, p.OWOHmp, p.OWOHL);
rem = rem - od * OWOH;
oh = fastdiv(rem, p.OWmp, p.OWL);
ow = rem - oh * p.OW;
}
#ifdef COOPMAT2
#define ACC_TYPE float16_t
ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem)
{
uint32_t K_idx = B_idx_K * BS_K + r;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c;
uint32_t N_idx;
uint32_t OD_idx;
uint32_t OH_idx;
uint32_t OW_idx;
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3;
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
dst_data[dst_idx] = D_TYPE(elem);
}
return elem;
}
#endif
void main() {
if (B_idx_NPQ * BS_NPQ >= NPQ) {
return;
}
#ifdef COOPMAT2
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
#elif defined(COOPMAT)
coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
[[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
sums[i] = coopmat<float16_t, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0);
}
const uint warp_r = gl_SubgroupID / warps_N;
const uint warp_c = gl_SubgroupID % warps_N;
#else
float regC[TS_K][TS_NPQ];
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
regC[T_ly][T_lx] = 0.0;
}
}
#endif
/* Advance block in CRS dim */
[[dont_unroll]] for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
uint32_t CRS_idx_a = B_idx_CRS * BS_CRS + Ac;
uint32_t IC_idx_a;
uint32_t KD_idx_a;
uint32_t KH_idx_a;
uint32_t KW_idx_a;
split_crs(CRS_idx_a, IC_idx_a, KD_idx_a, KH_idx_a, KW_idx_a);
/* Load kernel to A_block: (BS_K x BS_CRS)*/
UNROLL for (uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg) {
uint32_t B_ly = r_offset + Ar;
uint32_t B_lx = Ac;
uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/
uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + KD_idx_a * p.nb02 + (K_idx * p.IC + IC_idx_a) * p.nb03;
if (aligned == 0) {
knl_idx = min(knl_idx, K * CRS - 1);
}
float val = knl_data[knl_idx];
if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) {
val = 0.0;
}
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
}
/* Load input to B_block: (BS_CRS x BS_NPQ) */
UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
uint32_t B_ly = r_offset + Br; /* Row index of B block */
uint32_t B_lx = Bc;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */
uint32_t N_idx;
uint32_t OD_idx;
uint32_t OH_idx;
uint32_t OW_idx;
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
uint32_t CRS_idx_b = B_idx_CRS * BS_CRS + B_ly;
uint32_t IC_idx_b;
uint32_t KD_idx_b;
uint32_t KH_idx_b;
uint32_t KW_idx_b;
split_crs(CRS_idx_b, IC_idx_b, KD_idx_b, KH_idx_b, KW_idx_b);
uint32_t ID_idx = OD_idx * s2 + KD_idx_b * d2 - p2;
uint32_t IH_idx = OH_idx * s1 + KH_idx_b * d1 - p1;
uint32_t IW_idx = OW_idx * s0 + KW_idx_b * d0 - p0;
uint32_t src_idx = IW_idx + IH_idx * p.nb11 + ID_idx * p.nb12 + (N_idx * p.IC + IC_idx_b) * p.nb13;
// skip clamp when address can't go OOB
if (aligned == 0 || !dhw_in_bounds) {
src_idx = min(src_idx, p.IC * p.N * p.IW * p.IH * p.ID - 1);
}
float val = src_data[src_idx];
bool oob = false;
if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) {
oob = true;
}
// also catches lower-bound underflow (idx wraps to 0x80000000+)
if (!dhw_in_bounds && (ID_idx >= p.ID || IH_idx >= p.IH || IW_idx >= p.IW)) {
oob = true;
}
if (oob) {
val = 0.0;
}
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
}
barrier();
#ifdef COOPMAT2
coopmat<float16_t, gl_ScopeWorkgroup, BS_K, BS_CRS, gl_MatrixUseA> matA;
coopmat<float16_t, gl_ScopeWorkgroup, BS_CRS, BS_NPQ, gl_MatrixUseB> matB;
coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
matC = coopMatMulAdd(matA, matB, matC);
#elif defined(COOPMAT)
// each subgroup multiplies its grid of fragments per TK-sized CRS chunk
[[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) {
coopmat<float16_t, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a[cms_per_row];
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK;
coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
}
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
coopmat<float16_t, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN;
coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]);
}
}
}
#else
if (T_y * TS_K < K) {
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
float regA[TS_K];
float regB[TS_NPQ];
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
}
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx];
}
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]);
}
}
}
}
#endif
barrier();
}
/* Save C* */
#if defined(COOPMAT2) || defined(COOPMAT)
// stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores
#ifdef COOPMAT
const bool use_staged_store = true;
#else
const bool use_staged_store = (csh_store != 0);
#endif
if (use_staged_store) {
#ifdef COOPMAT
// cm1: each subgroup stores its fragment grid into its Csh slot
[[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN;
coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
}
}
#else
coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor);
#endif
barrier();
// cooperative shmem->global: WG threads spread across BS_NPQ (the
// contiguous direction of dst), each iter covers store_rows_per_iter K-rows
const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ;
const uint32_t store_iters = BS_K / store_rows_per_iter;
const uint32_t k_thread_offset = tid / BS_NPQ;
const uint32_t npq_thread = tid % BS_NPQ;
[[unroll]] for (uint32_t i = 0; i < store_iters; i++) {
uint32_t k_local = i * store_rows_per_iter + k_thread_offset;
uint32_t K_idx = B_idx_K * BS_K + k_local;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread;
uint32_t N_idx;
uint32_t OD_idx;
uint32_t OH_idx;
uint32_t OW_idx;
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3;
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]);
}
}
}
#ifdef COOPMAT2
else {
coopMatPerElementNV(matC, matC, perElemOpStore);
}
#endif
#else
if (T_y * TS_K < K) {
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx;
uint32_t N_idx;
uint32_t OD_idx;
uint32_t OH_idx;
uint32_t OW_idx;
split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx);
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3;
if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) {
dst_data[dst_idx] = D_TYPE(regC[T_ly][T_lx]);
}
}
}
}
#endif
}
@@ -0,0 +1,17 @@
#version 450
#include "types.glsl"
#include "generic_unary_head.glsl"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val));
}
@@ -463,7 +463,6 @@ void main() {
}
rowmaxf = max(rowmaxf, float(Sf[r][c]));
}
rowmaxf += FATTN_KQ_MAX_OFFSET;
float Moldf = Mf[r];
// M = max(rowmax, Mold)
@@ -352,7 +352,6 @@ void main() {
}
rowmaxf = max(rowmaxf, float(sfsh[r_vec + (c * cols_per_iter + col_tid) * sfshstride][r_comp]));
}
rowmaxf += FATTN_KQ_MAX_OFFSET;
float Moldf = Mf[r];
// Compute max across the row
@@ -1,25 +0,0 @@
#version 450
#include "types.glsl"
#include "generic_binary_head.glsl"
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint col = gl_GlobalInvocationID.x;
if (col >= p.ne20) {
return;
}
for (uint row = gl_GlobalInvocationID.y; row < p.ne21; row += gl_WorkGroupSize.y * gl_NumWorkGroups.y) {
float sum = 0.0f;
for (uint i = 0; i < p.ne10; ++i) {
if (data_b[get_boffset() + i*p.nb10] == int(row)) {
sum += data_a[get_aoffset() + i*p.nb01 + col*p.nb00];
}
}
data_d[get_doffset() + row*p.nb21 + col*p.nb20] = sum;
}
}
@@ -14,13 +14,16 @@ void main() {
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
const uint a_base = get_aoffset() + src0_idx(row * p.ne00);
const uint d_base = get_doffset() + dst_idx(row * p.ne10);
const uint i3 = row / (p.ne11 * p.ne12);
const uint i3_offset = i3 * p.ne12 * p.ne11;
const uint i2 = (row - i3_offset) / p.ne11;
const uint i2_offset = i2 * p.ne11;
const uint i1 = row - i3_offset - i2_offset;
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_base + i0*p.nb00]);
const FLOAT_TYPE xi = FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0]);
sum[tid] += xi * xi;
}
@@ -36,6 +39,6 @@ void main() {
const FLOAT_TYPE scale = 1.0f / max(sqrt(sum[0]), FLOAT_TYPE(p.param1));
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
data_d[d_base + i0*p.nb10] = D_TYPE(scale * FLOAT_TYPE(data_a[a_base + i0*p.nb00]));
data_d[i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0] = D_TYPE(scale * FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0]));
}
}
@@ -0,0 +1,22 @@
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#extension GL_EXT_control_flow_attributes : enable
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
void main() {
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
if (i >= p.KX) {
return;
}
const float val = float(data_a[i]);
data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
}
+23 -31
View File
@@ -38,7 +38,17 @@
#define LOAD_VEC_B 1
#endif
layout (constant_id = 11) const uint ALIGNED = 0;
// Load 2 values at once without affecting index calculations through LOAD_VEC
#if (defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)) && !defined(ALIGNED)
#define LOAD_VEC_BATCH_A 2
#else
#define LOAD_VEC_BATCH_A 1
#endif
#if !defined(ALIGNED)
#define LOAD_VEC_BATCH_B 2
#else
#define LOAD_VEC_BATCH_B 1
#endif
#if !defined(TO_FLOAT_TYPE)
#define TO_FLOAT_TYPE FLOAT_TYPE
@@ -47,13 +57,6 @@ layout (constant_id = 11) const uint ALIGNED = 0;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
#if defined(DATA_A_F32)
layout (binding = 0) readonly buffer A_SCALAR {float data_a_scalar[];};
#elif defined(DATA_A_F16)
layout (binding = 0) readonly buffer A_SCALAR {float16_t data_a_scalar[];};
#elif defined(DATA_A_BF16)
layout (binding = 0) readonly buffer A_SCALAR {uint16_t data_a_scalar[];};
#endif
#if defined(A_TYPE_PACKED16)
layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
#endif
@@ -62,7 +65,6 @@ layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32
#endif
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 1) readonly buffer B_SCALAR {B_TYPE_SCALAR data_b_scalar[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#ifdef MUL_MAT_ID
@@ -192,23 +194,13 @@ void main() {
const uint warp_r = warp_i % (BM / WM);
const uint warp_c = warp_i / (BM / WM);
#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)
const uint LOAD_VEC_A_EFF = (ALIGNED != 0) ? LOAD_VEC_A : 1;
const uint LOAD_VEC_BATCH_A = (ALIGNED != 0) ? 1 : 2;
#else
const uint LOAD_VEC_A_EFF = LOAD_VEC_A;
const uint LOAD_VEC_BATCH_A = 1;
#endif
const uint LOAD_VEC_B_EFF = (ALIGNED != 0) ? LOAD_VEC_B : 1;
const uint LOAD_VEC_BATCH_B = (ALIGNED != 0) ? 1 : 2;
const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A_EFF / LOAD_VEC_BATCH_A);
const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A_EFF / LOAD_VEC_BATCH_A);
const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B_EFF / LOAD_VEC_BATCH_B);
const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B_EFF / LOAD_VEC_BATCH_B);
const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A_EFF * LOAD_VEC_BATCH_A / BK;
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B_EFF * LOAD_VEC_BATCH_B / BK;
const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A * LOAD_VEC_BATCH_A / BK;
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B * LOAD_VEC_BATCH_B / BK;
#ifdef MUL_MAT_ID
#ifdef MUL_MAT_ID_USE_SUBGROUPS
@@ -247,15 +239,15 @@ void main() {
uint pos_a =
#ifdef MUL_MAT_ID
expert_idx * (p.batch_stride_a / LOAD_VEC_A_EFF) +
expert_idx * (p.batch_stride_a / LOAD_VEC_A) +
#else
batch_idx_a * (p.batch_stride_a / LOAD_VEC_A_EFF) +
batch_idx_a * (p.batch_stride_a / LOAD_VEC_A) +
#endif
(ir * BM * p.stride_a + start_k) / LOAD_VEC_A_EFF;
(ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
#ifdef MUL_MAT_ID
uint pos_b = 0;
#else
uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B_EFF;
uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B;
#endif
#ifdef COOPMAT
@@ -295,8 +287,8 @@ void main() {
barrier();
pos_a += BK / LOAD_VEC_A_EFF;
pos_b += BK / LOAD_VEC_B_EFF;
pos_a += BK / LOAD_VEC_A;
pos_b += BK / LOAD_VEC_B;
#ifdef COOPMAT
[[unroll]] for (uint i = 0; i < BK; i += TK) {
@@ -36,7 +36,6 @@ layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working wit
layout (constant_id = 4) const bool enable_smaller_matrices = false;
const uint BNover2 = enable_smaller_matrices ? (BN / 2) : BN;
const uint BNover4 = enable_smaller_matrices ? (BN / 4) : BN;
layout (constant_id = 5) const uint ALIGNED = 0;
layout (push_constant) uniform parameter
{
@@ -112,7 +111,7 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
};
uint _ne1;
layout (constant_id = 6) const uint subgroup_size = 32;
layout (constant_id = 5) const uint subgroup_size = 32;
shared uvec4 ballots_sh[BLOCK_SIZE / subgroup_size];
B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
@@ -298,12 +297,12 @@ void main() {
// Hint to the compiler that values are aligned (want 16B alignment).
// Quants are always block-aligned, no alignment needed.
if (ALIGNED != 0) {
#if ALIGNED
#if QUANT_K == 1
stride_a &= ~7;
stride_a &= ~7;
#endif
stride_b &= ~7;
#endif
stride_b &= ~7;
}
// Create layouts for both clamped and unclamped accesses
tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2);
@@ -1,57 +1,50 @@
void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint block, const uint end_k) {
#if defined(DATA_A_F32) || defined(DATA_A_F16)
#if LOAD_VEC_A == 8
if (ALIGNED != 0) {
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]);
buf_a[buf_idx ] = aa[0].xy;
buf_a[buf_idx + 1] = aa[0].zw;
buf_a[buf_idx + 2] = aa[1].xy;
buf_a[buf_idx + 3] = aa[1].zw;
return;
}
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]);
buf_a[buf_idx ] = aa[0].xy;
buf_a[buf_idx + 1] = aa[0].zw;
buf_a[buf_idx + 2] = aa[1].xy;
buf_a[buf_idx + 3] = aa[1].zw;
#elif LOAD_VEC_A == 4
if (ALIGNED != 0) {
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]);
buf_a[buf_idx ] = aa.xy;
buf_a[buf_idx + 1] = aa.zw;
return;
}
#endif
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]);
buf_a[buf_idx ] = aa.xy;
buf_a[buf_idx + 1] = aa.zw;
#else // LOAD_VEC_BATCH_A == 2
const uint idx = pos_a + col * p.stride_a + row * 2;
const uint buf_idx = col * SHMEM_STRIDE + row;
if (idx_m < p.M && block + row * 2 + 1 < end_k) {
buf_a[buf_idx] = FLOAT_TYPEV2(data_a_scalar[idx],
data_a_scalar[idx + 1]);
buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx],
data_a[idx + 1]);
} else if (idx_m < p.M && block + row * 2 < end_k) {
buf_a[buf_idx] = FLOAT_TYPEV2(data_a_scalar[idx], 0.0f);
buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx], 0.0f);
} else {
buf_a[buf_idx] = FLOAT_TYPEV2(0.0f);
}
#endif
#elif defined(DATA_A_BF16)
#if LOAD_VEC_A == 4
if (ALIGNED != 0) {
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx]));
buf_a[buf_idx ] = aa.xy;
buf_a[buf_idx + 1] = aa.zw;
return;
}
#endif
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx]));
buf_a[buf_idx ] = aa.xy;
buf_a[buf_idx + 1] = aa.zw;
#else // LOAD_VEC_BATCH_A == 2
const uint idx = pos_a + col * p.stride_a + row * 2;
const uint buf_idx = col * SHMEM_STRIDE + row;
if (idx_m < p.M && block + row * 2 + 1 < end_k) {
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a_scalar[idx]),
TO_FLOAT_TYPE(data_a_scalar[idx + 1]));
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]),
TO_FLOAT_TYPE(data_a[idx + 1]));
} else if (idx_m < p.M && block + row * 2 < end_k) {
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a_scalar[idx]), 0.0f);
buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]), 0.0f);
} else {
buf_a[buf_idx] = FLOAT_TYPEV2(0.0f);
}
#endif
#elif defined(DATA_A_Q4_0)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
@@ -533,85 +526,75 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
#if !defined(MUL_MAT_ID)
void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint idx_n, const uint block, const uint end_k) {
#if LOAD_VEC_B == 8
if (ALIGNED != 0) {
// Not supported for b_type bf16 because bf16mat2x4 does not exist
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
buf_b[buf_idx + 0] = bb[0].xy;
buf_b[buf_idx + 1] = bb[0].zw;
buf_b[buf_idx + 2] = bb[1].xy;
buf_b[buf_idx + 3] = bb[1].zw;
return;
}
// Not supported for b_type bf16 because bf16mat2x4 does not exist
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
buf_b[buf_idx + 0] = bb[0].xy;
buf_b[buf_idx + 1] = bb[0].zw;
buf_b[buf_idx + 2] = bb[1].xy;
buf_b[buf_idx + 3] = bb[1].zw;
#elif LOAD_VEC_B == 4
if (ALIGNED != 0) {
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
#if defined(DATA_B_BF16)
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
#else
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
#endif
buf_b[buf_idx + 0] = bb.xy;
buf_b[buf_idx + 1] = bb.zw;
return;
}
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
#endif
buf_b[buf_idx + 0] = bb.xy;
buf_b[buf_idx + 1] = bb.zw;
#else // LOAD_VEC_BATCH_B == 2
const uint idx = pos_b + col * p.stride_b + row * 2;
const uint buf_idx = col * SHMEM_STRIDE + row;
if (idx_n < p.N && block + row * 2 + 1 < end_k) {
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]),
TO_FLOAT_TYPE(data_b_scalar[idx + 1]));
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]),
TO_FLOAT_TYPE(data_b[idx + 1]));
} else if (idx_n < p.N && block + row * 2 < end_k) {
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), 0.0f);
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
} else {
buf_b[buf_idx] = FLOAT_TYPEV2(0.0f);
}
#endif
}
#else
void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint ic, const uint _ne1, const uint block, const uint end_k) {
#if LOAD_VEC_B == 8
if (ALIGNED != 0) {
// Not supported for b_type bf16 because bf16mat2x4 does not exist
const u16vec2 row_idx = row_ids[col];
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
buf_b[buf_idx + 0] = bb[0].xy;
buf_b[buf_idx + 1] = bb[0].zw;
buf_b[buf_idx + 2] = bb[1].xy;
buf_b[buf_idx + 3] = bb[1].zw;
return;
}
// Not supported for b_type bf16 because bf16mat2x4 does not exist
const u16vec2 row_idx = row_ids[col];
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]);
buf_b[buf_idx + 0] = bb[0].xy;
buf_b[buf_idx + 1] = bb[0].zw;
buf_b[buf_idx + 2] = bb[1].xy;
buf_b[buf_idx + 3] = bb[1].zw;
#elif LOAD_VEC_B == 4
if (ALIGNED != 0) {
const u16vec2 row_idx = row_ids[col];
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
const u16vec2 row_idx = row_ids[col];
const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2;
#if defined(DATA_B_BF16)
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx]));
#else
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
#endif
buf_b[buf_idx + 0] = bb.xy;
buf_b[buf_idx + 1] = bb.zw;
return;
}
FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]);
#endif
buf_b[buf_idx + 0] = bb.xy;
buf_b[buf_idx + 1] = bb.zw;
#else // LOAD_VEC_BATCH_B == 2
const uint row_i = ic * BN + col;
const uint buf_idx = col * SHMEM_STRIDE + row;
if (row_i < _ne1 && block + row * 2 + 1 < end_k) {
const u16vec2 row_idx = row_ids[col];
const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]),
TO_FLOAT_TYPE(data_b_scalar[idx + 1]));
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]),
TO_FLOAT_TYPE(data_b[idx + 1]));
} else if (row_i < _ne1 && block + row * 2 < end_k) {
const u16vec2 row_idx = row_ids[col];
const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), 0.0f);
buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
} else {
buf_b[buf_idx] = FLOAT_TYPEV2(0.0f);
}
#endif
}
#endif
+10 -10
View File
@@ -1,26 +1,26 @@
#version 450
#include "generic_head.glsl"
#include "types.glsl"
#include "generic_unary_head.glsl"
#extension GL_EXT_control_flow_attributes : enable
#define BLOCK_SIZE 512
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
shared vec2 sum[BLOCK_SIZE];
void main() {
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint tid = gl_LocalInvocationID.x;
const uint a_base = get_aoffset() + src0_idx(row * p.ne00);
const uint d_base = get_doffset() + dst_idx(row * p.ne10);
sum[tid] = vec2(0.0f, 0.0f);
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
const float xi = float(data_a[a_base + i0*p.nb00]);
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
const float xi = float(data_a[row*p.KX + col]);
sum[tid].x += xi;
sum[tid].y += xi * xi;
}
@@ -34,11 +34,11 @@ void main() {
barrier();
}
const float mean = sum[0].x / p.ne00;
const float var = sum[0].y / p.ne00 - mean * mean;
const float mean = sum[0].x / p.KX;
const float var = sum[0].y / p.KX - mean * mean;
const float inv_std = inversesqrt(var + p.param1);
[[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) {
data_d[d_base + i0*p.nb10] = D_TYPE((float(data_a[a_base + i0*p.nb00]) - mean) * inv_std);
[[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std);
}
}
@@ -0,0 +1,17 @@
#version 450
#include "types.glsl"
#include "generic_unary_head.glsl"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val));
}
@@ -0,0 +1,17 @@
#version 450
#include "types.glsl"
#include "generic_unary_head.glsl"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sqrt(val));
}
@@ -0,0 +1,17 @@
#version 450
#include "types.glsl"
#include "generic_unary_head.glsl"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val);
}
@@ -17,30 +17,6 @@ float op_neg(float x) {
return -x;
}
float op_sqr(float x) {
return x * x;
}
float op_sqrt(float x) {
return sqrt(x);
}
float op_sin(float x) {
return sin(x);
}
float op_cos(float x) {
return cos(x);
}
float op_clamp(float x) {
return clamp(x, p.param1, p.param2);
}
float op_leaky_relu(float x) {
return max(x, 0.0f) + min(x, 0.0f) * p.param1;
}
float op_step(float x) {
return x >= 0.0f ? 1.0f : 0.0f;
}
@@ -11,7 +11,6 @@
#include <future>
#include <queue>
#include <condition_variable>
#include <atomic>
#include <cstdio>
#include <cstring>
#include <cstdlib>
@@ -35,9 +34,6 @@
std::mutex lock;
std::vector<std::pair<std::string, std::string>> shader_fnames;
// Set when any shader subprocess fails (non-zero exit / stderr / launch failure) so the
// build is stopped instead of silently producing a broken libggml-vulkan. (issue #24393)
static std::atomic<bool> compile_failed{false};
std::locale c_locale("C");
std::string GLSLC = "glslc";
@@ -82,7 +78,7 @@ enum MatMulIdType {
namespace {
int execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
void execute_command(std::vector<std::string>& command, std::string& stdout_str, std::string& stderr_str) {
#ifdef _WIN32
HANDLE stdout_read, stdout_write;
HANDLE stderr_read, stderr_write;
@@ -131,11 +127,8 @@ int execute_command(std::vector<std::string>& command, std::string& stdout_str,
CloseHandle(stdout_read);
CloseHandle(stderr_read);
WaitForSingleObject(pi.hProcess, INFINITE);
DWORD exit_code = 1;
GetExitCodeProcess(pi.hProcess, &exit_code);
CloseHandle(pi.hProcess);
CloseHandle(pi.hThread);
return (int)exit_code;
#else
int stdout_pipe[2];
int stderr_pipe[2];
@@ -182,9 +175,7 @@ int execute_command(std::vector<std::string>& command, std::string& stdout_str,
close(stdout_pipe[0]);
close(stderr_pipe[0]);
int status = 0;
waitpid(pid, &status, 0);
return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
waitpid(pid, nullptr, 0);
}
#endif
}
@@ -381,14 +372,13 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
// }
// std::cout << std::endl;
int exit_code = execute_command(cmd, stdout_str, stderr_str);
if (exit_code != 0 || !stderr_str.empty()) {
std::cerr << "cannot compile " << name << " (exit code " << exit_code << ")\n\n";
execute_command(cmd, stdout_str, stderr_str);
if (!stderr_str.empty()) {
std::cerr << "cannot compile " << name << "\n\n";
for (const auto& part : cmd) {
std::cerr << part << " ";
}
std::cerr << "\n\n" << stderr_str << std::endl;
compile_failed = true;
return;
}
@@ -408,7 +398,6 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
shader_fnames.push_back(std::make_pair(name, out_path));
} catch (const std::exception& e) {
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
compile_failed = true;
}
}
@@ -550,9 +539,11 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
};
// Shaders with f16 B_TYPE
string_to_spv(shader_name + "_f32_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE_SCALAR", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_f32_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_f32_f16" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE_SCALAR", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_f16" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
// bf16
{
@@ -574,7 +565,8 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
#endif
{
if (!dot2) {
string_to_spv(shader_name + "_bf16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPE_SCALAR", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"B_TYPEV4", "bf16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_bf16", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"B_TYPEV4", "bf16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPEV4", "bf16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
}
}
}
@@ -591,6 +583,8 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
}
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
// For unaligned, load one at a time for f32/f16, or two at a time for quants
std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? "1" : load_vec_quant;
// For aligned matmul loads
std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? load_vec : load_vec_quant;
@@ -603,11 +597,13 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
// don't generate f32 variants for coopmat2
if (!coopmat2) {
string_to_spv(shader_name + "_" + tname + "_f32" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPE_SCALAR", "float"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f32" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f32" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPEV4", "vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
}
if (tname != "f16" && tname != "f32") {
string_to_spv(shader_name + "_" + tname + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE_SCALAR", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f16" + dot2_sfx, source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f16" + dot2_sfx + "_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPEV4", "f16vec4"}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
}
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
@@ -854,12 +850,21 @@ void process_shaders() {
string_to_spv("repeat_i32", "repeat.comp", {{"A_TYPE", "int32_t"}, {"D_TYPE", "int32_t"}});
string_to_spv("repeat_back_f32", "repeat_back.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("get_rows_back_f32", "get_rows_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
string_to_spv("repeat_i16", "repeat.comp", {{"A_TYPE", "int16_t"}, {"D_TYPE", "int16_t"}});
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("sqrt_f32", "sqrt.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("concat_i8", "concat.comp", {{"A_TYPE", "uint8_t"}, {"B_TYPE", "uint8_t"}, {"D_TYPE", "uint8_t"}});
@@ -886,18 +891,6 @@ void process_shaders() {
string_to_spv("silu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_silu"}});
string_to_spv("relu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_relu"}});
string_to_spv("relu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_relu"}});
string_to_spv("sqr_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sqr"}});
string_to_spv("sqr_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sqr"}});
string_to_spv("sqrt_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sqrt"}});
string_to_spv("sqrt_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sqrt"}});
string_to_spv("sin_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_sin"}});
string_to_spv("sin_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_sin"}});
string_to_spv("cos_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_cos"}});
string_to_spv("cos_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_cos"}});
string_to_spv("clamp_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_clamp"}});
string_to_spv("clamp_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_clamp"}});
string_to_spv("leaky_relu_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_leaky_relu"}});
string_to_spv("leaky_relu_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_leaky_relu"}});
string_to_spv("neg_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_neg"}});
string_to_spv("neg_f32", "unary.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"OP", "op_neg"}});
string_to_spv("tanh_f16", "unary.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OP", "op_tanh"}});
@@ -955,6 +948,7 @@ void process_shaders() {
string_to_spv("geglu_quick_f16","geglu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("geglu_quick_f32","geglu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
@@ -1066,31 +1060,6 @@ void process_shaders() {
}
}
for (auto unroll : {false, true}) {
for (auto a_f16 : {false, true}) {
std::map<std::string, std::string> defines = {
{"A_TYPE", a_f16 ? "float16_t" : "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"},
{"UNROLL", unroll ? "[[unroll]]" : ""},
};
std::string name = std::string("conv3d") + (a_f16 ? "_f16" : "") + "_f32";
string_to_spv(name + (unroll ? "_unroll" : ""), "conv3d_mm.comp", defines);
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (unroll) {
auto cm2_defines = defines;
cm2_defines["COOPMAT2"] = "1";
string_to_spv(name, "conv3d_mm.comp", cm2_defines, true, false, true);
}
#endif
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
if (unroll) {
auto cm1_defines = defines;
cm1_defines["COOPMAT"] = "1";
string_to_spv(name, "conv3d_mm.comp", cm1_defines, true, true, false);
}
#endif
}
}
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
string_to_spv("conv2d_dw_whcn_f16_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
@@ -1282,11 +1251,6 @@ int main(int argc, char** argv) {
process_shaders();
if (compile_failed) {
std::cerr << "vulkan-shaders-gen: one or more shaders failed to compile" << std::endl;
return EXIT_FAILURE;
}
write_output_files();
return EXIT_SUCCESS;
@@ -905,12 +905,11 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key {
ggml_type src0_type;
ggml_type src1_type;
int vectorized;
uint32_t num_cols;
bool use_mmvq;
bool operator==(const ggml_webgpu_mul_mat_vec_pipeline_key & other) const {
return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized &&
num_cols == other.num_cols && use_mmvq == other.use_mmvq;
use_mmvq == other.use_mmvq;
}
};
@@ -920,7 +919,6 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash {
ggml_webgpu_hash_combine(seed, key.src0_type);
ggml_webgpu_hash_combine(seed, key.src1_type);
ggml_webgpu_hash_combine(seed, key.vectorized);
ggml_webgpu_hash_combine(seed, key.num_cols);
ggml_webgpu_hash_combine(seed, key.use_mmvq);
return seed;
}
@@ -995,12 +993,11 @@ struct ggml_webgpu_mul_mat_id_pipeline_key {
ggml_type src0_type;
ggml_type src1_type;
uint32_t n_experts;
uint32_t num_cols;
int vectorized;
bool operator==(const ggml_webgpu_mul_mat_id_pipeline_key & other) const {
return src0_type == other.src0_type && src1_type == other.src1_type && n_experts == other.n_experts &&
num_cols == other.num_cols && vectorized == other.vectorized;
vectorized == other.vectorized;
}
};
@@ -1010,7 +1007,6 @@ struct ggml_webgpu_mul_mat_id_pipeline_key_hash {
ggml_webgpu_hash_combine(seed, key.src0_type);
ggml_webgpu_hash_combine(seed, key.src1_type);
ggml_webgpu_hash_combine(seed, key.n_experts);
ggml_webgpu_hash_combine(seed, key.num_cols);
ggml_webgpu_hash_combine(seed, key.vectorized);
return seed;
}
@@ -1111,7 +1107,7 @@ inline bool ggml_webgpu_can_use_mmvq(const ggml_tensor * src0,
const ggml_tensor * src1,
bool supports_dot_product,
const std::string & vendor) {
if (src1->ne[1] <= 4) {
if (src1->ne[1] == 1) {
bool supports_dp4a = vendor == "amd" || vendor == "intel" || vendor == "nvidia";
if (supports_dp4a && supports_dot_product) {
switch (src1->type) {
@@ -1893,7 +1889,6 @@ class ggml_webgpu_shader_lib {
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
1 :
0;
key.num_cols = context.dst->ne[1];
key.use_mmvq =
ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor);
@@ -2009,7 +2004,6 @@ class ggml_webgpu_shader_lib {
if (key.vectorized) {
variant += "_vectorized";
}
defines.push_back(std::string("NUM_COLS=") + std::to_string(key.num_cols));
auto processed = preprocessor.preprocess(shader_src, defines);
auto decisions = std::make_shared<ggml_webgpu_mul_mat_vec_shader_decisions>();
@@ -2427,7 +2421,6 @@ class ggml_webgpu_shader_lib {
if (key.vectorized) {
variant += "_vectorized";
}
defines.push_back(std::string("NUM_COLS=1"));
defines.push_back(std::string("N_EXPERTS=") + std::to_string(key.n_experts));
+10 -12
View File
@@ -1418,17 +1418,15 @@ static void ggml_webgpu_quantize_q8_dispatch(webgpu_context &
const size_t dst_offset = ggml_webgpu_tensor_offset(dst);
const size_t q8_src1_align_offset = ROUNDUP_POW2(
dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment);
const size_t q8_src1_binding_size = ROUNDUP_POW2(
src1->ne[3] * src1->ne[2] * src1->ne[1] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)),
WEBGPU_STORAGE_BUF_BINDING_MULT);
const size_t q8_src1_binding_size =
ROUNDUP_POW2(src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)),
WEBGPU_STORAGE_BUF_BINDING_MULT);
std::vector<uint32_t> q8_params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
(uint32_t) src1->ne[0],
(uint32_t) src1->ne[1],
(uint32_t) src1->ne[2],
(uint32_t) src1->ne[3],
};
@@ -1444,7 +1442,7 @@ static void ggml_webgpu_quantize_q8_dispatch(webgpu_context &
uint32_t q8_wg_x = 1;
uint32_t q8_wg_y = 1;
const uint32_t wg_per_vec = (src0->ne[0] / 4 + (q8_wg_size - 1)) / q8_wg_size;
const uint32_t q8_total_wg = src1->ne[1] * src1->ne[2] * src1->ne[3] * wg_per_vec;
const uint32_t q8_total_wg = src1->ne[2] * src1->ne[3] * wg_per_vec;
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
compute_2d_workgroups(q8_total_wg, max_wg_per_dim, q8_wg_x, q8_wg_y);
@@ -1458,7 +1456,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
ggml_tensor * src1,
ggml_tensor * dst) {
// Determine if this is a mat-vec operation
bool use_mat_vec = (dst->ne[1] <= 4);
bool is_vec = (dst->ne[1] == 1);
// use MMVQ path for mat-vec
bool use_mmvq = ggml_webgpu_can_use_mmvq(src0, src1, ctx->global_ctx->capabilities.supports_dot_product,
@@ -1484,7 +1482,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
webgpu_pipeline pipeline;
std::vector<webgpu_dispatch_desc> dispatches;
if (use_mat_vec) {
if (is_vec) {
if (use_mmvq) {
ggml_webgpu_quantize_q8_dispatch(ctx, src0, src1, dst, dispatches);
}
@@ -1531,7 +1529,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
uint32_t wg_y = 1;
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
if (use_mat_vec) {
if (is_vec) {
auto * decisions = static_cast<ggml_webgpu_mul_mat_vec_shader_decisions *>(pipeline.context.get());
uint32_t batches = dst->ne[2] * dst->ne[3];
@@ -3693,8 +3691,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer
ggml_webgpu_can_use_mmvq(src0, src1, ctx->webgpu_global_ctx->capabilities.supports_dot_product,
ctx->webgpu_global_ctx->vendor);
if (use_mmvq) {
const size_t q8_src1_size = src1->ne[3] * src1->ne[2] * src1->ne[1] *
(36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32));
const size_t q8_src1_size =
src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32));
res = ROUNDUP_POW2(res + q8_src1_size +
ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment,
WEBGPU_STORAGE_BUF_BINDING_MULT);
@@ -4270,7 +4268,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_RMS_NORM:
case GGML_OP_NORM:
case GGML_OP_L2_NORM:
supports_op = (op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32) && ggml_is_contiguous_rows(src0);
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
break;
case GGML_OP_ROPE:
supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
@@ -103,7 +103,7 @@ fn main(
#ifdef USE_SUBGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let subgroup_total = subgroupAdd(acc[0][row]);
let subgroup_total = subgroupAdd(acc[row]);
if (subgroup_invocation_id == 0u) {
partial_sums[partial_index(row, subgroup_id)] = subgroup_total;
}
@@ -126,7 +126,7 @@ fn main(
#ifdef USE_WORKGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] = acc[0][row];
partial_sums[partial_index(row, thread_id)] = acc[row];
}
workgroupBarrier();
@@ -91,67 +91,61 @@ fn main(
let dst_idx_base = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row_base;
#ifdef MMVQ
let src1q_idx_base = (src13_idx * params.bs02 * params.broadcast2 + src12_idx) * params.n * (params.k / 32u);
let src1q_idx_base = (src13_idx * params.bs02 * params.broadcast2 + src12_idx) * (params.k / 32u);
let acc = accumulate_vec_q_dot(thread_id, row_base, src0_batch_offset, src1q_idx_base);
#else
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
let acc = accumulate_vec_dot(thread_id, row_base, src0_batch_offset, src1_idx_base);
#endif
for (var col = 0u;col < NUM_COLS;col += 1) {
#ifdef USE_SUBGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let subgroup_total = subgroupAdd(acc[col][row]);
if (subgroup_invocation_id == 0u) {
partial_sums[partial_index(row, subgroup_id)] = subgroup_total;
}
}
workgroupBarrier();
for (var row = subgroup_id; (row < OUTPUTS_PER_WG) && (row_base + row < params.m); row += num_subgroups) {
let output_row = row_base + row;
var row_acc = 0.0f;
for (var k = subgroup_invocation_id; k < num_subgroups; k += subgroup_size) {
row_acc += partial_sums[partial_index(row, k)];
}
let row_total = subgroupAdd(row_acc);
if (subgroup_invocation_id == 0) {
dst[dst_idx_base + col * params.m + row] = row_total;
}
}
#endif
#ifdef USE_WORKGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] = acc[col][row];
}
workgroupBarrier();
var stride = WG_SIZE / 2u;
while (stride > 0) {
if (thread_id < stride) {
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] += partial_sums[partial_index(row, thread_id + stride)];
}
}
workgroupBarrier();
stride = stride / 2;
}
if (thread_id < OUTPUTS_PER_WG) {
let output_row = row_base + thread_id;
if (output_row < params.m) {
dst[dst_idx_base + col * params.m + thread_id] = partial_sums[partial_index(thread_id, 0)];
}
}
#endif
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let subgroup_total = subgroupAdd(acc[row]);
if (subgroup_invocation_id == 0u) {
partial_sums[partial_index(row, subgroup_id)] = subgroup_total;
}
}
workgroupBarrier();
for (var row = subgroup_id; (row < OUTPUTS_PER_WG) && (row_base + row < params.m); row += num_subgroups) {
let output_row = row_base + row;
var row_acc = 0.0f;
for (var k = subgroup_invocation_id; k < num_subgroups; k += subgroup_size) {
row_acc += partial_sums[partial_index(row, k)];
}
let row_total = subgroupAdd(row_acc);
if (subgroup_invocation_id == 0) {
dst[dst_idx_base + row] = row_total;
}
}
#endif
#ifdef USE_WORKGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] = acc[row];
}
workgroupBarrier();
var stride = WG_SIZE / 2u;
while (stride > 0) {
if (thread_id < stride) {
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] += partial_sums[partial_index(row, thread_id + stride)];
}
}
workgroupBarrier();
stride = stride / 2;
}
if (thread_id < OUTPUTS_PER_WG) {
let output_row = row_base + thread_id;
if (output_row < params.m) {
dst[dst_idx_base + thread_id] = partial_sums[partial_index(thread_id, 0)];
}
}
#endif
}
File diff suppressed because it is too large Load Diff
@@ -51,7 +51,10 @@ fn repack_b_dm(block: u32) -> B_DS_TYPE {
fn get_dm(block_byte_base: u32) -> f32 {
return f32(load_f16_at_src0(block_byte_base));
}
#endif // MUL_ACC_Q4_0
fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 {
return f32(row_sum) * (da * b_ds.x) - 8.0 * da * b_ds.y / THREADS_PER_BLOCK;
}
#endif
#ifdef MUL_ACC_Q4_1
#define BLOCK_SIZE_BYTES 20
@@ -82,7 +85,10 @@ fn get_dm(block_byte_base: u32) -> vec2<f32> {
f32(load_f16_at_src0(block_byte_base + 2u))
);
}
#endif // MUL_ACC_Q4_1
fn mul_q8_1(row_sum: i32, dma: vec2<f32>, b_ds: B_DS_TYPE) -> f32 {
return f32(row_sum) * (dma.x * b_ds.x) + dma.y * b_ds.y / THREADS_PER_BLOCK;
}
#endif
#ifdef MUL_ACC_Q8_0
#define BLOCK_SIZE_BYTES 34
@@ -105,48 +111,46 @@ fn repack_b_dm(block: u32) -> B_DS_TYPE {
fn get_dm(block_byte_base: u32) -> f32 {
return f32(load_f16_at_src0(block_byte_base));
}
#endif // MUL_ACC_Q8_0
fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 {
return f32(row_sum) * (da * b_ds);
}
#endif
#if defined(LEGACY_QUANTS)
fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<array<f32, OUTPUTS_PER_WG>, NUM_COLS> {
var acc: array<array<f32, OUTPUTS_PER_WG>, NUM_COLS>;
#ifdef LEGACY_QUANTS
fn mmvq_dot_product(a_byte_base: u32, b_inner_id: u32, b_repacked: vec2<u32>, b_ds: B_DS_TYPE) -> f32 {
var row_sum = 0;
let a_repacked = repack_a(a_byte_base, b_inner_id);
row_sum += dot4I8Packed(a_repacked[0], b_repacked[0]);
row_sum += dot4I8Packed(a_repacked[1], b_repacked[1]);
return mul_q8_1(row_sum, get_dm(a_byte_base), b_ds);
}
fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<f32, OUTPUTS_PER_WG> {
var acc: array<f32, OUTPUTS_PER_WG>;
let num_blocks = params.k / BLOCK_SIZE;
for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) {
let inner_id = thread_id % THREADS_PER_BLOCK;
let b_inner_id = thread_id % THREADS_PER_BLOCK;
let b_block_idx = src1q_idx_base + block;
let b_repacked = repack_b_qs(b_block_idx, b_inner_id);
let b_ds = repack_b_dm(b_block_idx);
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let output_row = row_base + row;
if (output_row < params.m) {
let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
let a_repacked = repack_a(block_byte_base, inner_id);
let da = get_dm(block_byte_base);
for (var col = 0u;col < NUM_COLS;col += 1) {
let src1q_idx = src1q_idx_base + col * (params.k / Q8_BLOCK_SIZE) + block;
let b_repacked = repack_b_qs(src1q_idx, inner_id);
let b_ds = repack_b_dm(src1q_idx);
let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1]);
#if defined(MUL_ACC_Q4_0)
acc[col][row] += f32(row_sum) * (da * b_ds.x) - 8.0 * da * b_ds.y / THREADS_PER_BLOCK;
#endif // MUL_ACC_Q4_0
#if defined(MUL_ACC_Q4_1)
acc[col][row] += f32(row_sum) * (da.x * b_ds.x) + da.y * b_ds.y / THREADS_PER_BLOCK;
#endif // MUL_ACC_Q4_1
#if defined(MUL_ACC_Q8_0)
acc[col][row] += f32(row_sum) * (da * b_ds);
#endif // MUL_ACC_Q8_0
}
acc[row] += mmvq_dot_product(block_byte_base, b_inner_id, b_repacked, b_ds);
}
}
}
return acc;
}
#endif // LEGACY_QUANTS
#endif
#ifdef MUL_ACC_Q2_K
#define BLOCK_SIZE_BYTES 84
@@ -187,7 +191,22 @@ fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2<f32> {
let scale = byte_of(load_u32_at_src0_aligned(scale_byte), scale_byte & 3u);
return vec2<f32>(f32(scale & 0xFu), f32(scale >> 4u));
}
#endif // MUL_ACC_Q2_K
fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4<u32>, b_ds: B_DS_TYPE) -> f32 {
let a_repacked = repack_a(a_byte_base, tid);
let dm = get_dm(a_byte_base);
let scale_min = get_scale_min(a_byte_base, tid);
let scale_q = i32(scale_min.x);
let scale_m_i8x4 = u32(scale_min.y) * 0x01010101u;
let row_sum_d = (dot4I8Packed(b_repacked[0], a_repacked[0]) + dot4I8Packed(b_repacked[1], a_repacked[1])
+ dot4I8Packed(b_repacked[2], a_repacked[2]) + dot4I8Packed(b_repacked[3], a_repacked[3])) * scale_q;
let row_sum_m = dot4I8Packed(b_repacked[0], scale_m_i8x4) + dot4I8Packed(b_repacked[1], scale_m_i8x4)
+ dot4I8Packed(b_repacked[2], scale_m_i8x4) + dot4I8Packed(b_repacked[3], scale_m_i8x4);
return b_ds * (dm.x * f32(row_sum_d) - dm.y * f32(row_sum_m));
}
#endif
#ifdef MUL_ACC_Q4_K
#define BLOCK_SIZE_BYTES 144
@@ -246,52 +265,39 @@ fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2<f32> {
return vec2<f32>(scale, min_val);
}
#endif // MUL_ACC_Q4_K
fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4<u32>, b_ds: B_DS_TYPE) -> f32 {
let a_repacked = repack_a(a_byte_base, tid);
let dm = get_dm(a_byte_base);
let scale_min = get_scale_min(a_byte_base, tid);
let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1])
+ dot4I8Packed(a_repacked[2], b_repacked[2]) + dot4I8Packed(a_repacked[3], b_repacked[3]);
// Each thread covers half of the Q8_1 block, so add only b_ds.y/2.
return b_ds.x * dm.x * scale_min.x * f32(row_sum) - dm.y * scale_min.y * (b_ds.y / (Q8_BLOCK_SIZE / ELEMS_PER_THREAD));
}
#endif
#ifdef K_QUANTS
fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<array<f32, OUTPUTS_PER_WG>, NUM_COLS> {
var acc: array<array<f32, OUTPUTS_PER_WG>, NUM_COLS>;
fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array<f32, OUTPUTS_PER_WG> {
var acc: array<f32, OUTPUTS_PER_WG>;
let tid = thread_id % THREADS_PER_BLOCK;
for (var block = thread_id / THREADS_PER_BLOCK; block < params.k / BLOCK_SIZE; block += WG_SIZE / THREADS_PER_BLOCK) {
let src1q_idx = src1q_idx_base + (block * BLOCK_SIZE + ELEMS_PER_THREAD * tid) / Q8_BLOCK_SIZE;
let b_repacked = repack_b_qs(src1q_idx, tid);
let b_ds = repack_b_dm(src1q_idx);
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let output_row = row_base + row;
if (output_row < params.m) {
let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
let a_repacked = repack_a(block_byte_base, tid);
let dm = get_dm(block_byte_base);
let scale_min = get_scale_min(block_byte_base, tid);
for (var col = 0u;col < NUM_COLS;col += 1) {
let src1q_idx = src1q_idx_base + col * (params.k / Q8_BLOCK_SIZE) + (block * BLOCK_SIZE + ELEMS_PER_THREAD * tid) / Q8_BLOCK_SIZE;
let b_repacked = repack_b_qs(src1q_idx, tid);
let b_ds = repack_b_dm(src1q_idx);
#if defined(MUL_ACC_Q2_K)
let scale_q = i32(scale_min.x);
let scale_m_i8x4 = u32(scale_min.y) * 0x01010101u;
let row_sum_d = (dot4I8Packed(b_repacked[0], a_repacked[0]) + dot4I8Packed(b_repacked[1], a_repacked[1])
+ dot4I8Packed(b_repacked[2], a_repacked[2]) + dot4I8Packed(b_repacked[3], a_repacked[3])) * scale_q;
let row_sum_m = dot4I8Packed(b_repacked[0], scale_m_i8x4) + dot4I8Packed(b_repacked[1], scale_m_i8x4)
+ dot4I8Packed(b_repacked[2], scale_m_i8x4) + dot4I8Packed(b_repacked[3], scale_m_i8x4);
acc[col][row] += b_ds * (dm.x * f32(row_sum_d) - dm.y * f32(row_sum_m));
#endif // MUL_ACC_Q2_K
#if defined(MUL_ACC_Q4_K)
let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1])
+ dot4I8Packed(a_repacked[2], b_repacked[2]) + dot4I8Packed(a_repacked[3], b_repacked[3]);
// Each thread covers half of the Q8_1 block, so add only b_ds.y/2.
acc[col][row] += b_ds.x * dm.x * scale_min.x * f32(row_sum) - dm.y * scale_min.y * (b_ds.y / (Q8_BLOCK_SIZE / ELEMS_PER_THREAD));
#endif // MUL_ACC_Q4_K
}
acc[row] += mmvq_dot_product(block_byte_base, tid, b_repacked, b_ds);
}
}
}
return acc;
}
#endif // K_QUANTS
#endif
@@ -9,11 +9,9 @@ requires packed_4x8_integer_dot_product;
struct Params {
offset_src1: u32,
stride_11: u32,
stride_12: u32,
stride_13: u32,
ne0: u32,
ne1: u32,
ne2: u32,
ne3: u32,
};
@@ -59,28 +57,25 @@ fn main(
@builtin(num_workgroups) num_wg: vec3<u32>
) {
let thread_id = local_id.x;
let ne0_vec4 = params.ne0 / 4u;
let num_vec4 = params.ne0 / 4u;
let wg_per_vec = (ne0_vec4 + (WG_SIZE - 1u)) / WG_SIZE;
let total_batches = wg_per_vec * params.ne1 * params.ne2 * params.ne3;
let wg_per_vec = (num_vec4 + (WG_SIZE - 1u)) / WG_SIZE;
let total_batches = wg_per_vec * params.ne2 * params.ne3;
let wg_linear = wg_id.y * num_wg.x + wg_id.x;
if (wg_linear >= total_batches) {
return;
}
let vec_idx = wg_linear / wg_per_vec;
let src13_idx = vec_idx / (params.ne2 * params.ne1);
let vec_ne12_num = vec_idx % (params.ne2 * params.ne1);
let src12_idx = vec_ne12_num / params.ne1;
let src11_idx = vec_ne12_num % params.ne1;
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + src11_idx * params.stride_11;
let src13_idx = wg_linear / (params.ne2 * wg_per_vec);
let src12_idx = (wg_linear - src13_idx * (params.ne2 * wg_per_vec)) / wg_per_vec;
let src11_wg_idx = wg_linear % wg_per_vec;
let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12;
let src1_idx_vec4_base = src1_idx_base / 4u;
let blocks_per_row = params.ne0 / 32u;
let blocks_per_wg = (WG_SIZE * 4u) / 32u;
let src1q_idx_base = ((src13_idx * params.ne2 + src12_idx) * params.ne1 + src11_idx) * blocks_per_row;
let src11_wg_idx = wg_linear % wg_per_vec;
let src1q_idx_base = (src13_idx * params.ne2 + src12_idx) * blocks_per_row;
let src1q_idx = src1q_idx_base + src11_wg_idx * blocks_per_wg + thread_id / 8u;
let qs_idx = thread_id % 8u;
@@ -90,7 +85,7 @@ fn main(
var thread_amax = 0.0;
let src11_vec4_idx = src11_wg_idx * WG_SIZE + thread_id;
let is_valid = src11_vec4_idx < ne0_vec4;
let is_valid = src11_vec4_idx < num_vec4;
#ifdef USE_SUBGROUP_REDUCTION
-1
View File
@@ -359,7 +359,6 @@ class Keys:
CHUNK_SIZE = "clip.audio.chunk_size"
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
MAX_POS_EMB = "clip.audio.max_pos_emb"
FEATURE_LAYERS = "clip.audio.feature_layer" # Granite Speech Plus
class Attention:
HEAD_COUNT = "clip.audio.attention.head_count"
-3
View File
@@ -1310,9 +1310,6 @@ class GGUFWriter:
def add_audio_max_pos_emb(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)
def add_audio_feature_layers(self, layers: Sequence[int]) -> None:
self.add_array(Keys.ClipAudio.FEATURE_LAYERS, layers)
def add_audio_projector_window_size(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)
+1 -7
View File
@@ -57,25 +57,19 @@ oppoll=
opflt=
[ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF"
opfuse=
[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
vmem=
[ "$VM" != "" ] && vmem="GGML_HEXAGON_VMEM=$VM"
mbuf=
[ "$MB" != "" ] && mbuf="GGML_HEXAGON_MBUF=$MB"
mmsel=
[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
set -x
adb $adbserial $adbhost shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $opfuse $vmem $mbuf $mmsel \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $vmem $mbuf \
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --ubatch-size 1024 -fa on \
+1 -7
View File
@@ -51,12 +51,6 @@ opqueue=
oppoll=
[ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP"
opfuse=
[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC"
mmsel=
[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM"
set -x
tool=$1; shift
@@ -65,5 +59,5 @@ adb $adbserial $adbhost shell " \
cd $basedir; ulimit -c unlimited; \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opfuse $mmsel ./$branch/bin/$tool $@ \
$verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll ./$branch/bin/$tool $@ \
"
+7 -38
View File
@@ -26,7 +26,7 @@ COL_MAP = {
}
op_pattern = re.compile(
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
)
trace_pattern = re.compile(
@@ -93,40 +93,9 @@ def parse_log(file_path, pmu_index=None):
+ int(ts_match.group('us'))
)
if "|" in line and "profile-op" in line:
parts = [p.strip() for p in line.split("|")]
prefix = parts[0]
prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
if not prefix_match:
continue
if len(parts) == 7:
dims, types, timings = parts[2], parts[3], parts[6]
elif len(parts) == 6:
dims, types, timings = parts[2], parts[3], parts[5]
else:
continue
timing_match = re.search(
r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
timings
)
if not timing_match:
continue
op_match = timing_match
op_name = prefix_match.group("op_name")
else:
op_match = op_pattern.search(line)
if op_match:
op_name = op_match.group('op_name')
dims = op_match.group('dims').strip()
types = op_match.group('types').strip()
else:
op_match = None
op_match = op_pattern.search(line)
if op_match:
pmu_raw = op_match.group('pmu') if 'pmu' in op_match.groupdict() else None
pmu_raw = op_match.group('pmu')
pmu_val = None
if pmu_raw and pmu_index is not None:
try:
@@ -136,7 +105,7 @@ def parse_log(file_path, pmu_index=None):
except (ValueError, IndexError):
pmu_val = None
evt_raw = op_match.group('evt') if 'evt' in op_match.groupdict() else None
evt_raw = op_match.group('evt')
evt_val = None
if evt_raw:
try:
@@ -153,9 +122,9 @@ def parse_log(file_path, pmu_index=None):
op_text = line[idx + 11:].strip() if idx != -1 else line.strip()
current_op = {
'name': op_name,
'dims': dims,
'types': types,
'name': op_match.group('op_name'),
'dims': op_match.group('dims').strip(),
'types': op_match.group('types').strip(),
'op_text': op_text,
'usec': int(op_match.group('usec')),
'cycles': int(op_match.group('cycles')),
+6 -42
View File
@@ -12,7 +12,7 @@ from collections import defaultdict
logger = logging.getLogger("ggml-hexagon-trace")
op_pattern = re.compile(
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+?)\s+:\s+(?:(?P<params>.*?)\s+:\s+)?(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
r"profile-op\s+(?P<op_name>[A-Z_0-9+]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+(?P<strides>[\d:x\s\->!]+)\s+:\s+(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?"
)
trace_pattern = re.compile(
@@ -66,40 +66,7 @@ def parse_log(file_path):
for line in f:
line_idx += 1
if "|" in line and "profile-op" in line:
parts = [p.strip() for p in line.split("|")]
prefix = parts[0]
prefix_match = re.search(r"profile-op\s+(?P<op_name>[A-Z_0-9+]+)", prefix)
if not prefix_match:
continue
if len(parts) == 7:
dims, types, strides, params, timings = parts[2], parts[3], parts[4], parts[5], parts[6]
elif len(parts) == 6:
dims, types, strides, params, timings = parts[2], parts[3], parts[4], "", parts[5]
else:
continue
timing_match = re.search(
r"(?:op-)?usec\s+(?P<usec>\d+)\s+(?:op-)?cycles\s+(?P<cycles>\d+)(?:\s+start\s+(?P<start>\d+))?(?:\s+mhz\s+(?P<mhz>[\d.]+))?(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?(?:\s+evt\s+\[(?P<evt>[\d,\s]+)\])?",
timings
)
if not timing_match:
continue
op_match = timing_match
op_name = prefix_match.group("op_name")
else:
op_match = op_pattern.search(line)
if op_match:
op_name = op_match.group('op_name')
dims = op_match.group('dims').strip() if op_match.group('dims') else ''
types = op_match.group('types').strip() if op_match.group('types') else ''
strides = op_match.group('strides').strip() if op_match.group('strides') else ''
params = op_match.group('params').strip() if ('params' in op_match.groupdict() and op_match.group('params')) else ''
else:
op_match = None
op_match = op_pattern.search(line)
if op_match:
cycles_start_raw = op_match.group('start')
unwrapped_cycles_start = None
@@ -110,11 +77,10 @@ def parse_log(file_path):
op_text = line[idx + 11:].strip() if idx != -1 else line.strip()
current_op = {
'name': op_name,
'dims': dims,
'types': types,
'strides': strides,
'params': params,
'name': op_match.group('op_name'),
'dims': op_match.group('dims').strip() if op_match.group('dims') else '',
'types': op_match.group('types').strip() if op_match.group('types') else '',
'strides': op_match.group('strides').strip() if op_match.group('strides') else '',
'op_text': op_text,
'usec': int(op_match.group('usec')),
'cycles': int(op_match.group('cycles')),
@@ -431,8 +397,6 @@ def generate_perfetto_trace(filtered_ops, output_path):
debug_annots.append(make_debug_annotation("line", int_val=op['line_num']))
if 'strides' in op and op['strides']:
debug_annots.append(make_debug_annotation("strides", string_val=op['strides']))
if 'params' in op and op['params'] and op['params'] != '----':
debug_annots.append(make_debug_annotation("params", string_val=op['params']))
# Slice Begin
evt_begin = make_track_event(1, 2, name=f"{op['name']} ({op['dims']})", category="operator", debug_annotations=debug_annots)
+4 -14
View File
@@ -190,15 +190,7 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);
// causal prepends the state, non-causal pads symmetrically for a centered window
if (hparams.causal_attn) {
bx = ggml_concat(ctx0, conv, bx, 0);
} else {
const int64_t pad = (hparams.n_shortconv_l_cache - 1) / 2;
auto * left = ggml_cont(ctx0,
ggml_view_3d(ctx0, conv, pad, hparams.n_embd, n_seqs, conv->nb[1], conv->nb[2], (d_conv - pad) * conv->nb[0]));
bx = ggml_pad_ext(ctx0, ggml_concat(ctx0, left, bx, 0), 0, pad, 0, 0, 0, 0, 0, 0);
}
bx = ggml_concat(ctx0, conv, bx, 0);
GGML_ASSERT(bx->ne[0] > conv->ne[0]);
// last d_conv columns is a new conv state
@@ -274,12 +266,10 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
cb(cur, "result_norm", -1);
res->t_embd = cur;
if (!cparams.embeddings) {
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;
}
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
+1
View File
@@ -199,6 +199,7 @@ llama_build_and_test(test-jinja.cpp)
llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
llama_build_and_test(test-chat-auto-parser.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
llama_build_and_test(test-chat-template.cpp)
llama_build_and_test(test-json-partial.cpp)
llama_build_and_test(test-log.cpp)
llama_build_and_test(
test-peg-parser.cpp
+8 -62
View File
@@ -3298,29 +3298,21 @@ struct test_norm : public test_case {
const std::array<int64_t, 4> ne;
const bool v; // whether a is a non-contiguous view
const float eps;
const bool noncontig_rows;
std::string vars() override {
return VARS_TO_STR5(type, ne, v, eps, noncontig_rows);
return VARS_TO_STR4(type, ne, v, eps);
}
test_norm(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 5, 4, 3},
bool v = false,
float eps = 1e-6f,
bool noncontig_rows = false)
: type(type), ne(ne), v(v), eps(eps), noncontig_rows(noncontig_rows) {}
float eps = 1e-6f)
: type(type), ne(ne), v(v), eps(eps) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
const std::array<int64_t, 4> ne_a = noncontig_rows ?
std::array<int64_t, 4>{ ne[1], ne[0], ne[2], ne[3] } : ne;
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(a, "a");
if (noncontig_rows) {
a = ggml_permute(ctx, a, 1, 0, 2, 3);
ggml_set_name(a, "permuted a");
}
if (v) {
a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
ggml_set_name(a, "view of a");
@@ -6201,29 +6193,21 @@ struct test_l2_norm : public test_case {
const std::array<int64_t, 4> ne;
const float eps;
bool v;
bool noncontig_rows;
std::string vars() override {
return VARS_TO_STR5(type, ne, eps, v, noncontig_rows);
return VARS_TO_STR4(type, ne, eps, v);
}
test_l2_norm(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {64, 64, 320, 1},
float eps = 1e-12f,
bool v = false,
bool noncontig_rows = false)
: type(type), ne(ne), eps(eps), v(v), noncontig_rows(noncontig_rows) {}
bool v = false)
: type(type), ne(ne), eps(eps), v(v) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
const std::array<int64_t, 4> ne_a = noncontig_rows ?
std::array<int64_t, 4>{ ne[1], ne[0], ne[2], ne[3] } : ne;
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(a, "a");
if (noncontig_rows) {
a = ggml_permute(ctx, a, 1, 0, 2, 3);
ggml_set_name(a, "permuted a");
}
if (v) {
a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0);
ggml_set_name(a, "view of a");
@@ -8298,11 +8282,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps));
}
test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, false, eps, true));
test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps));
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false));
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true));
test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false, true));
}
}
@@ -8420,11 +8402,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
}
}
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_MXFP4, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1}));
#if 0
{
// Test paths in OpenCL
@@ -8456,7 +8433,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 1}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {1, 2}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 2}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 4, k, {3, 2}, {2, 2}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {2, 1}));
@@ -8473,7 +8449,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 1, 3, 2}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 3, 2, 1}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 4, k, {2, 3}, {1, 1}, {0, 3, 2, 1}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 1, 3, 2}));
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 3, 2, 1}));
@@ -8599,7 +8574,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
// gpt-oss issue with Vulkan mmq_id
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q4_0, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
for (ggml_type type_a : all_types) {
test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a)));
@@ -9296,34 +9270,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}
struct conv3d_perf_case {
int N, IC, ID, IH, IW, OC, KD, KH, KW, s0, s1, s2, p0, p1, p2, d0, d1, d2;
};
const std::vector<conv3d_perf_case> conv3d_cases = {
{1, 320, 8, 38, 26, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1280, 8, 38, 26, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 320, 8, 76, 52, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 1280, 8, 76, 52, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 320, 8, 152, 104, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
#if 0
// too slow on some devices
{1, 1280, 8, 152, 104, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 320, 4, 304, 208, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
{1, 640, 4, 304, 208, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
#endif
};
for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
for (const conv3d_perf_case & c : conv3d_cases) {
test_cases.emplace_back(new test_conv_3d(
c.N, c.IC, c.ID, c.IH, c.IW,
c.OC, c.KD, c.KH, c.KW,
c.s0, c.s1, c.s2, c.p0, c.p1, c.p2, c.d0, c.d1, c.d2,
kernel_type));
}
}
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
+24 -99
View File
@@ -1562,112 +1562,37 @@ static void test_msgs_oaicompat_json_conversion() {
}
}
static void test_msg_token_delimiters_split() {
static void test_split_by_role() {
LOG_DBG("%s\n", __func__);
// Delimiters that share a leading token, distinguished by the second token,
// to exercise the per-position token matching.
const common_chat_msg_delimiters delims = {
{ { COMMON_CHAT_ROLE_USER, "", { 10, 11 } },
{ COMMON_CHAT_ROLE_ASSISTANT, "", { 10, 12 } } }
};
// Empty inputs
assert_equals<size_t>(0, common_chat_msg_delimiters{}.split({}).spans.size());
assert_equals<size_t>(0, common_chat_msg_delimiters{}.split({ 10, 11 }).spans.size());
assert_equals<size_t>(0, delims.split({}).spans.size());
assert_equals<size_t>(0, common_chat_split_by_role("", {}).size());
assert_equals<size_t>(0, common_chat_split_by_role("hello", {}).size());
assert_equals<size_t>(0, common_chat_split_by_role("", { { "user", "<|user|>" } }).size());
// No delimiters match -> no spans
assert_equals<size_t>(0, delims.split({ 100, 101, 102 }).spans.size());
// Multi-role conversation: <user>Hi<assistant>Hello<user>Bye
// Multi-role conversation, no leading/trailing content
{
const llama_tokens tokens = {
10, 11, // <user>
100, 101, // Hi
10, 12, // <assistant>
200, 201, 202, // Hello
10, 11, // <user>
300, 301, // Bye
};
const std::string prompt = "<|user|>Hi<|assistant|>Hello<|user|>Bye";
const auto splits = common_chat_split_by_role(prompt, {
{ "user", "<|user|>" },
{ "assistant", "<|assistant|>" },
});
assert_equals<size_t>(3, splits.size());
const auto result = delims.split(tokens);
const auto & spans = result.spans;
assert_equals<size_t>(3, spans.size());
assert_equals<std::string>("user", splits[0].role);
assert_equals<size_t>(0, splits[0].pos);
assert_equals<size_t>(10, splits[0].len);
assert_equals<std::string>("<|user|>Hi", prompt.substr(splits[0].pos, splits[0].len));
assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role);
assert_equals<size_t>(0, spans[0].pos);
assert_equals<size_t>(4, spans[0].len);
assert_equals<std::string>("assistant", splits[1].role);
assert_equals<size_t>(10, splits[1].pos);
assert_equals<size_t>(18, splits[1].len);
assert_equals<std::string>("<|assistant|>Hello", prompt.substr(splits[1].pos, splits[1].len));
assert_equals(COMMON_CHAT_ROLE_ASSISTANT, spans[1].role);
assert_equals<size_t>(4, spans[1].pos);
assert_equals<size_t>(5, spans[1].len);
assert_equals(COMMON_CHAT_ROLE_USER, spans[2].role);
assert_equals<size_t>(9, spans[2].pos);
assert_equals<size_t>(4, spans[2].len);
// is_user_start() is true at the token position where a user span begins
assert_equals(true, result.is_user_start(0));
assert_equals(false, result.is_user_start(4)); // assistant span
assert_equals(true, result.is_user_start(9));
}
// Content before the first delimiter is not captured as a span
{
const llama_tokens tokens = {
500, 501, // leading content (dropped)
10, 11, // <user>
100, // Hi
};
const auto spans = delims.split(tokens).spans;
assert_equals<size_t>(1, spans.size());
assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role);
assert_equals<size_t>(2, spans[0].pos);
assert_equals<size_t>(3, spans[0].len);
}
// Skipped regions (media chunks) are jumped over but still count as span content
{
const llama_tokens tokens = {
10, 11, // <user>
LLAMA_TOKEN_NULL, // media chunk (3 tokens)
LLAMA_TOKEN_NULL,
LLAMA_TOKEN_NULL,
100, // Hi
10, 12, // <assistant>
};
const std::map<size_t, size_t> skips = { { 2, 3 } };
const auto spans = delims.split(tokens, skips).spans;
assert_equals<size_t>(2, spans.size());
assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role);
assert_equals<size_t>(0, spans[0].pos);
assert_equals<size_t>(6, spans[0].len);
assert_equals(COMMON_CHAT_ROLE_ASSISTANT, spans[1].role);
assert_equals<size_t>(6, spans[1].pos);
assert_equals<size_t>(2, spans[1].len);
}
// A delimiter sequence inside a skipped region is not matched
{
const llama_tokens tokens = {
10, 11, // <user>
10, 12, // skipped region that happens to contain delimiter tokens
100, // Hi
};
const std::map<size_t, size_t> skips = { { 2, 2 } };
const auto spans = delims.split(tokens, skips).spans;
assert_equals<size_t>(1, spans.size());
assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role);
assert_equals<size_t>(0, spans[0].pos);
assert_equals<size_t>(5, spans[0].len);
assert_equals<std::string>("user", splits[2].role);
assert_equals<size_t>(28, splits[2].pos);
assert_equals<size_t>(11, splits[2].len);
assert_equals<std::string>("<|user|>Bye", prompt.substr(splits[2].pos, splits[2].len));
}
}
@@ -5932,7 +5857,7 @@ int main(int argc, char ** argv) {
{
test_msg_diffs_compute();
test_msgs_oaicompat_json_conversion();
test_msg_token_delimiters_split();
test_split_by_role();
test_tools_oaicompat_json_conversion();
test_convert_responses_to_chatcmpl();
test_developer_role_to_system_workaround();
+287
View File
@@ -0,0 +1,287 @@
#include "common.h"
#include "json-partial.h"
#include <exception>
#include <iostream>
#include <stdexcept>
template <class T> static void assert_equals(const T & expected, const T & actual) {
if (expected != actual) {
std::cerr << "Expected: " << expected << std::endl;
std::cerr << "Actual: " << actual << std::endl;
std::cerr << std::flush;
throw std::runtime_error("Test failed");
}
}
static void test_json_healing() {
auto parse = [](const std::string & str) {
std::cerr << "# Parsing: " << str << '\n';
std::string::const_iterator it = str.begin();
const auto end = str.end();
common_json out;
std::string healing_marker = "$llama.cpp.json$";
if (common_json_parse(it, end, healing_marker, out)) {
auto dump = out.json.dump();
std::cerr << "Parsed: " << dump << '\n';
std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n';
std::string result;
if (!out.healing_marker.json_dump_marker.empty()) {
auto i = dump.find(out.healing_marker.json_dump_marker);
if (i == std::string::npos) {
throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")");
}
result = dump.substr(0, i);
} else {
result = dump;
}
std::cerr << "Result: " << result << '\n';
if (string_starts_with(str, result)) {
std::cerr << "Failure!\n";
}
// return dump;
} else {
throw std::runtime_error("Failed to parse: " + str);
}
};
auto parse_all = [&](const std::string & str) {
for (size_t i = 1; i < str.size(); i++) {
parse(str.substr(0, i));
}
};
parse_all("{\"a\": \"b\"}");
parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}");
parse_all("[{\"a\": \"b\"}]");
auto test = [&](const std::vector<std::string> & inputs, const std::string & expected, const std::string & expected_marker) {
for (const auto & input : inputs) {
common_json out;
assert_equals(true, common_json_parse(input, "$foo", out));
assert_equals<std::string>(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true));
assert_equals<std::string>(expected_marker, out.healing_marker.json_dump_marker);
}
};
// No healing needed:
test(
{
R"([{"a":"b"}, "y"])",
},
R"([{"a":"b"},"y"])",
""
);
// Partial literals can't be healed:
test(
{
R"([1)",
R"([tru)",
R"([n)",
R"([nul)",
R"([23.2)",
},
R"(["$foo"])",
R"("$foo)"
);
test(
{
R"({"a": 1)",
R"({"a": tru)",
R"({"a": n)",
R"({"a": nul)",
R"({"a": 23.2)",
},
R"({"a":"$foo"})",
R"("$foo)"
);
test(
{
R"({)",
},
R"({"$foo":1})",
R"("$foo)"
);
test(
{
R"([)",
},
R"(["$foo"])",
R"("$foo)"
);
// Healing right after a full literal
test(
{
R"(1 )",
},
R"(1)",
""
);
test(
{
R"(true)",
R"(true )",
},
R"(true)",
""
);
test(
{
R"(null)",
R"(null )",
},
R"(null)",
""
);
test(
{
R"([1 )",
},
R"([1,"$foo"])",
R"(,"$foo)"
);
test(
{
R"([{})",
R"([{} )",
},
R"([{},"$foo"])",
R"(,"$foo)"
);
test(
{
R"([true)",
},
// TODO: detect the true/false/null literal was complete
R"(["$foo"])",
R"("$foo)"
);
test(
{
R"([true )",
},
R"([true,"$foo"])",
R"(,"$foo)"
);
test(
{
R"([true,)",
},
R"([true,"$foo"])",
R"("$foo)"
);
// Test nesting
test(
{
R"([{"a": [{"b": [{)",
},
R"([{"a":[{"b":[{"$foo":1}]}]}])",
R"("$foo)"
);
test(
{
R"([{"a": [{"b": [)",
},
R"([{"a":[{"b":["$foo"]}]}])",
R"("$foo)"
);
test(
{
R"([{"a": "b"})",
R"([{"a": "b"} )",
},
R"([{"a":"b"},"$foo"])",
R"(,"$foo)"
);
test(
{
R"([{"a": "b"},)",
R"([{"a": "b"}, )",
},
R"([{"a":"b"},"$foo"])",
R"("$foo)"
);
test(
{
R"({ "code)",
},
R"({"code$foo":1})",
R"($foo)"
);
test(
{
R"({ "code\)",
},
R"({"code\\$foo":1})",
R"(\$foo)"
);
test(
{
R"({ "code")",
},
R"({"code":"$foo"})",
R"(:"$foo)"
);
test(
{
R"({ "key")",
},
R"({"key":"$foo"})",
R"(:"$foo)"
);
// Test unicode escape sequences
test(
{
R"({"a":"\u)",
},
R"({"a":"\u0000$foo"})",
R"(0000$foo)"
);
test(
{
R"({"a":"\u00)",
},
R"({"a":"\u0000$foo"})",
R"(00$foo)"
);
test(
{
R"({"a":"\ud300)",
},
R"({"a":"\ud300$foo"})",
R"($foo)"
);
test(
{
R"({"a":"\ud800)",
},
R"({"a":"\ud800\udc00$foo"})",
R"(\udc00$foo)"
);
test(
{
R"({"a":"\ud800\)",
},
R"({"a":"\ud800\udc00$foo"})",
R"(udc00$foo)"
);
test(
{
R"({"a":"\ud800\u)",
},
R"({"a":"\ud800\udc00$foo"})",
R"(dc00$foo)"
);
test(
{
R"({"a":"\ud800\udc00)",
},
R"({"a":"\ud800\udc00$foo"})",
R"($foo)"
);
}
int main() {
test_json_healing();
std::cerr << "All tests passed.\n";
return 0;
}
+1 -1
View File
@@ -42,7 +42,6 @@
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_N_HEAD_KV "clip.%s.attention.head_count_kv"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_FEATURE_LAYERS "clip.%s.feature_layer"
// vision-specific
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
@@ -55,6 +54,7 @@
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side"
#define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side"
+3 -3
View File
@@ -91,7 +91,7 @@ struct clip_hparams {
float eps = 1e-6;
float rope_theta = 0.0;
std::vector<int32_t> feature_layers;
std::vector<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
@@ -165,8 +165,8 @@ struct clip_hparams {
return false;
}
bool is_feature_layer(int32_t layer) const {
return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end();
bool is_vision_feature_layer(int32_t layer) const {
return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end();
}
};
+10 -9
View File
@@ -1264,10 +1264,12 @@ struct clip_model_loader {
}
}
// Load the vision/audio feature layer indices if they are explicitly provided
// Load the vision feature layer indices if they are explicitly provided;
// if multiple vision feature layers are present, the values will be concatenated
// to form the final visual features.
// NOTE: gguf conversions should standardize the values of the vision feature layer to
// be non-negative, since we use -1 to mark values as unset here.
get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false);
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false);
// model-specific params
switch (model.proj_type) {
@@ -1649,7 +1651,6 @@ struct clip_model_loader {
get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size);
get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count);
// NOTE: feature layers loaded above in common path
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
@@ -1662,11 +1663,11 @@ struct clip_model_loader {
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
hparams.image_resize_pad = PAD_CEIL;
// NOTE: feature_layers loaded in common path as optional
get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer);
get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets);
if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) {
throw std::runtime_error(string_format("%s: feature_layers.size() %d != proj_spatial_offsets.size() %d",
hparams.feature_layers.size(), hparams.proj_spatial_offsets.size()));
if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) {
throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d",
hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size()));
}
get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side);
@@ -2739,7 +2740,7 @@ struct clip_model_loader {
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
// Load separate layerwise and spatial projector tensors
const auto projector_count = hparams.feature_layers.size();
const auto projector_count = hparams.vision_feature_layer.size();
model.qf_proj_blocks.resize(projector_count);
for (size_t bid = 0; bid < projector_count; ++bid) {
auto & b = model.qf_proj_blocks[bid];
@@ -4387,7 +4388,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
// Stage 1b only uses block 0's permutations; future stages
// will upload all blocks.
for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) {
for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) {
const std::string prefix = "g4v_blk" + std::to_string(bid) + "_";
upload(prefix + "win_idx", make_win_idx(image_side, window_side));
upload(prefix + "qwin_idx", make_win_idx(new_side, query_side));
+1 -35
View File
@@ -1,7 +1,5 @@
#include "models.h"
#include <algorithm>
ggml_cgraph * clip_graph_granite_speech::build() {
const int n_frames = img.nx();
const int context_size = hparams.audio_chunk_size;
@@ -13,10 +11,6 @@ ggml_cgraph * clip_graph_granite_speech::build() {
const int padded_len = num_blocks * context_size;
const int remainder = n_frames % context_size;
// Calculate projector input dimension based on feature layers
const int proj_input_dim = n_embd * (hparams.feature_layers.size() + 1);
const bool use_feature_concat = !hparams.feature_layers.empty();
ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
ggml_set_name(attn_dists, "attn_dists");
ggml_set_input(attn_dists);
@@ -37,15 +31,6 @@ ggml_cgraph * clip_graph_granite_speech::build() {
cur = ggml_add(ctx0, cur, model.inp_proj_b);
cb(cur, "inp_linear", -1);
// Capture layer 0 if requested (after input_linear)
ggml_tensor * concat_result = nullptr;
if (use_feature_concat) {
if (std::find(hparams.feature_layers.begin(), hparams.feature_layers.end(), 0) != hparams.feature_layers.end()) {
concat_result = cur;
cb(concat_result, "feature_layer_0", -1);
}
}
for (int il = 0; il < n_layer; il++) {
const auto & layer = model.layers[il];
auto * residual = cur;
@@ -183,18 +168,6 @@ ggml_cgraph * clip_graph_granite_speech::build() {
NORM_TYPE_NORMAL, eps, il);
cb(cur, "layer_out", il);
// Capture intermediate layer (il + 1) if requested
if (use_feature_concat) {
if (hparams.is_feature_layer(il + 1)) {
if (concat_result == nullptr) {
concat_result = cur;
} else {
concat_result = ggml_concat(ctx0, concat_result, cur, 0);
}
cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il);
}
}
// CTC branch
if (il + 1 == ctc_layer) {
auto * mid = build_mm(model.ctc_out_w, cur);
@@ -207,13 +180,6 @@ ggml_cgraph * clip_graph_granite_speech::build() {
}
}
// Append final output to concatenated features if using feature concatenation
if (use_feature_concat && concat_result != nullptr) {
concat_result = ggml_concat(ctx0, concat_result, cur, 0);
cb(concat_result, "concat_final", -1);
cur = concat_result;
}
cb(cur, "encoder_out", -1);
// QFormer projector
@@ -231,7 +197,7 @@ ggml_cgraph * clip_graph_granite_speech::build() {
cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
}
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj);
ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query,
model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b,
+2 -2
View File
@@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() {
}
// --- Stage 1b/1c: WindowQFormer blocks ---
const int projector_count = hparams.feature_layers.size();
const int projector_count = hparams.vision_feature_layer.size();
const float qformer_eps = 1e-12f;
ggml_tensor * mmproj = nullptr;
for (int bid = 0; bid < projector_count; ++bid) {
const auto & blk = model.qf_proj_blocks[bid];
int vlayer = hparams.feature_layers[bid];
int vlayer = hparams.vision_feature_layer[bid];
GGML_ASSERT(vlayer >= 0 && vlayer < n_layer);
ggml_tensor * h = layer_outs[vlayer];

Some files were not shown because too many files have changed in this diff Show More