Compare commits

...

8 Commits
b6085 ... b6093

Author SHA1 Message Date
Romain Biessy
3306ceabf0 sycl: fix mul_mat selection (#15092) 2025-08-05 18:39:55 +02:00
Juk Armstrong
c81de6e107 Fix glm4moe bug (#15088) 2025-08-05 13:56:44 +01:00
Alex Wu
22f060c9c4 webui: fix markdown table (#15081)
* webui: fix markdown table

* webui: fix table display with themes
2025-08-05 13:56:44 +02:00
compilade
ee3a9fcf88 context : fix index overflow on huge outputs (#15080)
* context : fix overflow when re-ordering huge outputs

* context : fix logits size overflow for huge batches
2025-08-05 11:27:45 +02:00
Diego Devesa
ec428b02c3 llama : add --n-cpu-moe option (#15077)
* llama : add --n-cpu-moe option

Keeps the MoE weights of the first N layers in the CPU
2025-08-05 01:05:36 +02:00
compilade
19f68fa5a4 imatrix : warn when GGUF imatrix is saved without .gguf suffix (#15076)
* imatrix : add warning when suffix is not .gguf for GGUF imatrix

* imatrix : only warn about suffix when output format is unspecified
2025-08-04 23:26:52 +02:00
Christian Kastner
41613437ff cmake: Add GGML_BACKEND_DIR option (#15074)
* cmake: Add GGML_BACKEND_DIR option

This can be used by distributions to specify where to look for backends
when ggml is built with GGML_BACKEND_DL=ON.

* Fix phrasing
2025-08-04 21:29:14 +02:00
Sigbjørn Skjæret
e5bebe5251 gguf-py : add --chat-template-file to gguf_new_metadata (#15075) 2025-08-04 21:01:48 +02:00
13 changed files with 89 additions and 33 deletions

View File

@@ -24,6 +24,7 @@
#include <cstdarg>
#include <filesystem>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
throw std::invalid_argument("unknown buffer type");
}
// FIXME: this leaks memory
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
// keep strings alive and avoid leaking memory by storing them in a static list (std::list never relocates its elements, so the stored c_str() pointers stay valid)
static std::list<std::string> buft_overrides;
buft_overrides.push_back(tensor_name);
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
}
}
));
add_opt(common_arg(
{"--cpu-moe"},
"use CPU for Mixture of Experts (MoE) weights",
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
{"--n-cpu-moe", "-ncmoe"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
[](common_params & params, int value) {
if (value < 0) {
throw std::invalid_argument("invalid value");
}
for (int i = 0; i < value; ++i) {
// keep strings alive and avoid leaking memory by storing them in a static list (std::list never relocates its elements, so the stored c_str() pointers stay valid)
static std::list<std::string> buft_overrides;
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
).set_env("LLAMA_ARG_N_CPU_MOE"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
@@ -2649,10 +2665,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--output-format"}, "{gguf,dat}",
string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
[](common_params & params, const std::string & value) {
/**/ if (value == "gguf") { params.imat_dat = false; }
else if (value == "dat") { params.imat_dat = true; }
/**/ if (value == "gguf") { params.imat_dat = -1; }
else if (value == "dat") { params.imat_dat = 1; }
else { throw std::invalid_argument("invalid output format"); }
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));

View File

@@ -439,7 +439,7 @@ struct common_params {
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
bool imat_dat = false; // whether the legacy imatrix.dat format should be output
int8_t imat_dat = 0; // imatrix output format: < 0 = gguf requested explicitly, 0 = unspecified (defaults to gguf), > 0 = legacy imatrix.dat
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity

View File

@@ -39,8 +39,9 @@ if (WIN32)
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
# Optional directory for dynamically loaded backends; setting it requires GGML_BACKEND_DL to be enabled.
set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL)")
#
# option list

View File

@@ -106,7 +106,7 @@ if(NOT TARGET ggml::ggml)
find_library(GGML_LIBRARY ggml
REQUIRED
HINTS ${GGML_LIB_DIR}
HINTS ${GGML_LIB_DIR} ${GGML_BACKEND_DIR}
NO_CMAKE_FIND_ROOT_PATH)
add_library(ggml::ggml UNKNOWN IMPORTED)

View File

@@ -214,6 +214,13 @@ add_library(ggml
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)
if (GGML_BACKEND_DIR)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
endif()
target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
endif()
target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
if (GGML_BACKEND_DIR)
install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
else()
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
else()
add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend})

View File

@@ -498,6 +498,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
std::vector<fs::path> search_paths;
if (user_search_path == nullptr) {
#ifdef GGML_BACKEND_DIR
search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
#endif
// default search paths: executable directory, current directory
search_paths.push_back(get_executable_path());
search_paths.push_back(fs::current_path());

View File

@@ -2609,6 +2609,8 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(src1->ne[1] == 1);
GGML_ASSERT(src1->ne[3] == 1);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
@@ -3196,7 +3198,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
}
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1 && src1->ne[3] == 1) {
// KQV single-batch
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) {

View File

@@ -111,6 +111,7 @@ def main() -> None:
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
parser.add_argument("--chat-template-file", type=Path, help="Jinja file containing chat template", metavar='chat_template.jinja')
parser.add_argument("--pre-tokenizer", type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
@@ -134,12 +135,17 @@ def main() -> None:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
if args.chat_template_config:
with open(args.chat_template_config, 'r') as fp:
with open(args.chat_template_config, 'r', encoding='utf-8') as fp:
config = json.load(fp)
template = config.get('chat_template')
if template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if args.chat_template_file:
with open(args.chat_template_file, 'r', encoding='utf-8') as fp:
template = fp.read()
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if args.pre_tokenizer:
new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)

View File

@@ -786,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;
const int64_t n_embd = hparams.n_embd;
const int32_t n_vocab = model.vocab.n_tokens();
const int64_t n_vocab = model.vocab.n_tokens();
// note: during encode, we always pass the full sequence starting from pos = 0
if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -959,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const int32_t n_vocab = vocab.n_tokens();
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
// when computing embeddings, all tokens are output
@@ -1328,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
}
void llama_context::output_reorder() {
const uint32_t n_vocab = model.vocab.n_tokens();
const uint64_t n_vocab = model.vocab.n_tokens();
const uint64_t n_embd = model.hparams.n_embd;
for (uint32_t s = 0; s < output_swaps.size(); ++s) {
const uint32_t i0 = output_swaps[s].i0;
const uint32_t i1 = output_swaps[s].i1;
for (size_t s = 0; s < output_swaps.size(); ++s) {
const uint64_t i0 = output_swaps[s].i0;
const uint64_t i1 = output_swaps[s].i1;
if (logits_size > 0) {
for (uint32_t k = 0; k < n_vocab; k++) {
for (uint64_t k = 0; k < n_vocab; k++) {
std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
}
}
if (embd_size > 0) {
for (uint32_t k = 0; k < n_embd; k++) {
for (uint64_t k = 0; k < n_embd; k++) {
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
}
}

View File

@@ -13800,10 +13800,6 @@ struct llm_build_glm4_moe : public llm_graph_context {
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE layer with shared experts
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
// Process routed experts using existing MoE infrastructure
ggml_tensor * routed_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,

View File

@@ -506,13 +506,17 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
auto fname = m_params.out_file;
bool use_legacy_format = m_params.imat_dat;
int8_t use_legacy_format = m_params.imat_dat;
if (use_legacy_format) {
if (use_legacy_format > 0) {
this->save_imatrix_legacy(n_chunk);
return;
}
// else, default to GGUF imatrix
// only warn when `--output-format gguf` is not specified
if (use_legacy_format == 0 && !string_ends_with(fname, ".gguf")) {
LOG_WRN("\n%s: saving imatrix using GGUF format with a different suffix than .gguf\n", __func__);
LOG_WRN("%s: if you want the previous imatrix format, use --output-format dat\n", __func__);
}
if (n_chunk > 0) {
fname += ".at_";

Binary file not shown.

View File

@@ -31,7 +31,24 @@ html {
hr {
@apply my-4 border-base-content/20 border-1;
}
/* TODO: fix markdown table */
table {
@apply w-full border-collapse text-sm font-sans my-4 text-base-content;
}
thead {
@apply bg-base-200 text-base-content;
}
th {
@apply border border-base-300 px-4 py-2 text-left font-semibold;
}
td {
@apply border border-base-300 px-4 py-2 align-top;
}
tbody tr:nth-child(even) {
@apply bg-base-100;
}
tbody tr:hover {
@apply bg-base-200;
}
}
.btn-mini {