Compare commits

..

3 Commits

Author SHA1 Message Date
Michael Wand
6e14286eda cli : fix not copying preserved tokens (#24258) 2026-06-14 11:52:15 +02:00
Bartowski
8ed274ef46 Add cohere2moe to llama-vocab for TINY_AYA (#24601) 2026-06-14 09:04:46 +02:00
Sigbjørn Skjæret
46722116b9 ci : use CUDA label for cuda backend (#24594) 2026-06-14 08:27:52 +02:00
3 changed files with 13 additions and 5 deletions

2
.github/labeler.yml vendored
View File

@@ -12,7 +12,7 @@ SYCL:
 - ggml/src/ggml-sycl/**
 - docs/backend/SYCL.md
 - examples/sycl/**
-Nvidia GPU:
+CUDA:
 - changed-files:
 - any-glob-to-any-file:
 - ggml/include/ggml-cuda.h

View File

@@ -2280,7 +2280,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 clean_spaces = false;
 ignore_merges = true;
 } else if (
-tokenizer_pre == "tiny_aya") {
+tokenizer_pre == "tiny_aya" ||
+tokenizer_pre == "cohere2moe") {
 pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
 clean_spaces = false;
 } else if (

View File

@@ -97,11 +97,18 @@ struct cli_context {
task.params.chat_parser_params.parser.load(chat_params.parser);
}
// Copy the preserved tokens into the sampling params
const llama_vocab * vocab = llama_model_get_vocab(
llama_get_model(ctx_server.get_llama_context()));
for (const auto & token : chat_params.preserved_tokens) {
auto ids = common_tokenize(vocab, token, false, true);
if (ids.size() == 1) {
task.params.sampling.preserved_tokens.insert(ids[0]);
}
}
// reasoning budget sampler
if (!chat_params.thinking_end_tag.empty()) {
const llama_vocab * vocab = llama_model_get_vocab(
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
task.params.sampling.generation_prompt = chat_params.generation_prompt;