cli : fix not copying preserved tokens (#24258)

Add cohere2moe to llama-vocab for TINY_AYA (#24601)
ci : use CUDA label for cuda backend (#24594)
2026-06-14 14:55:49 +02:00 · 2026-06-14 11:52:15 +02:00 · 2026-06-14 09:04:46 +02:00 · 2026-06-14 08:27:52 +02:00
3 changed files with 13 additions and 5 deletions
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,7 +12,7 @@ SYCL:
            - ggml/src/ggml-sycl/**
            - docs/backend/SYCL.md
            - examples/sycl/**
-Nvidia GPU:
+CUDA:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/include/ggml-cuda.h
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2280,7 +2280,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                clean_spaces = false;
                ignore_merges = true;
            } else if (
-                tokenizer_pre == "tiny_aya") {
+                tokenizer_pre == "tiny_aya" ||
+                tokenizer_pre == "cohere2moe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA;
                clean_spaces = false;
            } else if (
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -97,11 +97,18 @@ struct cli_context {
                task.params.chat_parser_params.parser.load(chat_params.parser);
            }

+            // Copy the preserved tokens into the sampling params
+            const llama_vocab * vocab = llama_model_get_vocab(
+                llama_get_model(ctx_server.get_llama_context()));
+            for (const auto & token : chat_params.preserved_tokens) {
+                auto ids = common_tokenize(vocab, token, false, true);
+                if (ids.size() == 1) {
+                    task.params.sampling.preserved_tokens.insert(ids[0]);
+                }
+            }
+
            // reasoning budget sampler
            if (!chat_params.thinking_end_tag.empty()) {
-                const llama_vocab * vocab = llama_model_get_vocab(
-                    llama_get_model(ctx_server.get_llama_context()));
-
                task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
                task.params.sampling.generation_prompt = chat_params.generation_prompt;
Author	SHA1	Message	Date
Michael Wand	6e14286eda	cli : fix not copying preserved tokens (#24258)	2026-06-14 11:52:15 +02:00
Bartowski	8ed274ef46	Add cohere2moe to llama-vocab for TINY_AYA (#24601)	2026-06-14 09:04:46 +02:00
Sigbjørn Skjæret	46722116b9	ci : use CUDA label for cuda backend (#24594)	2026-06-14 08:27:52 +02:00