Compare commits


6 Commits

Author SHA1 Message Date
Charles Xu
eff06702b2 kleidiai : update to v1.24.0 and use release archive (#22549) 2026-05-04 22:13:31 +03:00
leonardHONG
e77056f9b2 CUDA: use fastdiv for batch index split in get_rows (#22650) 2026-05-04 16:24:05 +02:00
Xuan-Son Nguyen
935a340292 server: implement /models?reload=1 (#21848) 2026-05-04 16:23:26 +02:00
Shakhnazar Sailaukan
d8794eecd5 examples: refactor diffusion generation (#22590)
* examples: refactor diffusion generation

* renamed enum values
2026-05-04 20:19:30 +08:00
JusteLeo
36a694c965 webui : fix circular dependency between chat.service.ts and models.svelte.ts (#22625) 2026-05-04 13:38:10 +02:00
Piotr Wilkin (ilintar)
a4701c98f7 common/autoparser: fixes for newline handling / forced tool calls (#22654)
* chat/autoparser: the fixes

* Move optspace() to chat-peg-parser, comment out server tests invalidated due to content now allowed with forced tool calls.

* Trim whitespace on apply instead
2026-05-04 13:18:11 +02:00
24 changed files with 1227 additions and 653 deletions

View File

@@ -3794,7 +3794,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-algorithm"}, "N",
string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
string_format(
"diffusion algorithm: 0=DIFFUSION_ALGORITHM_ORIGIN, 1=DIFFUSION_ALGORITHM_ENTROPY_BASED, "
"2=DIFFUSION_ALGORITHM_MARGIN_BASED, 3=DIFFUSION_ALGORITHM_RANDOM, "
"4=DIFFUSION_ALGORITHM_CONFIDENCE_BASED (default: %d)", params.diffusion.algorithm),
[](common_params & params, int value) { params.diffusion.algorithm = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(

View File

@@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
if (!end.empty()) {
if (!start.empty()) {
// Standard tag-based: optional(<think>reasoning</think>)
return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
}
// Delimiter-style (empty start)
return p.optional(p.reasoning(p.until(end)) + end + p.space());
return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
}
}
@@ -186,7 +186,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
auto & p = ctx.p;
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
// Build effective field names with dot notation if function_field is set
std::string name_field = format.name_field;
@@ -225,8 +224,7 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
tool_start = format.per_call_start;
}
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
p.end();
return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
}
common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
@@ -270,7 +268,6 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
auto & p = ctx.p;
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
common_peg_parser tool_choice = p.choice();
@@ -336,14 +333,12 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
p.end();
return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
}
common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
auto & p = ctx.p;
const auto & inputs = ctx.inputs;
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
@@ -471,8 +466,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
p.end();
return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
}
} // namespace autoparser

View File

@@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
if (left_trimmed.empty() && !diff.right.empty()) {
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
if (start.empty()) {
start = trim_leading_whitespace(diff.right);
start = diff.right;
mode = reasoning_mode::TAG_BASED;
}
}
@@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
start = seg[seg.size() - 2].value;
}
end = trim_trailing_whitespace(diff.left);
end = diff.left;
mode = reasoning_mode::TAG_BASED;
}
}
@@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) {
start = result.tags["pre"];
end = trim_trailing_whitespace(result.tags["post"]);
end = result.tags["post"];
} else {
auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
});
result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
if (result.result.success()) {
end = trim_trailing_whitespace(result.tags["post"]);
end = result.tags["post"];
} else {
LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
mode = reasoning_mode::NONE;

View File

@@ -816,6 +816,32 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
return literal(s.substr(0, s.rfind(delimiter)));
}
common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
auto parser = eps();
size_t end_of_prefix_space = tag.size();
size_t start_of_suffix_space = tag.size();
for (size_t i = 0; i < tag.size(); i++) {
if (!std::isspace(tag[i])) {
end_of_prefix_space = i;
break;
}
}
for (size_t i = tag.size(); i > 0; i--) {
if (!std::isspace(tag[i - 1])) {
start_of_suffix_space = i;
break;
}
}
for (size_t i = 0; i < end_of_prefix_space; i++) {
parser += optional(literal(std::string(1, tag[i])));
}
parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
parser += optional(literal(std::string(1, tag[i])));
}
return parser;
}
common_peg_parser common_chat_peg_builder::standard_json_tools(
const std::string & section_start,
const std::string & section_end,

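To make the new `optspace()` behavior concrete: it splits the tag into leading whitespace, a mandatory non-whitespace core, and trailing whitespace, and wraps only the whitespace characters in `optional()`. Below is a minimal illustration using the `optional()`/`literal()`/`eps()` helpers visible in this hunk; the concrete tag string is a hypothetical example, not taken from a specific chat template.

```cpp
// Sketch only: the parser that optspace("\n</think>\n") composes, following the
// loops above. The tag value here is hypothetical.
common_peg_parser parser = p.eps();
parser += p.optional(p.literal("\n")); // leading whitespace: each char becomes an optional literal
parser += p.literal("</think>");       // non-whitespace core: still a mandatory literal
parser += p.optional(p.literal("\n")); // trailing whitespace: optional again
// i.e. "</think>" now matches with or without the surrounding newlines
```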
View File

@@ -96,6 +96,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
// Return a parser that parses the prefix of a string, up to a given delimiter.
common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
// Return a parser that parses all elements of tag, but leading and trailing spaces are optional
common_peg_parser optspace(const std::string & tag);
// Legacy-compatible helper for building standard JSON tool calls
// Used by tests and manual parsers
// name_key/args_key: JSON key names for function name and arguments

View File

@@ -2221,8 +2221,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
if (auto_params.supports_thinking) {
auto_params.thinking_start_tag = autoparser.reasoning.start;
auto_params.thinking_end_tag = autoparser.reasoning.end;
auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
auto_params.thinking_end_tag = trim_whitespace(autoparser.reasoning.end);
}
auto_params.generation_prompt = params.generation_prompt;
common_peg_arena arena;

View File

@@ -158,6 +158,8 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
for (size_t i = 0; i < cur_p->size; i++) {
if (cur_p->data[i].id != forced) {
cur_p->data[i].logit = -INFINITY;
} else {
cur_p->data[i].logit = +INFINITY; // force the token
}
}
}

View File

@@ -1,5 +1,10 @@
set(TARGET llama-diffusion)
add_library(${TARGET} STATIC diffusion.cpp diffusion.h)
target_link_libraries(${TARGET} PUBLIC llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC cxx_std_17)
set(TARGET llama-diffusion-cli)
add_executable(${TARGET} diffusion-cli.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama-diffusion llama llama-common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -12,11 +12,11 @@ The diffusion CLI supports various parameters to control the generation process:
### Core Diffusion Parameters
- `--diffusion-steps`: Number of diffusion steps (default: 256)
- `--diffusion-algorithm`: Algorithm for token selection
- `0`: ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
- `1`: ENTROPY_BASED - Entropy-based selection
- `2`: MARGIN_BASED - Margin-based selection
- `3`: RANDOM - Random selection
- `4`: CONFIDENCE_BASED - Confidence-based selection (default)
- `0`: DIFFUSION_ALGORITHM_ORIGIN - Tokens are generated in a purely random order (from https://arxiv.org/abs/2107.03006)
- `1`: DIFFUSION_ALGORITHM_ENTROPY_BASED - Entropy-based selection
- `2`: DIFFUSION_ALGORITHM_MARGIN_BASED - Margin-based selection
- `3`: DIFFUSION_ALGORITHM_RANDOM - Random selection
- `4`: DIFFUSION_ALGORITHM_CONFIDENCE_BASED - Confidence-based selection (default)
- More documentation: https://github.com/DreamLM/Dream
- `--diffusion-visual`: Enable live visualization during generation

View File

@@ -1,127 +1,23 @@
#include "arg.h"
#include "chat.h"
#include "common.h"
#include "diffusion.h"
#include "llama.h"
#include "log.h"
#include <limits.h>
#include <algorithm>
#include <clocale>
#include <cmath>
#include <cstring>
#include <limits>
#include <random>
#include <string>
#include <vector>
enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
// Unified transfer scheduling methods
enum transfer_schedule {
TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining
BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens
};
typedef bool (*diffusion_step_callback_t)(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data);
struct diffusion_params {
int32_t steps = 0;
float temperature = 0;
llama_token mask_token_id = LLAMA_TOKEN_NULL;
diffusion_step_callback_t step_callback = nullptr;
void * step_callback_user_data = nullptr;
int32_t seed = 0;
bool visual_mode = false;
bool shift_logits = false; // Shift logits by -1 after decode
float top_p = 0.;
int32_t top_k = 0.;
diffusion_algorithm algorithm = CONFIDENCE_BASED;
transfer_schedule schedule = TIMESTEP_BASED;
float cfg_scale = 0.; // Config scale for classifier-free guidance
float eps = 0.; // Timestep scheduling
int32_t block_length = 0; // Block size (for block scheduling)
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
int32_t max_length = 0; // Maximum sequence length
};
struct callback_data {
diffusion_params * diff_params;
const llama_vocab * vocab;
int32_t n_input;
};
static float calculate_confidence(const llama_token_data_array & cur_p,
diffusion_algorithm algorithm,
std::mt19937 & rng) {
switch (algorithm) {
case CONFIDENCE_BASED:
return cur_p.data[cur_p.selected].p; // Selected token probability
case ENTROPY_BASED:
{
float entropy = 0.0f;
const float epsilon = 1e-10f;
for (size_t i = 0; i < cur_p.size; i++) {
float prob = cur_p.data[i].p;
entropy += prob * logf(prob + epsilon);
}
return -entropy; // Higher entropy = lower confidence
}
case MARGIN_BASED:
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
case RANDOM:
{
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
return uniform(rng); // Random confidence
}
case ORIGIN:
return cur_p.data[cur_p.selected].p;
default:
return 0.0f;
}
}
// Unified transfer count calculation function
static int32_t calculate_transfer_count(int32_t step,
int32_t total_steps,
int32_t remaining_masked,
transfer_schedule schedule,
float eps,
const std::vector<int32_t> & num_transfer_tokens = {}) {
switch (schedule) {
case TIMESTEP_BASED:
{
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
return (int32_t) (remaining_masked * p_transfer);
}
case BLOCK_BASED:
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
return num_transfer_tokens[step];
}
return remaining_masked / (total_steps - step); // Fallback
default:
return remaining_masked / (total_steps - step);
}
}
static bool diffusion_step_callback(int32_t step,
int32_t total_steps,
const llama_token * tokens,
@@ -176,341 +72,6 @@ static bool diffusion_step_callback(int32_t step,
return true;
}
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
if (temperature == 0.0f) {
return;
}
std::uniform_real_distribution<double> uniform(0.0, 1.0);
for (int32_t i = 0; i < n_vocab; i++) {
double noise = uniform(rng);
// Prevent log(0)
noise = std::max(noise, 1e-20);
double gumbel_noise = std::pow(-std::log(noise), temperature);
logits[i] = std::exp(logits[i]) / gumbel_noise;
}
}
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
std::vector<int32_t> num_transfer_tokens(steps);
int32_t base = mask_count / steps;
int32_t remainder = mask_count % steps;
for (int32_t i = 0; i < steps; i++) {
num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
}
return num_transfer_tokens;
}
static void diffusion_generate(llama_context * ctx,
const llama_token * input_tokens,
llama_token * output_tokens,
int32_t n_input,
const diffusion_params & params,
int32_t & n_generated) {
n_generated = 0;
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
return;
}
const llama_model * model = llama_get_model(ctx);
// Initialize with input and pad with mask tokens
std::copy(input_tokens, input_tokens + n_input, output_tokens);
std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
std::mt19937 rng(params.seed);
llama_set_causal_attn(ctx, false);
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
std::vector<llama_token_data> candidates(n_vocab);
std::vector<llama_token_data> conf_candidates;
conf_candidates.reserve(params.max_length);
std::vector<int32_t> mask_positions;
mask_positions.reserve(params.max_length);
// Setup sampler chain
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
if (params.top_k > 0) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
}
if (params.top_p < 1.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
}
if (params.temperature > 0.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
}
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
llama_batch batch = llama_batch_init(params.max_length, 0, 1);
batch.n_tokens = params.max_length;
// Pre-allocate buffers for CFG if needed
int32_t logits_size = n_vocab * params.max_length;
std::vector<float> cond_logits_buffer;
std::vector<llama_token> un_x_buffer;
if (params.cfg_scale > 0.0f) {
cond_logits_buffer.resize(logits_size);
un_x_buffer.resize(params.max_length);
}
// For block-based processing
std::vector<int32_t> num_transfer_tokens;
int32_t num_blocks = 1;
int32_t steps_per_block = params.steps;
if (params.schedule == BLOCK_BASED) {
GGML_ASSERT(params.max_length % params.block_length == 0);
num_blocks = params.max_length / params.block_length;
GGML_ASSERT(params.steps % num_blocks == 0);
steps_per_block = params.steps / num_blocks;
}
std::vector<float> confidence(params.max_length);
int64_t total_sampling_time = 0;
int64_t total_time = 0;
int64_t time_start = ggml_time_us();
for (int block_num = 0; block_num < num_blocks; block_num++) {
int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
int32_t block_end = (params.schedule == BLOCK_BASED) ?
std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
params.max_length;
// Count masked tokens in current block for block-based processing
if (params.schedule == BLOCK_BASED) {
int32_t block_mask_count = 0;
for (int i = block_start; i < block_end; i++) {
if (output_tokens[i] == params.mask_token_id) {
block_mask_count++;
}
}
num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
}
for (int32_t step = 0; step < steps_per_block; step++) {
int32_t global_step = block_num * steps_per_block + step;
if (params.step_callback) {
if (!params.step_callback(
global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
break;
}
}
// Setup batch
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = output_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = 1;
}
float * logits = nullptr;
if (params.cfg_scale > 0.0f) {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate conditional");
break;
}
float * cond_logits_ptr = llama_get_logits(ctx);
std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
// Unconditional generation (mask input)
std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
for (int32_t i = 0; i < n_input; i++) {
un_x_buffer[i] = params.mask_token_id;
}
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = un_x_buffer[i];
}
ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate unconditional");
break;
}
float * uncond_logits = llama_get_logits(ctx);
// Apply CFG
for (int32_t i = 0; i < logits_size; i++) {
cond_logits_buffer[i] =
uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
}
logits = cond_logits_buffer.data();
} else {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
break;
}
logits = llama_get_logits(ctx);
}
if (!logits) {
LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
break;
}
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
if (params.shift_logits) {
return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
}
return logits + (pos) *n_vocab;
};
int64_t time_start_sampling = ggml_time_us();
mask_positions.clear();
for (int32_t i = 0; i < params.max_length; i++) {
if (output_tokens[i] == params.mask_token_id) {
// For block-based, only consider current block
if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
mask_positions.push_back(i);
}
}
}
if (mask_positions.empty()) {
break;
}
if (params.add_gumbel_noise && params.temperature > 0.0f) {
add_gumbel_noise(logits, n_vocab, params.temperature, rng);
}
if (params.algorithm == ORIGIN) {
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
float p_transfer = (float) transfer_count / mask_positions.size();
for (int32_t pos : mask_positions) {
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].id = token_id;
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
}
llama_token_data_array cur_p = {
candidates.data(),
(size_t) n_vocab,
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
output_tokens[pos] = cur_p.data[cur_p.selected].id;
}
}
} else {
std::vector<std::pair<float, int32_t>> confidences;
std::vector<llama_token> sampled_tokens(mask_positions.size());
for (size_t i = 0; i < mask_positions.size(); i++) {
int32_t pos = mask_positions[i];
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
candidates[token_id].id = token_id;
}
llama_token_data_array cur_p = {
candidates.data(),
candidates.size(),
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
llama_token sampled_token = cur_p.data[cur_p.selected].id;
float conf = calculate_confidence(cur_p, params.algorithm, rng);
sampled_tokens[i] = sampled_token;
confidences.emplace_back(conf, i);
}
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
if (transfer_count > 0) {
if (params.alg_temp == 0.0f) {
std::partial_sort(confidences.begin(),
confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
confidences.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
if (a.first != b.first) {
return a.first > b.first;
}
return a.second < b.second;
});
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
int32_t mask_idx = confidences[i].second;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
}
} else {
conf_candidates.clear();
for (size_t i = 0; i < confidences.size(); i++) {
float conf_logit = confidences[i].first / params.alg_temp;
conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
}
llama_token_data_array conf_array = {
conf_candidates.data(),
conf_candidates.size(),
-1,
false,
};
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
llama_sampler_apply(dist_sampler, &conf_array);
int32_t selected_idx = conf_array.selected;
int32_t mask_idx = selected_idx;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
conf_candidates[selected_idx].p = 0.0f;
conf_array.selected = -1;
}
}
}
}
int64_t time_end_sampling = ggml_time_us();
total_sampling_time += time_end_sampling - time_start_sampling;
}
}
int64_t time_end = ggml_time_us();
total_time += time_end - time_start;
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
total_time / 1000.0,
total_time / 1000.0 / params.steps,
total_sampling_time / 1000.0 / params.steps);
llama_batch_free(batch);
llama_sampler_free(sampler);
llama_sampler_free(dist_sampler);
n_generated = params.max_length;
}
static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
@@ -631,10 +192,10 @@ int main(int argc, char ** argv) {
GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));
if (params.diffusion.eps) {
diff_params.schedule = TIMESTEP_BASED;
diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
diff_params.eps = params.diffusion.eps;
} else if (params.diffusion.block_length) {
diff_params.schedule = BLOCK_BASED;
diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED;
diff_params.block_length = params.diffusion.block_length;
}
@@ -653,8 +214,17 @@ int main(int argc, char ** argv) {
callback_data cb_data = { &diff_params, vocab, n_input };
diff_params.step_callback_user_data = &cb_data;
const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
const char * alg_names[] = {
"DIFFUSION_ALGORITHM_ORIGIN",
"DIFFUSION_ALGORITHM_ENTROPY_BASED",
"DIFFUSION_ALGORITHM_MARGIN_BASED",
"DIFFUSION_ALGORITHM_RANDOM",
"DIFFUSION_ALGORITHM_CONFIDENCE_BASED",
};
const char * sched_names[] = {
"DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED",
"DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED",
};
const char * alg_name =
(diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
const char * sched_name =
@@ -666,11 +236,11 @@ int main(int argc, char ** argv) {
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
if (diff_params.schedule == TIMESTEP_BASED) {
if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED) {
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
}
if (diff_params.schedule == BLOCK_BASED) {
if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
}

View File

@@ -0,0 +1,408 @@
#include "diffusion.h"
#include "log.h"
#include <algorithm>
#include <cstddef>
#include <cmath>
#include <cstring>
#include <random>
#include <utility>
#include <vector>
static float calculate_confidence(const llama_token_data_array & cur_p,
diffusion_algorithm algorithm,
std::mt19937 & rng) {
switch (algorithm) {
case DIFFUSION_ALGORITHM_CONFIDENCE_BASED:
return cur_p.data[cur_p.selected].p; // Selected token probability
case DIFFUSION_ALGORITHM_ENTROPY_BASED:
{
float entropy = 0.0f;
const float epsilon = 1e-10f;
for (size_t i = 0; i < cur_p.size; i++) {
float prob = cur_p.data[i].p;
entropy += prob * logf(prob + epsilon);
}
return -entropy; // Higher entropy = lower confidence
}
case DIFFUSION_ALGORITHM_MARGIN_BASED:
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
case DIFFUSION_ALGORITHM_RANDOM:
{
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
return uniform(rng); // Random confidence
}
case DIFFUSION_ALGORITHM_ORIGIN:
return cur_p.data[cur_p.selected].p;
default:
return 0.0f;
}
}
// Unified transfer count calculation function
static int32_t calculate_transfer_count(int32_t step,
int32_t total_steps,
int32_t remaining_masked,
diffusion_transfer_schedule schedule,
float eps,
const std::vector<int32_t> & num_transfer_tokens = {}) {
switch (schedule) {
case DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED:
{
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
return (int32_t) (remaining_masked * p_transfer);
}
case DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED:
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
return num_transfer_tokens[step];
}
return remaining_masked / (total_steps - step); // Fallback
default:
return remaining_masked / (total_steps - step);
}
}
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
if (temperature == 0.0f) {
return;
}
std::uniform_real_distribution<double> uniform(0.0, 1.0);
for (int32_t i = 0; i < n_vocab; i++) {
double noise = uniform(rng);
// Prevent log(0)
noise = std::max(noise, 1e-20);
double gumbel_noise = std::pow(-std::log(noise), temperature);
logits[i] = std::exp(logits[i]) / gumbel_noise;
}
}
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
std::vector<int32_t> num_transfer_tokens(steps);
int32_t base = mask_count / steps;
int32_t remainder = mask_count % steps;
for (int32_t i = 0; i < steps; i++) {
num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
}
return num_transfer_tokens;
}
void diffusion_generate(llama_context * ctx,
const llama_token * input_tokens,
llama_token * output_tokens,
int32_t n_input,
const diffusion_params & params,
int32_t & n_generated) {
n_generated = 0;
if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
return;
}
const llama_model * model = llama_get_model(ctx);
// Initialize with input and pad with mask tokens
std::copy(input_tokens, input_tokens + n_input, output_tokens);
std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);
std::mt19937 rng(params.seed);
llama_set_causal_attn(ctx, false);
int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
std::vector<llama_token_data> candidates(n_vocab);
std::vector<llama_token_data> conf_candidates;
conf_candidates.reserve(params.max_length);
std::vector<int32_t> mask_positions;
mask_positions.reserve(params.max_length);
// Setup sampler chain
struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
if (params.top_k > 0) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
}
if (params.top_p < 1.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
}
if (params.temperature > 0.0f) {
llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
}
llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));
struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);
llama_batch batch = llama_batch_init(params.max_length, 0, 1);
batch.n_tokens = params.max_length;
// Pre-allocate buffers for CFG if needed
int32_t logits_size = n_vocab * params.max_length;
std::vector<float> cond_logits_buffer;
std::vector<llama_token> un_x_buffer;
if (params.cfg_scale > 0.0f) {
cond_logits_buffer.resize(logits_size);
un_x_buffer.resize(params.max_length);
}
// For block-based processing
std::vector<int32_t> num_transfer_tokens;
int32_t num_blocks = 1;
int32_t steps_per_block = params.steps;
if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
GGML_ASSERT(params.max_length % params.block_length == 0);
num_blocks = params.max_length / params.block_length;
GGML_ASSERT(params.steps % num_blocks == 0);
steps_per_block = params.steps / num_blocks;
}
std::vector<float> confidence(params.max_length);
int64_t total_sampling_time = 0;
int64_t total_time = 0;
int64_t time_start = ggml_time_us();
for (int block_num = 0; block_num < num_blocks; block_num++) {
int32_t block_start = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
int32_t block_end = (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) ?
std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
params.max_length;
// Count masked tokens in current block for block-based processing
if (params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
int32_t block_mask_count = 0;
for (int i = block_start; i < block_end; i++) {
if (output_tokens[i] == params.mask_token_id) {
block_mask_count++;
}
}
num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
}
for (int32_t step = 0; step < steps_per_block; step++) {
int32_t global_step = block_num * steps_per_block + step;
if (params.step_callback) {
if (!params.step_callback(
global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
break;
}
}
// Setup batch
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = output_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = 1;
}
float * logits = nullptr;
if (params.cfg_scale > 0.0f) {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate conditional");
break;
}
float * cond_logits_ptr = llama_get_logits(ctx);
std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));
// Unconditional generation (mask input)
std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
for (int32_t i = 0; i < n_input; i++) {
un_x_buffer[i] = params.mask_token_id;
}
for (int32_t i = 0; i < params.max_length; i++) {
batch.token[i] = un_x_buffer[i];
}
ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("Failed to generate unconditional");
break;
}
float * uncond_logits = llama_get_logits(ctx);
// Apply CFG
for (int32_t i = 0; i < logits_size; i++) {
cond_logits_buffer[i] =
uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
}
logits = cond_logits_buffer.data();
} else {
int ret = llama_decode(ctx, batch);
if (ret != 0) {
LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
break;
}
logits = llama_get_logits(ctx);
}
if (!logits) {
LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
break;
}
auto get_logits_for_pos = [&](int32_t pos) -> const float * {
if (params.shift_logits) {
return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
}
return logits + pos * n_vocab;
};
int64_t time_start_sampling = ggml_time_us();
mask_positions.clear();
for (int32_t i = 0; i < params.max_length; i++) {
if (output_tokens[i] == params.mask_token_id) {
// For block-based, only consider current block
if (params.schedule != DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED || (i >= block_start && i < block_end)) {
mask_positions.push_back(i);
}
}
}
if (mask_positions.empty()) {
break;
}
if (params.add_gumbel_noise && params.temperature > 0.0f) {
add_gumbel_noise(logits, n_vocab, params.temperature, rng);
}
if (params.algorithm == DIFFUSION_ALGORITHM_ORIGIN) {
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
float p_transfer = (float) transfer_count / mask_positions.size();
for (int32_t pos : mask_positions) {
if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].id = token_id;
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
}
llama_token_data_array cur_p = {
candidates.data(),
(size_t) n_vocab,
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
output_tokens[pos] = cur_p.data[cur_p.selected].id;
}
}
} else {
std::vector<std::pair<float, int32_t>> confidences;
std::vector<llama_token> sampled_tokens(mask_positions.size());
for (size_t i = 0; i < mask_positions.size(); i++) {
int32_t pos = mask_positions[i];
const float * pos_logits = get_logits_for_pos(pos);
for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
candidates[token_id].logit = pos_logits[token_id];
candidates[token_id].p = 0.0f;
candidates[token_id].id = token_id;
}
llama_token_data_array cur_p = {
candidates.data(),
candidates.size(),
-1,
false,
};
llama_sampler_apply(sampler, &cur_p);
llama_token sampled_token = cur_p.data[cur_p.selected].id;
float conf = calculate_confidence(cur_p, params.algorithm, rng);
sampled_tokens[i] = sampled_token;
confidences.emplace_back(conf, i);
}
int32_t transfer_count = calculate_transfer_count(
step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
if (transfer_count > 0) {
if (params.alg_temp == 0.0f) {
std::partial_sort(confidences.begin(),
confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
confidences.end(),
[](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
if (a.first != b.first) {
return a.first > b.first;
}
return a.second < b.second;
});
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
int32_t mask_idx = confidences[i].second;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
}
} else {
conf_candidates.clear();
for (size_t i = 0; i < confidences.size(); i++) {
float conf_logit = confidences[i].first / params.alg_temp;
conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
}
llama_token_data_array conf_array = {
conf_candidates.data(),
conf_candidates.size(),
-1,
false,
};
for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
llama_sampler_apply(dist_sampler, &conf_array);
int32_t selected_idx = conf_array.selected;
int32_t mask_idx = selected_idx;
int32_t pos = mask_positions[mask_idx];
output_tokens[pos] = sampled_tokens[mask_idx];
conf_candidates[selected_idx].p = 0.0f;
conf_array.selected = -1;
}
}
}
}
int64_t time_end_sampling = ggml_time_us();
total_sampling_time += time_end_sampling - time_start_sampling;
}
}
int64_t time_end = ggml_time_us();
total_time += time_end - time_start;
LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
total_time / 1000.0,
total_time / 1000.0 / params.steps,
total_sampling_time / 1000.0 / params.steps);
llama_batch_free(batch);
llama_sampler_free(sampler);
llama_sampler_free(dist_sampler);
n_generated = params.max_length;
}

View File

@@ -0,0 +1,57 @@
#pragma once
#include "llama.h"
#include <cstdint>
enum diffusion_algorithm {
DIFFUSION_ALGORITHM_ORIGIN = 0,
DIFFUSION_ALGORITHM_ENTROPY_BASED = 1,
DIFFUSION_ALGORITHM_MARGIN_BASED = 2,
DIFFUSION_ALGORITHM_RANDOM = 3,
DIFFUSION_ALGORITHM_CONFIDENCE_BASED = 4,
};
// Unified transfer scheduling methods
enum diffusion_transfer_schedule {
DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining
DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens
};
typedef bool (*diffusion_step_callback_t)(int32_t step,
int32_t total_steps,
const llama_token * tokens,
int32_t n_tokens,
void * user_data);
struct diffusion_params {
int32_t steps = 0;
float temperature = 0;
llama_token mask_token_id = LLAMA_TOKEN_NULL;
diffusion_step_callback_t step_callback = nullptr;
void * step_callback_user_data = nullptr;
int32_t seed = 0;
bool visual_mode = false;
bool shift_logits = false; // Shift logits by -1 after decode
float top_p = 0.;
int32_t top_k = 0.;
diffusion_algorithm algorithm = DIFFUSION_ALGORITHM_CONFIDENCE_BASED;
diffusion_transfer_schedule schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
float cfg_scale = 0.; // Config scale for classifier-free guidance
float eps = 0.; // Timestep scheduling
int32_t block_length = 0; // Block size (for block scheduling)
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
int32_t max_length = 0; // Maximum sequence length
};
void diffusion_generate(llama_context * ctx,
const llama_token * input_tokens,
llama_token * output_tokens,
int32_t n_input,
const diffusion_params & params,
int32_t & n_generated);
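Since this header now exposes `diffusion_generate()` as a small library API (consumed by `llama-diffusion-cli` above), a minimal caller sketch may help. It only uses fields and behavior shown in this diff; the context, the tokenized prompt, and the mask-token lookup are assumed to exist and are hypothetical here, and the parameter values are illustrative rather than the CLI's defaults.

```cpp
// Minimal sketch, not the CLI's actual setup: assumes `ctx`, `prompt_tokens`
// and `mask_token` were obtained elsewhere (model loading / tokenization).
#include "diffusion.h"
#include <vector>

static bool on_step(int32_t step, int32_t total_steps, const llama_token * /*tokens*/,
                    int32_t /*n_tokens*/, void * /*user_data*/) {
    return true; // returning false aborts generation early
}

void run_diffusion(llama_context * ctx, const std::vector<llama_token> & prompt_tokens,
                   llama_token mask_token) {
    diffusion_params dparams;
    dparams.steps         = 256;                                  // --diffusion-steps
    dparams.algorithm     = DIFFUSION_ALGORITHM_CONFIDENCE_BASED; // --diffusion-algorithm 4
    dparams.schedule      = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
    dparams.eps           = 1e-3f;                                // illustrative timestep epsilon
    dparams.temperature   = 0.2f;                                 // illustrative sampling settings
    dparams.top_p         = 0.95f;
    dparams.mask_token_id = mask_token;
    dparams.max_length    = 512;                                  // must be > prompt length
    dparams.step_callback = on_step;

    std::vector<llama_token> output(dparams.max_length);
    int32_t n_generated = 0;
    diffusion_generate(ctx, prompt_tokens.data(), output.data(),
                       (int32_t) prompt_tokens.size(), dparams, n_generated);
    // on success n_generated == dparams.max_length and `output` holds prompt + generation
}
```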

View File

@@ -578,13 +578,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.22.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
set(KLEIDIAI_COMMIT_TAG "v1.24.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/releases/download/${KLEIDIAI_COMMIT_TAG}/kleidiai-${KLEIDIAI_COMMIT_TAG}-src.tar.gz")
set(KLEIDIAI_RELEASE_ARCHIVE_MD5 "2f02ebe29573d45813e671eb304f2a00")
set(KLEIDIAI_FETCH_ARGS
URL ${KLEIDIAI_DOWNLOAD_URL}
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
URL_HASH MD5=${KLEIDIAI_RELEASE_ARCHIVE_MD5}
)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)

View File

@@ -6,17 +6,18 @@ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
@@ -42,17 +43,18 @@ template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
if (i00 >= ne00) {
return;
@@ -115,10 +117,14 @@ static void get_rows_cuda_q(
GGML_ASSERT(ne00 % 2 == 0);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
@@ -146,10 +152,14 @@ static void get_rows_cuda_float(
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
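The fastdiv change removes a hardware integer divide and modulo (`z / ne12`, `z % ne12`) from the inner loop by precomputing reciprocal data for `ne12` on the host (`init_fastdiv_values`) and doing a multiply-shift in the kernel (`fast_div_modulo`). The exact encoding of those ggml helpers is not part of this hunk; the sketch below only illustrates the general reciprocal-multiplication idea for a 32-bit dividend and a divisor >= 2, and is not ggml's implementation.

```cpp
#include <cstdint>

// Illustrative fastdiv, not the ggml helpers: precompute ceil(2^64 / d) once,
// then replace n / d and n % d with one widening multiply and a shift.
// Correct for any 32-bit n when d >= 2 (d == 1 would overflow the magic constant).
struct fastdiv_u32 {
    uint64_t magic; // ceil(2^64 / d)
    uint32_t d;
};

static fastdiv_u32 make_fastdiv(uint32_t d) {
    return { UINT64_MAX / d + 1, d }; // equals ceil(2^64 / d) for d >= 2
}

static void div_mod(uint32_t n, const fastdiv_u32 & f, uint32_t & q, uint32_t & r) {
    q = (uint32_t) (((unsigned __int128) f.magic * n) >> 64); // GCC/Clang 128-bit multiply
    r = n - q * f.d;
}
```

Because the struct is precomputed once per launch and passed by value, the per-element index split reduces to a multiply, a shift, and a multiply-subtract.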

View File

@@ -79,7 +79,7 @@ def print_info(msg):
# ---------------------------------------------------------------------------
def chat_completion(url, messages, tools=None, stream=False):
def chat_completion(url, messages, tools=None, stream=False, force_tools=False):
payload = {
"messages": messages,
"stream": stream,
@@ -87,7 +87,10 @@ def chat_completion(url, messages, tools=None, stream=False):
}
if tools:
payload["tools"] = tools
payload["tool_choice"] = "auto"
if force_tools:
payload["tool_choice"] = "required"
else:
payload["tool_choice"] = "auto"
try:
response = requests.post(url, json=payload, stream=stream)
@@ -160,7 +163,13 @@ def chat_completion(url, messages, tools=None, stream=False):
return result
def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
def all_tools_called(tools, all_tool_calls):
all_tool_names = set([tc["function"]["name"] for tc in tools])
all_called_tool_names = set([tc["function"]["name"] for tc in all_tool_calls])
return all_tool_names == all_called_tool_names
def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6, force_tools=False):
"""
Drive the multi-turn tool-call loop:
1. Send messages to model.
@@ -172,8 +181,8 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
msgs = list(messages)
all_tool_calls: list[dict] = []
for _ in range(max_turns):
result = chat_completion(url, msgs, tools=tools, stream=stream)
for t in range(max_turns):
result = chat_completion(url, msgs, tools=tools, stream=stream, force_tools=(force_tools and not all_tools_called(tools, all_tool_calls)))
if result is None:
return all_tool_calls, None
@@ -235,10 +244,10 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn
# ---------------------------------------------------------------------------
def run_test(url, test_case, stream):
def run_test(url, test_case, stream, force_tools):
name = test_case["name"]
mode = f"{'stream' if stream else 'non-stream'}"
print_header(f"{name} [{mode}]")
print_header(f"{name} [{mode}, force_tools={force_tools}] ")
all_tool_calls, final_content = run_agentic_loop(
url,
@@ -246,6 +255,7 @@ def run_test(url, test_case, stream):
tools=test_case["tools"],
mock_tool_responses=test_case["mock_tool_responses"],
stream=stream,
force_tools=force_tools
)
if final_content is None and not all_tool_calls:
@@ -1093,6 +1103,9 @@ def main():
parser.add_argument(
"--stream-only", action="store_true", help="Only run streaming mode tests"
)
parser.add_argument(
"--force-tools", action="store_true", help="Change tool mode to forced instead of auto"
)
parser.add_argument(
"--test",
help="Run only the test whose name contains this substring (case-insensitive)",
@@ -1103,10 +1116,13 @@ def main():
print_info(f"Testing server at {url}")
modes = []
force_tools = False
if not args.stream_only:
modes.append(False)
if not args.no_stream:
modes.append(True)
if args.force_tools:
force_tools = True
cases: list[dict] = ALL_TEST_CASES
if args.test:
@@ -1121,7 +1137,7 @@ def main():
for stream in modes:
for case in cases:
total += 1
if run_test(url, case, stream=stream):
if run_test(url, case, stream=stream, force_tools=force_tools):
passed += 1
color = GREEN if passed == total else RED

View File

@@ -542,6 +542,36 @@ static common_chat_tool edit_tool{
})",
};
static common_chat_tool manage_todo_list_tool{
/* .name = */ "manage_todo_list",
/* .description = */ "Create or update the todo list",
/* .parameters = */ R"({
"type": "object",
"properties": {
"todos": {
"type": "array",
"description": "List of TODO list items"
}
},
"required": ["todos"]
})",
};
static common_chat_tool run_in_terminal_tool{
/* .name = */ "run_in_terminal",
/* .description = */ "Run a shell command.",
/* .parameters = */ R"({
"type": "object",
"properties": {
"command": {
"type": "string",
"description": "Shell command to run"
}
},
"required": ["command"]
})",
};
static common_chat_tool magic_tool{
/* .name = */ "magic",
/* .description = */ "Magic tool that takes a hash",
@@ -1379,6 +1409,16 @@ class peg_test_builder {
return *this;
}
peg_test_builder & tool_choice(common_chat_tool_choice choice) {
tc_.params.tool_choice = choice;
return *this;
}
peg_test_builder & messages(std::vector<common_chat_msg> messages) {
tc_.params.messages = std::move(messages);
return *this;
}
// Execute the test
void run() {
// Check template filter
@@ -1755,23 +1795,23 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>"
)
"</tool_call>")
.enable_thinking(true)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({
python_tool
})
.expect_reasoning("Let's call a tool: <tool_call>\n"
"<function=python>\n"
"<parameter=code>\n"
"def hello():\n"
" print(\"Not the real call!\")\n"
"\n"
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.expect_reasoning(
"Let's call a tool: <tool_call>\n"
"<function=python>\n"
"<parameter=code>\n"
"def hello():\n"
" print(\"Not the real call!\")\n"
"\n"
"hello()\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.expect_tool_calls({
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
})
@@ -1800,6 +1840,219 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.tools({ empty_args_tool_no_properties })
.expect(message_with_tool_calls("empty_args_no_props", "{}"))
.run();
// Edge cases when reasoning traces are not sent
tst.test(
"<think>\n\n</think>\n\n"
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n1\n</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({
special_function_tool
})
.expect_reasoning("<think>\n\n")
.expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
.run();
tst.test(
"</think>\n\n"
"<tool_call>\n"
"<function=special_function>\n"
"<parameter=arg1>\n1\n</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({
special_function_tool
})
.expect_reasoning("")
.expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } })
.run();
tst.test(
"</think>\n\n"
"<tool_call>\n"
"<function=run_in_terminal>\n"
"<parameter=command>\n"
"pwd\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({
run_in_terminal_tool
})
.expect_tool_calls({
{ "run_in_terminal", R"({"command": "pwd"})", {} },
})
.run();
tst.test(
"</think>\n\n"
"Let me inspect the current directory.\n"
"<tool_call>\n"
"<function=run_in_terminal>\n"
"<parameter=command>\n"
"pwd\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({
run_in_terminal_tool
})
.expect_content("Let me inspect the current directory.\n")
.expect_tool_calls({
{ "run_in_terminal", R"({"command": "pwd"})", {} },
})
.run();
tst.test(
"</think>\n\n"
"Let me inspect the current directory.\n"
"<tool_call>\n"
"<function=run_in_terminal>\n"
"<parameter=command>\n"
"pwd\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({
run_in_terminal_tool
})
.tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
.expect_content("Let me inspect the current directory.\n")
.expect_tool_calls({
{ "run_in_terminal", R"({"command": "pwd"})", {} },
})
.run();
tst.test(
"I should inspect the directory.\n"
"</think>\n\n"
"Let me inspect it now.\n"
"<tool_call>\n"
"<function=run_in_terminal>\n"
"<parameter=command>\n"
"pwd\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({
run_in_terminal_tool
})
.expect_reasoning("I should inspect the directory.")
.expect_content("Let me inspect it now.\n")
.expect_tool_calls({
{ "run_in_terminal", R"({"command": "pwd"})", {} },
})
.run();
tst.test(
"I might call <tool_call> later, but I am still thinking.\n"
"</think>\n\n"
"Final answer without tools.")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({ run_in_terminal_tool })
.expect_reasoning("I might call <tool_call> later, but I am still thinking.")
.expect_content("Final answer without tools.")
.run();
{
common_chat_msg user_start;
user_start.role = "user";
user_start.content = "Create a todo list, then inspect the repository.";
common_chat_msg assistant_todos =
simple_assist_msg("", "", "manage_todo_list",
R"({"todos":[{"item":"Inspect repository","selected":false}]})", "call_todos");
common_chat_msg tool_result;
tool_result.role = "tool";
tool_result.content = "Successfully wrote todo list";
tool_result.tool_call_id = "call_todos";
common_chat_msg user_continue;
user_continue.role = "user";
user_continue.content = "Proceed.";
tst.test(
"I need to run a terminal command.\n"
"</think>\n\n"
"<tool_call>\n"
"<function=run_in_terminal>\n"
"<parameter=command>\n"
"pwd\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({
manage_todo_list_tool, run_in_terminal_tool
})
.messages({ user_start, assistant_todos, tool_result, user_continue })
.expect_reasoning("I need to run a terminal command.")
.expect_tool_calls({
{ "run_in_terminal", R"({"command": "pwd"})", {} },
})
.run();
tst.test(
"I need to run a terminal command.\n"
"</think>\n\n"
"Let me inspect the current directory.\n"
"<tool_call>\n"
"<function=run_in_terminal>\n"
"<parameter=command>\n"
"pwd\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({
manage_todo_list_tool, run_in_terminal_tool
})
.tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED)
.messages({ user_start, assistant_todos, tool_result, user_continue })
.expect_reasoning("I need to run a terminal command.")
.expect_content("Let me inspect the current directory.\n")
.expect_tool_calls({
{ "run_in_terminal", R"({"command": "pwd"})", {} },
})
.run();
tst.test(
"</think>\n\n"
"<tool_call>\n"
"<function=run_in_terminal>\n"
"<parameter=command>\n"
"pwd\n"
"</parameter>\n"
"</function>\n"
"</tool_call>")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.enable_thinking(true)
.tools({
manage_todo_list_tool, run_in_terminal_tool
})
.messages({ user_start, assistant_todos, tool_result, user_continue })
.expect_tool_calls({
{ "run_in_terminal", R"({"command": "pwd"})", {} },
})
.run();
}
}
{

View File

@@ -70,20 +70,20 @@ static void test_reasoning_budget(
llama_sampler_apply(sampler, &cur_p);
// Check if forcing is active (all logits except one should be -INFINITY)
size_t finite_count = 0;
llama_token finite_token = -1;
size_t not_neg_inf = 0;
llama_token not_neg_inf_token = -1;
for (size_t j = 0; j < cur.size(); j++) {
if (std::isfinite(cur[j].logit)) {
finite_count++;
finite_token = cur[j].id;
if (std::isfinite(cur[j].logit) || cur[j].logit > 0) { // +INFINITY
not_neg_inf++;
not_neg_inf_token = cur[j].id;
}
}
llama_sampler_accept(sampler, sequence[i]);
fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
fprintf(stderr, " i=%zu: token=%d, not_neg_inf_count=%zu, not_neg_inf_token=%d\n", i, (int)sequence[i], not_neg_inf, (int)not_neg_inf_token);
if (finite_count == 1) {
if (not_neg_inf == 1) {
if (actual_force_start == SIZE_MAX) {
actual_force_start = i;
}

View File

@@ -1646,7 +1646,11 @@ Listing all models in cache. The model metadata will also include a field to ind
}
```
Note: For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
Note:
1. For a local GGUF (stored offline in a custom directory), the model object will have `"in_cache": false`.
2. Adding `?reload=1` to the query params will refresh the list of models. The behavior is as follows:
- If a model is running but updated or removed from the source, it will be unloaded
- If a model is not running, it will be added or updated according to the source
The `status` object can be:

View File

@@ -243,9 +243,8 @@ void server_models::add_model(server_model_meta && meta) {
};
}
// TODO: allow refreshing cached model list
void server_models::load_models() {
// loading models from 3 sources:
// Phase 1: load presets from all sources — pure I/O, no lock needed
// 1. cached models
common_presets cached_models = ctx_preset.load_from_cache();
SRV_INF("Loaded %zu cached model presets\n", cached_models.size());
@@ -270,112 +269,266 @@ void server_models::load_models() {
// note: if a model exists in both cached and local, local takes precedence
common_presets final_presets;
for (const auto & [name, preset] : cached_models) {
final_presets[name] = preset;
}
for (const auto & [name, preset] : local_models) {
final_presets[name] = preset;
}
// process custom presets from INI
for (const auto & [name, preset] : cached_models) final_presets[name] = preset;
for (const auto & [name, preset] : local_models) final_presets[name] = preset;
for (const auto & [name, custom] : custom_presets) {
if (final_presets.find(name) != final_presets.end()) {
// apply custom config if exists
common_preset & target = final_presets[name];
target.merge(custom);
final_presets[name].merge(custom);
} else {
// otherwise add directly
final_presets[name] = custom;
}
}
// server base preset from CLI args take highest precedence
// server base preset from CLI args takes highest precedence
for (auto & [name, preset] : final_presets) {
preset.merge(base_preset);
}
// convert presets to server_model_meta and add to mapping
for (const auto & preset : final_presets) {
server_model_meta meta{
/* preset */ preset.second,
/* name */ preset.first,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
};
add_model(std::move(meta));
}
// log available models
{
std::unordered_set<std::string> custom_names;
for (const auto & [name, preset] : custom_presets) {
custom_names.insert(name);
// Helpers that read `mapping` — must be called while holding the lock.
std::unordered_set<std::string> custom_names;
for (const auto & [name, preset] : custom_presets) custom_names.insert(name);
auto join_set = [](const std::set<std::string> & s) {
std::string result;
for (const auto & v : s) {
if (!result.empty()) result += ", ";
result += v;
}
auto join_set = [](const std::set<std::string> & s) {
std::string result;
for (const auto & v : s) {
if (!result.empty()) {
result += ", ";
}
result += v;
}
return result;
};
return result;
};
auto log_available_models = [&]() {
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
for (const auto & [name, inst] : mapping) {
bool has_custom = custom_names.find(name) != custom_names.end();
std::string info;
if (!inst.meta.aliases.empty()) {
info += " (aliases: " + join_set(inst.meta.aliases) + ")";
}
if (!inst.meta.tags.empty()) {
info += " [tags: " + join_set(inst.meta.tags) + "]";
}
if (!inst.meta.aliases.empty()) info += " (aliases: " + join_set(inst.meta.aliases) + ")";
if (!inst.meta.tags.empty()) info += " [tags: " + join_set(inst.meta.tags) + "]";
SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
}
}
// handle custom stop-timeout option
for (auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
try {
inst.meta.stop_timeout = std::stoi(val);
} catch (...) {
SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
};
auto apply_stop_timeout = [&]() {
for (auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_STOP_TIMEOUT, val)) {
try {
inst.meta.stop_timeout = std::stoi(val);
} catch (...) {
SRV_WRN("invalid stop-timeout value '%s' for model '%s', using default %d seconds\n",
val.c_str(), name.c_str(), DEFAULT_STOP_TIMEOUT);
inst.meta.stop_timeout = DEFAULT_STOP_TIMEOUT;
}
}
}
}
};
// update_args() injects HOST/PORT/ALIAS, so strip them before comparing presets
auto preset_options_for_compare = [](common_preset p) {
p.unset_option("LLAMA_ARG_HOST");
p.unset_option("LLAMA_ARG_PORT");
p.unset_option("LLAMA_ARG_ALIAS");
return p.options;
};
// load any autoload models
std::vector<std::string> models_to_load;
for (const auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
if (common_arg_utils::is_truthy(val)) {
// Phase 2: acquire the lock once for all mapping mutations.
// We temporarily release it only when calling functions that acquire it internally
// (unload, load) or when joining threads (the monitoring thread calls update_status
// which locks the mutex, so joining while holding it would deadlock).
std::unique_lock<std::mutex> lk(mutex);
bool is_first_load = mapping.empty();
if (is_first_load) {
// FIRST LOAD: add all models, then unlock for autoloading
for (const auto & [name, preset] : final_presets) {
server_model_meta meta{
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
};
add_model(std::move(meta));
}
apply_stop_timeout();
log_available_models();
std::vector<std::string> models_to_load;
for (const auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
models_to_load.push_back(name);
}
}
}
if ((int)models_to_load.size() > base_params.models_max) {
throw std::runtime_error(string_format(
"number of models to load on startup (%zu) exceeds models_max (%d)",
models_to_load.size(),
base_params.models_max
));
}
for (const auto & name : models_to_load) {
SRV_INF("(startup) loading model %s\n", name.c_str());
load(name);
if ((int)models_to_load.size() > base_params.models_max) {
throw std::runtime_error(string_format(
"number of models to load on startup (%zu) exceeds models_max (%d)",
models_to_load.size(), base_params.models_max));
}
lk.unlock();
for (const auto & name : models_to_load) {
SRV_INF("(startup) loading model %s\n", name.c_str());
load(name);
}
} else {
// RELOAD: diff the new preset list against the current mapping and reconcile
is_reloading = true;
// find running models whose source was removed or whose preset changed
std::vector<std::string> to_unload;
for (const auto & [name, inst] : mapping) {
if (!inst.meta.is_running()) continue;
auto it = final_presets.find(name);
if (it == final_presets.end()) {
to_unload.push_back(name); // removed from source
} else if (preset_options_for_compare(inst.meta.preset) != preset_options_for_compare(it->second)) {
to_unload.push_back(name); // preset changed
}
}
// unload() acquires the lock internally, so release before each call
for (const auto & name : to_unload) {
SRV_INF("(reload) unloading model name=%s (source updated or removed)\n", name.c_str());
lk.unlock();
unload(name);
lk.lock();
}
// wait for all targeted models to reach UNLOADED; cv.wait handles unlock/relock
cv.wait(lk, [&]() {
for (const auto & name : to_unload) {
auto it = mapping.find(name);
if (it != mapping.end() && it->second.meta.is_running()) return false;
}
return true;
});
// collect all threads to join in one pass while the lock is held:
// - monitoring threads from just-unloaded models (to_unload)
// - threads of already-UNLOADED models that are being removed from source
std::vector<std::thread> threads_to_join;
for (const auto & name : to_unload) {
auto it = mapping.find(name);
if (it != mapping.end() && it->second.th.joinable()) {
threads_to_join.push_back(std::move(it->second.th));
}
}
for (auto & [name, inst] : mapping) {
if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) {
threads_to_join.push_back(std::move(inst.th));
}
}
// join outside the lock — monitoring thread calls update_status (needs lock)
lk.unlock();
for (auto & th : threads_to_join) th.join();
lk.lock();
// erase models no longer in any source
for (auto it = mapping.begin(); it != mapping.end(); ) {
if (final_presets.find(it->first) == final_presets.end()) {
SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str());
GGML_ASSERT(!it->second.th.joinable()); // must have been joined above
it = mapping.erase(it);
} else {
++it;
}
}
// update presets for non-running models still in source
for (auto & [name, inst] : mapping) {
if (inst.meta.is_running()) continue;
auto it = final_presets.find(name);
if (it == final_presets.end()) continue; // erased above
inst.meta.preset = it->second;
// re-parse aliases, then validate against other models
std::set<std::string> new_aliases;
std::string alias_str;
if (inst.meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
for (auto & alias : string_split<std::string>(alias_str, ',')) {
alias = string_strip(alias);
if (!alias.empty()) new_aliases.insert(alias);
}
}
inst.meta.aliases.clear();
for (const auto & alias : new_aliases) {
bool conflict = false;
for (const auto & [other_name, other_inst] : mapping) {
if (other_name == name) continue;
if (other_name == alias || other_inst.meta.aliases.count(alias)) {
SRV_WRN("(reload) alias '%s' for model '%s' conflicts with model '%s', skipping\n",
alias.c_str(), name.c_str(), other_name.c_str());
conflict = true;
break;
}
}
if (!conflict) inst.meta.aliases.insert(alias);
}
// re-parse tags
inst.meta.tags.clear();
std::string tags_str;
if (inst.meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) {
for (auto & tag : string_split<std::string>(tags_str, ',')) {
tag = string_strip(tag);
if (!tag.empty()) inst.meta.tags.insert(tag);
}
}
inst.meta.exit_code = 0; // clear failed state so the model can be reloaded
inst.meta.update_args(ctx_preset, bin_path);
}
// add models that are new in this reload
std::vector<std::string> newly_added;
for (const auto & [name, preset] : final_presets) {
if (mapping.find(name) == mapping.end()) {
server_model_meta meta{
/* preset */ preset,
/* name */ name,
/* aliases */ {},
/* tags */ {},
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0,
/* stop_timeout */ DEFAULT_STOP_TIMEOUT,
};
add_model(std::move(meta));
newly_added.push_back(name);
}
}
apply_stop_timeout();
// clear reload flag before unlocking for autoload — load() blocks on !is_reloading,
// so clearing it here (while still locked) prevents a deadlock in the autoload calls below
is_reloading = false;
cv.notify_all();
log_available_models();
// collect autoload candidates while still under the lock
std::vector<std::string> to_autoload;
for (const auto & name : newly_added) {
auto it = mapping.find(name);
if (it != mapping.end()) {
std::string val;
if (it->second.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && common_arg_utils::is_truthy(val)) {
to_autoload.push_back(name);
}
}
}
lk.unlock();
for (const auto & name : to_autoload) {
SRV_INF("(reload) loading new model %s\n", name.c_str());
load(name);
}
}
}
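For orientation, a compact sketch of the reload reconciliation policy implemented above (Python, purely illustrative; the dict shapes and the `reconcile` helper are hypothetical stand-ins for the C++ `mapping` / `final_presets` structures and the locking around them):
```python
def reconcile(mapping: dict, final_presets: dict) -> dict:
    """Illustrative model of the reload policy, not the actual server code."""
    # 1. running models whose source disappeared or whose preset changed get unloaded
    for name, inst in mapping.items():
        changed = name not in final_presets or final_presets[name] != inst["preset"]
        if inst["running"] and changed:
            inst["running"] = False  # stands in for unload() + waiting for UNLOADED

    # 2. models no longer present in any source are dropped entirely
    mapping = {name: inst for name, inst in mapping.items() if name in final_presets}

    # 3. non-running models get their preset refreshed; brand-new models are added
    for name, preset in final_presets.items():
        if name not in mapping:
            mapping[name] = {"preset": preset, "running": False}
        elif not mapping[name]["running"]:
            mapping[name]["preset"] = preset
    return mapping
```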
@@ -536,7 +689,10 @@ void server_models::load(const std::string & name) {
}
unload_lru();
std::lock_guard<std::mutex> lk(mutex);
std::unique_lock<std::mutex> lk(mutex);
// edge case: block until any in-progress reload has finished so we always load
// against the freshest preset and a consistent mapping state
cv.wait(lk, [this]() { return !is_reloading; });
auto meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
@@ -993,7 +1149,11 @@ void server_models_routes::init_routes() {
return res;
};
this->get_router_models = [this](const server_http_req &) {
this->get_router_models = [this](const server_http_req & req) {
bool reload = !req.get_param("reload", "").empty();
if (reload) {
models.load_models();
}
auto res = std::make_unique<server_http_res>();
json models_json = json::array();
auto all_models = models.get_all_meta();

View File

@@ -100,6 +100,9 @@ private:
std::condition_variable cv_stop;
std::set<std::string> stopping_models;
// set to true while load_models() is executing a reload; load() will wait until clear
bool is_reloading = false;
common_preset_context ctx_preset;
common_params base_params;
@@ -118,6 +121,11 @@ private:
public:
server_models(const common_params & params, int argc, char ** argv);
// (re-)load the list of models from various sources and prepare the metadata mapping
// - if this is called for the first time, simply populate the metadata
// - if this is called subsequently (e.g. when refreshing from disk):
// - if a model is running but its source was updated or removed, it will be unloaded
// - if a model is not running, it will be added or updated according to the source
void load_models();
// check if a model instance exists (thread-safe)

View File

@@ -62,6 +62,12 @@ def test_router_chat_completion_stream(model: str, success: bool):
assert content == ""
def _get_model_ids(is_reload: bool) -> set[str]:
res = server.make_request("GET", "/models" + ("?reload=1" if is_reload else ""))
assert res.status_code == 200
return {item["id"] for item in res.body.get("data", [])}
def _get_model_status(model_id: str) -> str:
res = server.make_request("GET", "/models")
assert res.status_code == 200
@@ -205,3 +211,45 @@ def test_router_api_key_required():
)
assert authed.status_code == 200
assert "error" not in authed.body
def test_router_reload_models():
"""POST /models/reload re-reads the INI preset and updates the model list."""
global server
preset_path = os.path.join(TMP_DIR, "test_reload.ini")
# Initial preset: two models
with open(preset_path, "w") as f:
f.write(
"[model-reload-a]\n"
"hf-repo = ggml-org/test-model-stories260K\n"
"\n"
"[model-reload-b]\n"
"hf-repo = ggml-org/test-model-stories260K-infill\n"
)
server.models_preset = preset_path
server.start()
ids = _get_model_ids(is_reload=False)
assert "model-reload-a" in ids
assert "model-reload-b" in ids
# Updated preset: remove a, keep b unchanged, add c
with open(preset_path, "w") as f:
f.write(
"[model-reload-b]\n"
"hf-repo = ggml-org/test-model-stories260K-infill\n"
"\n"
"[model-reload-c]\n"
"hf-repo = ggml-org/test-model-stories260K\n"
)
try:
ids = _get_model_ids(is_reload=True)
assert "model-reload-a" not in ids, "removed model should no longer appear"
assert "model-reload-b" in ids, "unchanged model should still appear"
assert "model-reload-c" in ids, "newly added model should appear"
finally:
os.remove(preset_path)

View File

@@ -126,69 +126,70 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict
actual_arguments = json.loads(actual_arguments)
assert argument_key in actual_arguments, f"tool arguments: {actual_arguments}, expected: {argument_key}"
# PR #22654: commented out since we're now allowing content before tool calls with tool_choice: required, so we can't force this
# in the tiny model just by using the grammar
#
# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
# @pytest.mark.parametrize("template_name,tool,argument_key", [
# ("Qwen3-Coder", TEST_TOOL, "success"),
# ("Qwen3-Coder", TEST_TOOL, "success"),
# ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
# ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
# ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
# ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
# ])
# def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
# global server
# n_predict = 1024
# # server = ServerPreset.stories15m_moe()
# server.jinja = True
# server.n_predict = n_predict
# server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
# server.start()
# do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,tool,argument_key", [
("Qwen3-Coder", TEST_TOOL, "success"),
("Qwen3-Coder", TEST_TOOL, "success"),
("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
])
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
global server
n_predict = 1024
# server = ServerPreset.stories15m_moe()
server.jinja = True
server.n_predict = n_predict
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
server.start()
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
# @pytest.mark.slow
# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
# @pytest.mark.parametrize("template_name,tool,argument_key", [
# ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
# ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
# ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
# ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,tool,argument_key", [
("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),
# ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
# # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
# # ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),
# ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
# ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
# Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
# ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),
# ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
# ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),
# ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
# ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),
# ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
# ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),
# ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
# ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),
# ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
# # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "codeFalse), True),
# # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),
("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
# ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "codeFalse), True),
# ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
])
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
global server
n_predict = 512
# server = ServerPreset.stories15m_moe()
server.jinja = True
server.n_predict = n_predict
server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
server.start(timeout_seconds=TIMEOUT_START_SLOW)
do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
# ])
# def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
# global server
# n_predict = 512
# # server = ServerPreset.stories15m_moe()
# server.jinja = True
# server.n_predict = n_predict
# server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
# server.start(timeout_seconds=TIMEOUT_START_SLOW)
# do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
@pytest.mark.slow

View File

@@ -5,6 +5,8 @@
import subprocess
import os
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp")
import re
import json
from json import JSONDecodeError
@@ -86,6 +88,7 @@ class ServerProcess:
api_key: str | None = None
models_dir: str | None = None
models_max: int | None = None
models_preset: str | None = None
no_models_autoload: bool | None = None
lora_files: List[str] | None = None
enable_ctx_shift: int | None = False
@@ -156,6 +159,8 @@ class ServerProcess:
server_args.extend(["--models-dir", self.models_dir])
if self.models_max is not None:
server_args.extend(["--models-max", self.models_max])
if self.models_preset:
server_args.extend(["--models-preset", self.models_preset])
if self.n_batch:
server_args.extend(["--batch-size", self.n_batch])
if self.n_ubatch:

View File

@@ -1,7 +1,8 @@
import { SvelteMap, SvelteSet } from 'svelte/reactivity';
import { toast } from 'svelte-sonner';
import { ServerModelStatus, ModelModality } from '$lib/enums';
import { ModelsService, PropsService } from '$lib/services';
import { ModelsService } from '$lib/services/models.service';
import { PropsService } from '$lib/services/props.service';
import { serverStore } from '$lib/stores/server.svelte';
import { TTLCache } from '$lib/utils';
import {