Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	.gitignore
#	examples/batched/batched.cpp
#	examples/debug/debug.cpp
#	examples/eval-callback/eval-callback.cpp
#	examples/idle/idle.cpp
#	examples/lookahead/lookahead.cpp
#	examples/lookup/lookup-create.cpp
#	examples/lookup/lookup-stats.cpp
#	examples/lookup/lookup.cpp
#	examples/parallel/parallel.cpp
#	examples/passkey/passkey.cpp
#	examples/retrieval/retrieval.cpp
#	examples/save-load-state/save-load-state.cpp
#	examples/speculative-simple/speculative-simple.cpp
#	examples/speculative/speculative.cpp
#	examples/training/finetune.cpp
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-sycl/fattn-tile.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
#	ggml/src/ggml-webgpu/wgsl-shaders/rope.wgsl
#	ggml/src/ggml-webgpu/wgsl-shaders/soft_max.wgsl
#	scripts/sync-ggml.last
#	tests/export-graph-ops.cpp
#	tests/test-chat.cpp
#	tests/test-state-restore-fragmented.cpp
#	tests/test-thread-safety.cpp
#	tools/batched-bench/batched-bench.cpp
#	tools/cli/cli.cpp
#	tools/cvector-generator/cvector-generator.cpp
#	tools/export-lora/export-lora.cpp
#	tools/imatrix/imatrix.cpp
#	tools/perplexity/perplexity.cpp
#	tools/results/results.cpp
#	tools/server/CMakeLists.txt
This commit is contained in:
Concedo
2026-04-01 10:54:13 +08:00
61 changed files with 2167 additions and 1180 deletions

View File

@@ -21,14 +21,6 @@ indent_style = tab
[prompts/*.txt]
insert_final_newline = unset
[tools/server/public/*]
indent_size = 2
[tools/server/public/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
indent_size = unset
[tools/server/deps_*]
trim_trailing_whitespace = unset
indent_style = unset
@@ -61,6 +53,14 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/server/public/**]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[benches/**]
indent_style = unset
indent_size = unset

4
.gitattributes vendored Normal file
View File

@@ -0,0 +1,4 @@
# Treat the generated single-file WebUI build as binary for diff purposes.
# Git's pack-file delta compression still works (byte-level), but this prevents
# git diff from printing the entire minified file on every change.
tools/server/public/index.html -diff

View File

@@ -232,7 +232,7 @@ using chat_template_caps = jinja::caps;
struct common_chat_templates {
bool add_bos;
bool add_eos;
bool has_explicit_template; // Model had builtin template or template overridde was specified.
bool has_explicit_template; // Model had builtin template or template overridden was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
@@ -1004,6 +1004,10 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
auto analysis = p.ref("analysis");
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
// Consume any unsolicited tool calls, e.g. builtin functions
auto unsolicited = p.rule("unsolicited", p.atomic(p.optional(channel) + p.literal(" to=") + content + end));
auto any = p.rule("any", preamble | analysis);
if (has_response_format) {
@@ -1047,7 +1051,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
return p.zero_or_more(start + any) + start + (tool_call | final_msg);
}
return p.zero_or_more(start + any) + start + final_msg;
return p.zero_or_more(start + any) + start + (final_msg | unsolicited);
});
data.parser = parser.save();

View File

@@ -366,6 +366,11 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
}
void common_init() {
#if defined(_WIN32)
SetConsoleOutputCP(CP_UTF8);
SetConsoleCP(CP_UTF8);
#endif
llama_log_set(common_log_default_callback, NULL);
#ifdef NDEBUG
@@ -374,7 +379,7 @@ void common_init() {
const char * build_type = " (debug)";
#endif
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
}
std::string common_params_get_system_info(const common_params & params) {
@@ -1250,6 +1255,9 @@ llama_context * common_init_result::context() {
}
common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
if (seq_id < 0 || seq_id >= (int) pimpl->samplers.size()) {
return nullptr;
}
return pimpl->samplers[seq_id].get();
}

View File

@@ -123,6 +123,9 @@ class ProgressBar {
static inline std::map<const ProgressBar *, int> lines;
static inline int max_line = 0;
std::string filename;
size_t len = 0;
static void cleanup(const ProgressBar * line) {
lines.erase(line);
if (lines.empty()) {
@@ -139,7 +142,23 @@ class ProgressBar {
}
public:
ProgressBar() = default;
ProgressBar(const std::string & url = "") : filename(url) {
if (auto pos = filename.rfind('/'); pos != std::string::npos) {
filename = filename.substr(pos + 1);
}
if (auto pos = filename.find('?'); pos != std::string::npos) {
filename = filename.substr(0, pos);
}
for (size_t i = 0; i < filename.size(); ++i) {
if ((filename[i] & 0xC0) != 0x80) {
if (len++ == 39) {
filename.resize(i);
filename += "";
break;
}
}
}
}
~ProgressBar() {
std::lock_guard<std::mutex> lock(mutex);
@@ -147,11 +166,7 @@ public:
}
void update(size_t current, size_t total) {
if (!is_output_a_tty()) {
return;
}
if (!total) {
if (!total || !is_output_a_tty()) {
return;
}
@@ -163,28 +178,27 @@ public:
}
int lines_up = max_line - lines[this];
size_t width = 50;
size_t bar = 55 - len;
size_t pct = (100 * current) / total;
size_t pos = (width * current) / total;
std::cout << "\033[s";
size_t pos = (bar * current) / total;
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "A";
}
std::cout << "\033[2K\r["
<< std::string(pos, '=')
<< (pos < width ? ">" : "")
<< std::string(width - pos, ' ')
<< "] " << std::setw(3) << pct << "% ("
<< current / (1024 * 1024) << " MB / "
<< total / (1024 * 1024) << " MB) "
<< "\033[u";
std::cout << '\r' << "Downloading " << filename << " ";
std::cout.flush();
for (size_t i = 0; i < bar; ++i) {
std::cout << (i < pos ? "" : " ");
}
std::cout << std::setw(4) << pct << "%\033[K";
if (lines_up > 0) {
std::cout << "\033[" << lines_up << "B";
}
std::cout << '\r' << std::flush;
if (current == total) {
cleanup(this);
cleanup(this);
}
}
@@ -212,7 +226,7 @@ static bool common_pull_file(httplib::Client & cli,
const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size;
size_t progress_step = 0;
ProgressBar bar;
ProgressBar bar(resolve_path);
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
@@ -290,7 +304,7 @@ static int common_download_file_single_online(const std::string & url,
const bool file_exists = std::filesystem::exists(path);
if (file_exists && skip_etag) {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
@@ -298,7 +312,7 @@ static int common_download_file_single_online(const std::string & url,
if (file_exists) {
last_etag = read_etag(path);
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
LOG_DBG("%s: no previous model file found %s\n", __func__, path.c_str());
}
auto head = cli.Head(parts.path);
@@ -332,11 +346,11 @@ static int common_download_file_single_online(const std::string & url,
if (file_exists) {
if (etag.empty()) {
LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
LOG_DBG("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (!last_etag.empty() && last_etag == etag) {
LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
}
if (remove(path.c_str()) != 0) {
@@ -372,7 +386,7 @@ static int common_download_file_single_online(const std::string & url,
}
}
LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
LOG_DBG("%s: downloading from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(),
path_temporary.c_str(), etag.c_str());
@@ -441,7 +455,7 @@ int common_download_file_single(const std::string & url,
return -1;
}
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
return 304; // Not Modified - fake cached response
}

View File

@@ -51,7 +51,7 @@ struct common_ngram_map_value {
// statistics of a n-gram
struct common_ngram_map_key {
size_t key_idx; // index of key n-gram in token-history
size_t stat_idx; // index of last token of stastistics computation (key_num, values)
size_t stat_idx; // index of last token of statistics computation (key_num, values)
uint16_t key_num; // number of occurrences of this key n-gram in token-history
common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key

View File

@@ -383,6 +383,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
params.backend_sampling = false;
}
if (rbudget && params.backend_sampling) {
LOG_WRN("%s: backend sampling is not compatible with reasoning budget, disabling\n", __func__);
params.backend_sampling = false;
}
auto * result = new common_sampler {
/* .params = */ params,
/* .grmr = */ grmr,

View File

@@ -545,11 +545,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
return 1;
}
common_init();
llama_backend_init();
llama_model_params model_params = llama_model_default_params();

View File

@@ -99,12 +99,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1;
}
common_init();
params.embedding = true;
// get max number of sequences per batch

View File

@@ -0,0 +1,155 @@
// Fused GLU (gated linear unit) compute shader for the ggml WebGPU backend.
// The element-wise gating op and the storage type are selected at shader
// build time via #ifdef variants (TYPE_* / OP_*); NO_SPLIT switches between
// a single interleaved source buffer and two separate source buffers.
enable f16;
// Storage/element type selection: exactly one TYPE_* macro is defined.
#ifdef TYPE_F32
#define DataType f32
#endif
#ifdef TYPE_F16
#define DataType f16
#endif
// ReGLU: ReLU(a) * b
#ifdef OP_REGLU
fn op(a: DataType, b: DataType) -> DataType {
return max(a, 0) * b;
}
#endif
// GeGLU: GELU(a) * b, using the tanh-based GELU approximation,
// with tanh(x) rewritten as 2/(1+exp(-2x)) - 1 in terms of exp.
#ifdef OP_GEGLU
const SQRT_2_OVER_PI: DataType = 0.79788456080286535587989211986876;
const GELU_COEF_A: DataType = 0.044715;
fn op(a: DataType, b: DataType) -> DataType {
let val = SQRT_2_OVER_PI * a * (1.0 + GELU_COEF_A * a * a);
return 0.5 * a * (2.0 - 2.0/ (exp(2* val) + 1)) * b;
}
#endif
// SwiGLU: SiLU(a) * b, where SiLU(x) = x * sigmoid(x)
#ifdef OP_SWIGLU
fn op(a: DataType, b: DataType) -> DataType {
return a / (1.0 + exp(-a)) * b;
}
#endif
// SwiGLU (gpt-oss variant): clamps both inputs to params.limit, applies a
// sigmoid scaled by params.alpha, and gates with (1 + g).
// Note: computed in f32 regardless of DataType.
#ifdef OP_SWIGLU_OAI
fn op(a: f32, b: f32) -> f32 {
let xi = min(a, params.limit);
let gi = max(min(b, params.limit), -params.limit);
var out_glu = xi / (1.0 + exp(-xi * params.alpha));
out_glu = out_glu * (1.0 + gi);
return out_glu;
}
#endif
// GeGLU-erf: exact-form GELU(a) * b, with erf evaluated via the
// Abramowitz & Stegun 7.1.26 polynomial approximation (a1..a5, p).
#ifdef OP_GEGLU_ERF
const p_erf: DataType = 0.3275911;
const a1_erf: DataType = 0.254829592;
const a2_erf: DataType = -0.284496736;
const a3_erf: DataType = 1.421413741;
const a4_erf: DataType = -1.453152027;
const a5_erf: DataType = 1.061405429;
const SQRT_2_INV: DataType = 0.7071067811865476;
fn op(a: DataType, b: DataType) -> DataType {
let a_div_sqr2 = a * SQRT_2_INV;
// The approximation is valid for x >= 0; use erf(-x) = -erf(x).
let sign_x = sign(a_div_sqr2);
let x = abs(a_div_sqr2);
let t = 1.0 / (1.0 + p_erf * x);
let y = 1.0 - (((((a5_erf * t + a4_erf) * t + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x));
let erf_approx = sign_x * y;
return 0.5 * a * (1.0 + erf_approx) * b;
}
#endif
// GeGLU-quick: sigmoid-based GELU approximation, x * sigmoid(1.702 * x)
#ifdef OP_GEGLU_QUICK
const GELU_QUICK_COEF: DataType = -1.702;
fn op(a: DataType, b: DataType) -> DataType {
return a * (1.0 / (1.0 + exp(GELU_QUICK_COEF * a))) * b;
}
#endif
// Push-constant style uniform block: buffer offsets, per-dimension strides,
// the destination shape, and op-specific scalars (alpha/limit for SWIGLU_OAI).
struct Params {
offset_src0: u32,
offset_src1: u32,
offset_dst: u32,
// Strides (in elements)
stride_src01: u32,
stride_src02: u32,
stride_src03: u32,
stride_src11: u32,
stride_src12: u32,
stride_src13: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
// shape of dst
ne: u32,
ne0: u32,
ne1: u32,
ne2: u32,
swapped: u32,
alpha: f32,
limit: f32,
}
@group(0) @binding(0)
var<storage, read_write> src0: array<DataType>;
// NO_SPLIT: both the gate and the value live interleaved in src0; which half
// is which is chosen by params.swapped (offset by ne0 along dim 0).
#ifdef NO_SPLIT
@group(0) @binding(1)
var<storage, read_write> dst: array<DataType>;
@group(0) @binding(2)
var<uniform> params: Params;
fn a_value(base: u32) -> DataType {
let offset: u32 = select(0, params.ne0, params.swapped != 0);
return src0[base + offset];
}
fn b_value(base: u32) -> DataType {
let offset: u32 = select(params.ne0, 0, params.swapped != 0);
return src0[base + offset];
}
#else
// Split layout: gate and value come from two separate buffers (src0/src1).
@group(0) @binding(1)
var<storage, read_write> src1: array<DataType>;
@group(0) @binding(2)
var<storage, read_write> dst: array<DataType>;
@group(0) @binding(3)
var<uniform> params: Params;
fn a_value(base: u32) -> DataType {
return src0[base];
}
fn b_value(base: u32) -> DataType {
return src1[base];
}
#endif
// One invocation per destination element; gid.x is the flat element index.
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
// Guard against the tail of the last workgroup.
if (gid.x >= params.ne) {
return;
}
// Decompose the flat index into 4D coordinates (i3, i2, i1, i0) of dst.
var i = gid.x;
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
i = i % (params.ne2 * params.ne1 * params.ne0);
let i2 = i / (params.ne1 * params.ne0);
i = i % (params.ne1 * params.ne0);
let i1 = i / params.ne0;
let i0 = i % params.ne0;
// Map the coordinates through each tensor's strides (strides in elements).
let i_a = params.offset_src0 + i3 * params.stride_src03 + i2 * params.stride_src02 + i1 * params.stride_src01 + i0;
let i_b = params.offset_src1 + i3 * params.stride_src13 + i2 * params.stride_src12 + i1 * params.stride_src11 + i0;
let i_dst = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1 + i0;
dst[i_dst] = op(a_value(i_a), b_value(i_b));
}

View File

@@ -294,7 +294,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
}
// get extra buffer types of the CPU
// TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
// TODO: a more general solution for non-CPU extra buft should be implemented in the future
// ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
std::vector<ggml_backend_buffer_type_t> buft_extra;
{

View File

@@ -18,7 +18,7 @@ struct llama_ubatch {
}
// typical for M-RoPE cases:
// 0 - sequantial position of the tokens/embeddings in the sequence
// 0 - sequential position of the tokens/embeddings in the sequence
// 1 - y position in the image
// 2 - x position in the image
// 3 - other

View File

@@ -595,7 +595,7 @@ void llama_context::sched_reserve() {
// reserve again with pp graph to avoid ggml-alloc reallocations during inference
{
// TODO: not sure if the following graph would be worster case for multi-stream KV caches:
// TODO: not sure if the following graph would be worst case for multi-stream KV caches:
//
// auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
//

View File

@@ -1665,7 +1665,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
// but this would make the graph topology depend on the number of output tokens, which can interere with
// but this would make the graph topology depend on the number of output tokens, which can interfere with
// features that require constant topology such as pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
//if (n_outputs < n_tokens) {

View File

@@ -333,7 +333,7 @@ public:
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
// store k_cur and v_cur in the cache based on the provided head location
// note: the heads in k_cur and v_cur should be layed out contiguously in memory
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
// - k_idxs [n_tokens]
// - v_cur [n_embd_head_v, n_head_v, n_tokens]

View File

@@ -9,7 +9,7 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);

View File

@@ -9,7 +9,7 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);

View File

@@ -12,7 +12,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
cb(inpL, "inp_scaled", -1);

View File

@@ -91,12 +91,12 @@ int main(int argc, char ** argv) {
common_params params;
g_params = &params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
return 1;
}
common_init();
auto & sparams = params.sampling;
// save choice to use color for later
@@ -147,19 +147,13 @@ int main(int argc, char ** argv) {
ctx = llama_init->context();
model = llama_init->model();
smpl = llama_init->sampler(0);
if (ctx == NULL) {
LOG_ERR("%s: error: unable to create context\n", __func__);
return 1;
}
if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n", __func__);
return 1;
}
smpl = llama_init->sampler(0);
llama_memory_t mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);

View File

@@ -17,11 +17,12 @@ using namespace std::chrono_literals;
int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);
auto mparams = common_model_params_to_llama(params);

View File

@@ -54,11 +54,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
return 1;
}
common_init();
mtmd_helper_log_set(common_log_default_callback, nullptr);
if (params.mmproj.path.empty()) {

View File

@@ -281,11 +281,12 @@ int main(int argc, char ** argv) {
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
return 1;
}
common_init();
mtmd_helper_log_set(common_log_default_callback, nullptr);
if (params.mmproj.path.empty()) {

View File

@@ -259,6 +259,6 @@ npm run test
npm run build
```
After `public/index.html.gz` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
After `public/index.html` has been generated, rebuild `llama-server` as described in the [build](#build) section to include the updated UI.
**Note:** The Vite dev server automatically proxies API requests to `http://localhost:8080`. Make sure `llama-server` is running on that port during development.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@@ -35,8 +35,8 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
std::map<std::string, std::string> headers;
for (auto [key, value] : req.headers) {
auto new_key = key;
if (string_starts_with(new_key, "X-Proxy-Header-")) {
string_replace_all(new_key, "X-Proxy-Header-", "");
if (string_starts_with(new_key, "x-proxy-header-")) {
string_replace_all(new_key, "x-proxy-header-", "");
}
headers[new_key] = value;
}

View File

@@ -10,7 +10,9 @@
#ifdef LLAMA_BUILD_WEBUI
// auto generated files (see README.md for details)
#include "index.html.gz.hpp"
#include "index.html.hpp"
#include "bundle.js.hpp"
#include "bundle.css.hpp"
#include "loading.html.hpp"
#endif
@@ -272,16 +274,19 @@ bool server_http_context::init(const common_params & params) {
} else {
#ifdef LLAMA_BUILD_WEBUI
// using embedded static index.html
srv->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
res.set_content("Error: gzip is not supported by this browser", "text/plain");
} else {
res.set_header("Content-Encoding", "gzip");
// COEP and COOP headers, required by pyodide (python interpreter)
res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
res.set_header("Cross-Origin-Opener-Policy", "same-origin");
res.set_content(reinterpret_cast<const char*>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
}
srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) {
// COEP and COOP headers, required by pyodide (python interpreter)
res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
res.set_header("Cross-Origin-Opener-Policy", "same-origin");
res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
return false;
});
srv->Get(params.api_prefix + "/bundle.js", [](const httplib::Request & /*req*/, httplib::Response & res) {
res.set_content(reinterpret_cast<const char*>(bundle_js), bundle_js_len, "application/javascript; charset=utf-8");
return false;
});
srv->Get(params.api_prefix + "/bundle.css", [](const httplib::Request & /*req*/, httplib::Response & res) {
res.set_content(reinterpret_cast<const char*>(bundle_css), bundle_css_len, "text/css; charset=utf-8");
return false;
});
#endif

View File

@@ -75,6 +75,8 @@ int main(int argc, char ** argv) {
// own arguments required by this example
common_params params;
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
return 1;
}
@@ -100,8 +102,6 @@ int main(int argc, char ** argv) {
params.model_alias.insert(params.model.name);
}
common_init();
// struct that contains llama context and inference
server_context ctx_server;

View File

@@ -188,14 +188,14 @@ The build process:
1. **Vite Build** - Bundles all TypeScript, Svelte, and CSS
2. **Static Adapter** - Outputs to `../public` (llama-server's static file directory)
3. **Post-Build Script** - Cleans up intermediate files
4. **Custom Plugin** - Creates `index.html.gz` with:
4. **Custom Plugin** - Creates `index.html` with:
- Inlined favicon as base64
- GZIP compression (level 9)
- Deterministic output (zeroed timestamps)
```text
tools/server/webui/ → build → tools/server/public/
├── src/ ├── index.html.gz (served by llama-server)
├── src/ ├── index.html (served by llama-server)
├── static/ └── (favicon inlined)
└── ...
```
@@ -219,7 +219,7 @@ output: {
The WebUI is embedded directly into the llama-server binary:
1. `npm run build` outputs `index.html.gz` to `tools/server/public/`
1. `npm run build` outputs `index.html` to `tools/server/public/`
2. llama-server compiles this into the binary at build time
3. When accessing `/`, llama-server serves the gzipped HTML
4. All assets are inlined (CSS, JS, fonts, favicon)

View File

@@ -50,7 +50,6 @@
"eslint-config-prettier": "^10.0.1",
"eslint-plugin-storybook": "^10.2.4",
"eslint-plugin-svelte": "^3.0.0",
"fflate": "^0.8.2",
"globals": "^16.0.0",
"http-server": "^14.1.1",
"mdast": "^3.0.0",

View File

@@ -1,14 +1,12 @@
#!/bin/bash
# Script to install pre-commit and pre-push hooks for webui
# Pre-commit: formats code and runs checks
# Pre-push: builds the project, stashes unstaged changes
# Script to install pre-commit hook for webui
# Pre-commit: formats, checks, builds, and stages build output
REPO_ROOT=$(git rev-parse --show-toplevel)
PRE_COMMIT_HOOK="$REPO_ROOT/.git/hooks/pre-commit"
PRE_PUSH_HOOK="$REPO_ROOT/.git/hooks/pre-push"
echo "Installing pre-commit and pre-push hooks for webui..."
echo "Installing pre-commit hook for webui..."
# Create the pre-commit hook
cat > "$PRE_COMMIT_HOOK" << 'EOF'
@@ -16,21 +14,19 @@ cat > "$PRE_COMMIT_HOOK" << 'EOF'
# Check if there are any changes in the webui directory
if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
echo "Formatting and checking webui code..."
# Change to webui directory and run format
cd tools/server/webui
# Check if npm is available and package.json exists
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT/tools/server/webui"
# Check if package.json exists
if [ ! -f "package.json" ]; then
echo "Error: package.json not found in tools/server/webui"
exit 1
fi
echo "Formatting and checking webui code..."
# Run the format command
npm run format
# Check if format command succeeded
if [ $? -ne 0 ]; then
echo "Error: npm run format failed"
exit 1
@@ -38,8 +34,6 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
# Run the lint command
npm run lint
# Check if lint command succeeded
if [ $? -ne 0 ]; then
echo "Error: npm run lint failed"
exit 1
@@ -47,156 +41,42 @@ if git diff --cached --name-only | grep -q "^tools/server/webui/"; then
# Run the check command
npm run check
# Check if check command succeeded
if [ $? -ne 0 ]; then
echo "Error: npm run check failed"
exit 1
fi
# Go back to repo root
cd ../../..
echo "✅ Webui code formatted and checked successfully"
fi
exit 0
EOF
# Create the pre-push hook
cat > "$PRE_PUSH_HOOK" << 'EOF'
#!/bin/bash
# Check if there are any webui changes that need building
WEBUI_CHANGES=$(git diff --name-only @{push}..HEAD | grep "^tools/server/webui/" || true)
if [ -n "$WEBUI_CHANGES" ]; then
echo "Webui changes detected, checking if build is up-to-date..."
# Change to webui directory
cd tools/server/webui
# Check if npm is available and package.json exists
if [ ! -f "package.json" ]; then
echo "Error: package.json not found in tools/server/webui"
# Build the webui
echo "Building webui..."
npm run build
if [ $? -ne 0 ]; then
echo "❌ npm run build failed"
exit 1
fi
# Check if build output exists and is newer than source files
BUILD_FILE="../public/index.html.gz"
NEEDS_BUILD=false
if [ ! -f "$BUILD_FILE" ]; then
echo "Build output not found, building..."
NEEDS_BUILD=true
else
# Check if any source files are newer than the build output
if find src -newer "$BUILD_FILE" -type f | head -1 | grep -q .; then
echo "Source files are newer than build output, rebuilding..."
NEEDS_BUILD=true
fi
fi
if [ "$NEEDS_BUILD" = true ]; then
echo "Building webui..."
# Stash any unstaged changes to avoid conflicts during build
echo "Checking for unstaged changes..."
if ! git diff --quiet || ! git diff --cached --quiet --diff-filter=A; then
echo "Stashing unstaged changes..."
git stash push --include-untracked -m "Pre-push hook: stashed unstaged changes"
STASH_CREATED=$?
else
echo "No unstaged changes to stash"
STASH_CREATED=1
fi
# Run the build command
npm run build
# Check if build command succeeded
if [ $? -ne 0 ]; then
echo "Error: npm run build failed"
if [ $STASH_CREATED -eq 0 ]; then
echo "You can restore your unstaged changes with: git stash pop"
fi
exit 1
fi
# Go back to repo root
cd ../../..
# Check if build output was created/updated
if [ -f "tools/server/public/index.html.gz" ]; then
# Add the build output and commit it
git add tools/server/public/index.html.gz
if ! git diff --cached --quiet; then
echo "Committing updated build output..."
git commit -m "chore: update webui build output"
echo "✅ Build output committed successfully"
else
echo "Build output unchanged"
fi
else
echo "Error: Build output not found after build"
if [ $STASH_CREATED -eq 0 ]; then
echo "You can restore your unstaged changes with: git stash pop"
fi
exit 1
fi
if [ $STASH_CREATED -eq 0 ]; then
echo "✅ Build completed. Your unstaged changes have been stashed."
echo "They will be automatically restored after the push."
# Create a marker file to indicate stash was created by pre-push hook
touch .git/WEBUI_PUSH_STASH_MARKER
fi
else
echo "✅ Build output is up-to-date"
fi
echo "✅ Webui ready for push"
# Stage the build output alongside the source changes
cd "$REPO_ROOT"
git add tools/server/public/
echo "✅ Webui built and build output staged"
fi
exit 0
EOF
# Create the post-push hook (for restoring stashed changes after push)
cat > "$REPO_ROOT/.git/hooks/post-push" << 'EOF'
#!/bin/bash
# Check if we have a stash marker from the pre-push hook
if [ -f .git/WEBUI_PUSH_STASH_MARKER ]; then
echo "Restoring your unstaged changes after push..."
git stash pop
rm -f .git/WEBUI_PUSH_STASH_MARKER
echo "✅ Your unstaged changes have been restored."
fi
exit 0
EOF
# Make all hooks executable
# Make hook executable
chmod +x "$PRE_COMMIT_HOOK"
chmod +x "$PRE_PUSH_HOOK"
chmod +x "$REPO_ROOT/.git/hooks/post-push"
if [ $? -eq 0 ]; then
echo "✅ Git hooks installed successfully!"
echo "✅ Git hook installed successfully!"
echo " Pre-commit: $PRE_COMMIT_HOOK"
echo " Pre-push: $PRE_PUSH_HOOK"
echo " Post-push: $REPO_ROOT/.git/hooks/post-push"
echo ""
echo "The hooks will automatically:"
echo " • Format and check webui code before commits (pre-commit)"
echo " • Build webui code before pushes (pre-push)"
echo " • Stash unstaged changes during build process"
echo " • Restore your unstaged changes after the push"
echo ""
echo "To test the hooks:"
echo " • Make a change to a file in the webui directory and commit it (triggers format/check)"
echo " • Push your commits to trigger the build process"
echo "The hook will automatically:"
echo " • Format, lint and check webui code before commits"
echo " • Build webui and stage tools/server/public/ into the same commit"
else
echo "❌ Failed to make hooks executable"
echo "❌ Failed to make hook executable"
exit 1
fi

View File

@@ -1,3 +1,3 @@
rm -rf ../public/_app;
rm ../public/favicon.svg;
rm ../public/index.html;
rm -f ../public/index.html.gz; # deprecated, but may still be generated by older versions of the build process

View File

@@ -40,6 +40,17 @@
--code-background: oklch(0.985 0 0);
--code-foreground: oklch(0.145 0 0);
--layer-popover: 1000000;
--chat-form-area-height: 8rem;
--chat-form-area-offset: 2rem;
--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
}
@media (min-width: 640px) {
:root {
--chat-form-area-height: 24rem;
--chat-form-area-offset: 12rem;
}
}
.dark {
@@ -116,19 +127,6 @@
--color-sidebar-ring: var(--sidebar-ring);
}
:root {
--chat-form-area-height: 8rem;
--chat-form-area-offset: 2rem;
--max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem)));
}
@media (min-width: 640px) {
:root {
--chat-form-area-height: 24rem;
--chat-form-area-offset: 12rem;
}
}
@layer base {
* {
@apply border-border outline-ring/50;

View File

@@ -4,7 +4,7 @@
import { getChatActionsContext, setMessageEditContext } from '$lib/contexts';
import { chatStore, pendingEditMessageId } from '$lib/stores/chat.svelte';
import { conversationsStore } from '$lib/stores/conversations.svelte';
import { DatabaseService } from '$lib/services';
import { DatabaseService } from '$lib/services/database.service';
import { SYSTEM_MESSAGE_PLACEHOLDER } from '$lib/constants';
import { MessageRole, AttachmentType } from '$lib/enums';
import {
@@ -19,6 +19,7 @@
interface Props {
class?: string;
message: DatabaseMessage;
toolMessages?: DatabaseMessage[];
isLastAssistantMessage?: boolean;
siblingInfo?: ChatMessageSiblingInfo | null;
}
@@ -26,6 +27,7 @@
let {
class: className = '',
message,
toolMessages = [],
isLastAssistantMessage = false,
siblingInfo = null
}: Props = $props();
@@ -302,6 +304,7 @@
{deletionInfo}
{isLastAssistantMessage}
{message}
{toolMessages}
messageContent={message.content}
onConfirmDelete={handleConfirmDelete}
onContinue={handleContinue}

View File

@@ -6,42 +6,42 @@
SyntaxHighlightedCode
} from '$lib/components/app';
import { config } from '$lib/stores/settings.svelte';
import { Wrench, Loader2, AlertTriangle, Brain } from '@lucide/svelte';
import { AgenticSectionType, AttachmentType, FileTypeText } from '$lib/enums';
import { Wrench, Loader2, Brain } from '@lucide/svelte';
import { AgenticSectionType, FileTypeText } from '$lib/enums';
import { formatJsonPretty } from '$lib/utils';
import { ATTACHMENT_SAVED_REGEX, NEWLINE_SEPARATOR } from '$lib/constants';
import { parseAgenticContent, type AgenticSection } from '$lib/utils';
import type { DatabaseMessage, DatabaseMessageExtraImageFile } from '$lib/types/database';
import {
deriveAgenticSections,
parseToolResultWithImages,
type AgenticSection,
type ToolResultLine
} from '$lib/utils';
import type { DatabaseMessage } from '$lib/types/database';
import type { ChatMessageAgenticTimings, ChatMessageAgenticTurnStats } from '$lib/types/chat';
import { ChatMessageStatsView } from '$lib/enums';
interface Props {
message?: DatabaseMessage;
content: string;
message: DatabaseMessage;
toolMessages?: DatabaseMessage[];
isStreaming?: boolean;
highlightTurns?: boolean;
}
type ToolResultLine = {
text: string;
image?: DatabaseMessageExtraImageFile;
};
let { content, message, isStreaming = false, highlightTurns = false }: Props = $props();
let { message, toolMessages = [], isStreaming = false, highlightTurns = false }: Props = $props();
let expandedStates: Record<number, boolean> = $state({});
const sections = $derived(parseAgenticContent(content));
const showToolCallInProgress = $derived(config().showToolCallInProgress as boolean);
const showThoughtInProgress = $derived(config().showThoughtInProgress as boolean);
// Parse toolResults with images only when sections or message.extra change
const sections = $derived(deriveAgenticSections(message, toolMessages, []));
// Parse tool results with images
const sectionsParsed = $derived(
sections.map((section) => ({
...section,
parsedLines: section.toolResult
? parseToolResultWithImages(section.toolResult, message?.extra)
: []
? parseToolResultWithImages(section.toolResult, section.toolResultExtras || message?.extra)
: ([] as ToolResultLine[])
}))
);
@@ -107,26 +107,6 @@
expandedStates[index] = !currentState;
}
function parseToolResultWithImages(
toolResult: string,
extras?: DatabaseMessage['extra']
): ToolResultLine[] {
const lines = toolResult.split(NEWLINE_SEPARATOR);
return lines.map((line) => {
const match = line.match(ATTACHMENT_SAVED_REGEX);
if (!match || !extras) return { text: line };
const attachmentName = match[1];
const image = extras.find(
(e): e is DatabaseMessageExtraImageFile =>
e.type === AttachmentType.IMAGE && e.name === attachmentName
);
return { text: line, image };
});
}
function buildTurnAgenticTimings(stats: ChatMessageAgenticTurnStats): ChatMessageAgenticTimings {
return {
turns: 1,
@@ -144,9 +124,8 @@
<MarkdownContent content={section.content} attachments={message?.extra} />
</div>
{:else if section.type === AgenticSectionType.TOOL_CALL_STREAMING}
{@const streamingIcon = isStreaming ? Loader2 : AlertTriangle}
{@const streamingIconClass = isStreaming ? 'h-4 w-4 animate-spin' : 'h-4 w-4 text-yellow-500'}
{@const streamingSubtitle = isStreaming ? '' : 'incomplete'}
{@const streamingIcon = isStreaming ? Loader2 : Loader2}
{@const streamingIconClass = isStreaming ? 'h-4 w-4 animate-spin' : 'h-4 w-4'}
<CollapsibleContentBlock
open={isExpanded(index, section)}
@@ -154,7 +133,7 @@
icon={streamingIcon}
iconClass={streamingIconClass}
title={section.toolName || 'Tool call'}
subtitle={streamingSubtitle}
subtitle={isStreaming ? '' : 'incomplete'}
{isStreaming}
onToggle={() => toggleExpanded(index, section)}
>

View File

@@ -15,7 +15,7 @@
import { Check, X } from '@lucide/svelte';
import { Button } from '$lib/components/ui/button';
import { Checkbox } from '$lib/components/ui/checkbox';
import { AGENTIC_TAGS, INPUT_CLASSES, REASONING_TAGS } from '$lib/constants';
import { INPUT_CLASSES } from '$lib/constants';
import { MessageRole, KeyboardKey, ChatMessageStatsView } from '$lib/enums';
import Label from '$lib/components/ui/label/label.svelte';
import { config } from '$lib/stores/settings.svelte';
@@ -23,6 +23,8 @@
import { modelsStore } from '$lib/stores/models.svelte';
import { ServerModelStatus } from '$lib/enums';
import { hasAgenticContent } from '$lib/utils';
interface Props {
class?: string;
deletionInfo: {
@@ -33,6 +35,7 @@
} | null;
isLastAssistantMessage?: boolean;
message: DatabaseMessage;
toolMessages?: DatabaseMessage[];
messageContent: string | undefined;
onCopy: () => void;
onConfirmDelete: () => void;
@@ -53,6 +56,7 @@
deletionInfo,
isLastAssistantMessage = false,
message,
toolMessages = [],
messageContent,
onConfirmDelete,
onContinue,
@@ -84,10 +88,8 @@
}
}
const hasAgenticMarkers = $derived(
messageContent?.includes(AGENTIC_TAGS.TOOL_CALL_START) ?? false
);
const hasReasoningMarkers = $derived(messageContent?.includes(REASONING_TAGS.START) ?? false);
const isAgentic = $derived(hasAgenticContent(message, toolMessages));
const hasReasoning = $derived(!!message.reasoningContent);
const processingState = useProcessingState();
let currentConfig = $derived(config());
@@ -145,7 +147,7 @@
}
let highlightAgenticTurns = $derived(
hasAgenticMarkers &&
isAgentic &&
(currentConfig.alwaysShowAgenticTurns || activeStatsView === ChatMessageStatsView.SUMMARY)
);
@@ -160,13 +162,14 @@
message?.role === MessageRole.ASSISTANT &&
isActivelyProcessing &&
hasNoContent &&
!isAgentic &&
isLastAssistantMessage
);
let showProcessingInfoBottom = $derived(
message?.role === MessageRole.ASSISTANT &&
isActivelyProcessing &&
!hasNoContent &&
(!hasNoContent || isAgentic) &&
isLastAssistantMessage
);
@@ -252,10 +255,10 @@
<pre class="raw-output">{messageContent || ''}</pre>
{:else}
<ChatMessageAgenticContent
content={messageContent || ''}
{message}
{toolMessages}
isStreaming={isChatStreaming()}
highlightTurns={highlightAgenticTurns}
{message}
/>
{/if}
{:else}
@@ -344,9 +347,7 @@
{onCopy}
{onEdit}
{onRegenerate}
onContinue={currentConfig.enableContinueGeneration && !hasReasoningMarkers
? onContinue
: undefined}
onContinue={currentConfig.enableContinueGeneration && !hasReasoning ? onContinue : undefined}
{onForkConversation}
{onDelete}
{onConfirmDelete}

View File

@@ -6,7 +6,12 @@
import { chatStore } from '$lib/stores/chat.svelte';
import { conversationsStore, activeConversation } from '$lib/stores/conversations.svelte';
import { config } from '$lib/stores/settings.svelte';
import { copyToClipboard, formatMessageForClipboard, getMessageSiblings } from '$lib/utils';
import {
copyToClipboard,
formatMessageForClipboard,
getMessageSiblings,
hasAgenticContent
} from '$lib/utils';
interface Props {
class?: string;
@@ -119,32 +124,75 @@
? messages
: messages.filter((msg) => msg.type !== MessageRole.SYSTEM);
let lastAssistantIndex = -1;
// Build display entries, grouping agentic sessions into single entries.
// An agentic session = assistant(with tool_calls) → tool → assistant → tool → ... → assistant(final)
const result: Array<{
message: DatabaseMessage;
toolMessages: DatabaseMessage[];
isLastAssistantMessage: boolean;
siblingInfo: ChatMessageSiblingInfo;
}> = [];
for (let i = filteredMessages.length - 1; i >= 0; i--) {
if (filteredMessages[i].role === MessageRole.ASSISTANT) {
lastAssistantIndex = i;
for (let i = 0; i < filteredMessages.length; i++) {
const msg = filteredMessages[i];
// Skip tool messages - they're grouped with preceding assistant
if (msg.role === MessageRole.TOOL) continue;
const toolMessages: DatabaseMessage[] = [];
if (msg.role === MessageRole.ASSISTANT && hasAgenticContent(msg)) {
let j = i + 1;
while (j < filteredMessages.length) {
const next = filteredMessages[j];
if (next.role === MessageRole.TOOL) {
toolMessages.push(next);
j++;
} else if (next.role === MessageRole.ASSISTANT) {
toolMessages.push(next);
j++;
} else {
break;
}
}
i = j - 1;
} else if (msg.role === MessageRole.ASSISTANT) {
let j = i + 1;
while (j < filteredMessages.length && filteredMessages[j].role === MessageRole.TOOL) {
toolMessages.push(filteredMessages[j]);
j++;
}
}
const siblingInfo = getMessageSiblings(allConversationMessages, msg.id);
result.push({
message: msg,
toolMessages,
isLastAssistantMessage: false,
siblingInfo: siblingInfo || {
message: msg,
siblingIds: [msg.id],
currentIndex: 0,
totalSiblings: 1
}
});
}
// Mark the last assistant message
for (let i = result.length - 1; i >= 0; i--) {
if (result[i].message.role === MessageRole.ASSISTANT) {
result[i].isLastAssistantMessage = true;
break;
}
}
return filteredMessages.map((message, index) => {
const siblingInfo = getMessageSiblings(allConversationMessages, message.id);
const isLastAssistantMessage =
message.role === MessageRole.ASSISTANT && index === lastAssistantIndex;
return {
message,
isLastAssistantMessage,
siblingInfo: siblingInfo || {
message,
siblingIds: [message.id],
currentIndex: 0,
totalSiblings: 1
}
};
});
return result;
});
</script>
@@ -152,11 +200,12 @@
class="flex h-full flex-col space-y-10 pt-24 {className}"
style="height: auto; min-height: calc(100dvh - 14rem);"
>
{#each displayMessages as { message, isLastAssistantMessage, siblingInfo } (message.id)}
{#each displayMessages as { message, toolMessages, isLastAssistantMessage, siblingInfo } (message.id)}
<div use:fadeInView>
<ChatMessage
class="mx-auto w-full max-w-[48rem]"
{message}
{toolMessages}
{isLastAssistantMessage}
{siblingInfo}
/>

View File

@@ -425,21 +425,16 @@ export { default as ChatMessage } from './ChatMessages/ChatMessage.svelte';
/**
* **ChatMessageAgenticContent** - Agentic workflow output display
*
* Specialized renderer for assistant messages containing agentic workflow markers.
* Parses structured content and displays tool calls and reasoning blocks as
* interactive collapsible sections with real-time streaming support.
* Specialized renderer for assistant messages with tool calls and reasoning.
* Derives display sections from structured message data (toolCalls, reasoningContent,
* and child tool result messages) and renders them as interactive collapsible sections.
*
* **Architecture:**
* - Uses `parseAgenticContent()` from `$lib/utils` to parse markers
* - Uses `deriveAgenticSections()` from `$lib/utils` to build sections from structured data
* - Renders sections as CollapsibleContentBlock components
* - Handles streaming state for progressive content display
* - Falls back to MarkdownContent for plain text sections
*
* **Marker Format:**
* - Tool calls: in constants/agentic.ts (AGENTIC_TAGS)
* - Reasoning: in constants/agentic.ts (REASONING_TAGS)
* - Partial markers handled gracefully during streaming
*
* **Execution States:**
* - **Streaming**: Animated spinner, block expanded, auto-scroll enabled
* - **Pending**: Waiting indicator for queued tool calls

View File

@@ -15,8 +15,11 @@ export const DEFAULT_AGENTIC_CONFIG: AgenticConfig = {
maxToolPreviewLines: 25
} as const;
// Agentic tool call tag markers
export const AGENTIC_TAGS = {
/**
* @deprecated Legacy marker tags - only used for migration of old stored messages.
* New messages use structured fields (reasoningContent, toolCalls, toolCallId).
*/
export const LEGACY_AGENTIC_TAGS = {
TOOL_CALL_START: '<<<AGENTIC_TOOL_CALL_START>>>',
TOOL_CALL_END: '<<<AGENTIC_TOOL_CALL_END>>>',
TOOL_NAME_PREFIX: '<<<TOOL_NAME:',
@@ -25,39 +28,25 @@ export const AGENTIC_TAGS = {
TAG_SUFFIX: '>>>'
} as const;
export const REASONING_TAGS = {
/**
* @deprecated Legacy reasoning tags - only used for migration of old stored messages.
* New messages use the dedicated reasoningContent field.
*/
export const LEGACY_REASONING_TAGS = {
START: '<<<reasoning_content_start>>>',
END: '<<<reasoning_content_end>>>'
} as const;
// Regex for trimming leading/trailing newlines
export const TRIM_NEWLINES_REGEX = /^\n+|\n+$/g;
// Regex patterns for parsing agentic content
export const AGENTIC_REGEX = {
// Matches completed tool calls (with END marker)
/**
* @deprecated Legacy regex patterns - only used for migration of old stored messages.
*/
export const LEGACY_AGENTIC_REGEX = {
COMPLETED_TOOL_CALL:
/<<<AGENTIC_TOOL_CALL_START>>>\n<<<TOOL_NAME:(.+?)>>>\n<<<TOOL_ARGS_START>>>([\s\S]*?)<<<TOOL_ARGS_END>>>([\s\S]*?)<<<AGENTIC_TOOL_CALL_END>>>/g,
// Matches pending tool call (has NAME and ARGS but no END)
PENDING_TOOL_CALL:
/<<<AGENTIC_TOOL_CALL_START>>>\n<<<TOOL_NAME:(.+?)>>>\n<<<TOOL_ARGS_START>>>([\s\S]*?)<<<TOOL_ARGS_END>>>([\s\S]*)$/,
// Matches partial tool call (has START and NAME, ARGS still streaming)
PARTIAL_WITH_NAME:
/<<<AGENTIC_TOOL_CALL_START>>>\n<<<TOOL_NAME:(.+?)>>>\n<<<TOOL_ARGS_START>>>([\s\S]*)$/,
// Matches early tool call (just START marker)
EARLY_MATCH: /<<<AGENTIC_TOOL_CALL_START>>>([\s\S]*)$/,
// Matches partial marker at end of content
PARTIAL_MARKER: /<<<[A-Za-z_]*$/,
// Matches reasoning content blocks (including tags)
REASONING_BLOCK: /<<<reasoning_content_start>>>[\s\S]*?<<<reasoning_content_end>>>/g,
// Captures the reasoning text between start/end tags
REASONING_EXTRACT: /<<<reasoning_content_start>>>([\s\S]*?)<<<reasoning_content_end>>>/,
// Matches an opening reasoning tag and any remaining content (unterminated)
REASONING_OPEN: /<<<reasoning_content_start>>>[\s\S]*$/,
// Matches a complete agentic tool call display block (start to end marker)
AGENTIC_TOOL_CALL_BLOCK: /\n*<<<AGENTIC_TOOL_CALL_START>>>[\s\S]*?<<<AGENTIC_TOOL_CALL_END>>>/g,
// Matches a pending/partial agentic tool call (start marker with no matching end)
AGENTIC_TOOL_CALL_OPEN: /\n*<<<AGENTIC_TOOL_CALL_START>>>[\s\S]*$/,
// Matches tool name inside content
TOOL_NAME_EXTRACT: /<<<TOOL_NAME:([^>]+)>>>/
HAS_LEGACY_MARKERS: /<<<(?:AGENTIC_TOOL_CALL_START|reasoning_content_start)>>>/
} as const;

View File

@@ -1,6 +1,7 @@
import { getJsonHeaders, formatAttachmentText, isAbortError } from '$lib/utils';
import { getJsonHeaders } from '$lib/utils/api-headers';
import { formatAttachmentText } from '$lib/utils/formatters';
import { isAbortError } from '$lib/utils/abort';
import {
AGENTIC_REGEX,
ATTACHMENT_LABEL_PDF_FILE,
ATTACHMENT_LABEL_MCP_PROMPT,
ATTACHMENT_LABEL_MCP_RESOURCE
@@ -17,38 +18,6 @@ import type { DatabaseMessageExtraMcpPrompt, DatabaseMessageExtraMcpResource } f
import { modelsStore } from '$lib/stores/models.svelte';
export class ChatService {
private static stripReasoningContent(
content: ApiChatMessageData['content'] | null | undefined
): ApiChatMessageData['content'] | null | undefined {
if (!content) {
return content;
}
if (typeof content === 'string') {
return content
.replace(AGENTIC_REGEX.REASONING_BLOCK, '')
.replace(AGENTIC_REGEX.REASONING_OPEN, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '');
}
if (!Array.isArray(content)) {
return content;
}
return content.map((part: ApiChatMessageContentPart) => {
if (part.type !== ContentPartType.TEXT || !part.text) return part;
return {
...part,
text: part.text
.replace(AGENTIC_REGEX.REASONING_BLOCK, '')
.replace(AGENTIC_REGEX.REASONING_OPEN, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '')
};
});
}
/**
*
*
@@ -57,46 +26,6 @@ export class ChatService {
*
*/
/**
* Extracts reasoning text from content that contains internal reasoning tags.
* Returns the concatenated reasoning content or undefined if none found.
*/
private static extractReasoningFromContent(
content: ApiChatMessageData['content'] | null | undefined
): string | undefined {
if (!content) return undefined;
const extractFromString = (text: string): string => {
const parts: string[] = [];
// Use a fresh regex instance to avoid shared lastIndex state
const re = new RegExp(AGENTIC_REGEX.REASONING_EXTRACT.source);
let match = re.exec(text);
while (match) {
parts.push(match[1]);
// advance past the matched portion and retry
text = text.slice(match.index + match[0].length);
match = re.exec(text);
}
return parts.join('');
};
if (typeof content === 'string') {
const result = extractFromString(content);
return result || undefined;
}
if (!Array.isArray(content)) return undefined;
const parts: string[] = [];
for (const part of content) {
if (part.type === ContentPartType.TEXT && part.text) {
const result = extractFromString(part.text);
if (result) parts.push(result);
}
}
return parts.length > 0 ? parts.join('') : undefined;
}
/**
* Sends a chat completion request to the llama.cpp server.
* Supports both streaming and non-streaming responses with comprehensive parameter configuration.
@@ -201,20 +130,15 @@ export class ChatService {
const requestBody: ApiChatCompletionRequest = {
messages: normalizedMessages.map((msg: ApiChatMessageData) => {
// Always strip internal reasoning/agentic tags from content
const cleanedContent = ChatService.stripReasoningContent(msg.content);
const mapped: ApiChatCompletionRequest['messages'][0] = {
role: msg.role,
content: cleanedContent,
content: msg.content,
tool_calls: msg.tool_calls,
tool_call_id: msg.tool_call_id
};
// When preserving reasoning, extract it from raw content and send as separate field
if (!excludeReasoningFromContext) {
const reasoning = ChatService.extractReasoningFromContent(msg.content);
if (reasoning) {
mapped.reasoning_content = reasoning;
}
// Include reasoning_content from the dedicated field
if (!excludeReasoningFromContext && msg.reasoning_content) {
mapped.reasoning_content = msg.reasoning_content;
}
return mapped;
}),
@@ -730,6 +654,10 @@ export class ChatService {
content: message.content
};
if (message.reasoningContent) {
result.reasoning_content = message.reasoningContent;
}
if (toolCalls && toolCalls.length > 0) {
result.tool_calls = toolCalls;
}
@@ -858,6 +786,9 @@ export class ChatService {
role: message.role as MessageRole,
content: contentParts
};
if (message.reasoningContent) {
result.reasoning_content = message.reasoningContent;
}
if (toolCalls && toolCalls.length > 0) {
result.tool_calls = toolCalls;
}

View File

@@ -42,6 +42,7 @@ import type {
import {
buildProxiedUrl,
buildProxiedHeaders,
getAuthHeaders,
throwIfAborted,
isAbortError,
createBase64DataUrl
@@ -124,7 +125,14 @@ export class MCPService {
const requestInit: RequestInit = {};
if (config.headers) {
requestInit.headers = buildProxiedHeaders(config.headers);
requestInit.headers = config.useProxy ? buildProxiedHeaders(config.headers) : config.headers;
}
if (useProxy) {
requestInit.headers = {
...getAuthHeaders(),
...(requestInit.headers as Record<string, string>)
};
}
if (config.credentials) {

View File

@@ -7,6 +7,10 @@
* - Session state management
* - Turn limit enforcement
*
* Each agentic turn produces separate DB messages:
* - One assistant message per LLM turn (with tool_calls if any)
* - One tool result message per tool call execution
*
* **Architecture & Relationships:**
* - **ChatService**: Stateless API layer (sendMessage, streaming)
* - **mcpStore**: MCP connection management and tool execution
@@ -16,7 +20,6 @@
* @see mcpStore in stores/mcp.svelte.ts for MCP operations
*/
import { SvelteMap } from 'svelte/reactivity';
import { ChatService } from '$lib/services';
import { config } from '$lib/stores/settings.svelte';
import { mcpStore } from '$lib/stores/mcp.svelte';
@@ -24,7 +27,6 @@ import { modelsStore } from '$lib/stores/models.svelte';
import { isAbortError } from '$lib/utils';
import {
DEFAULT_AGENTIC_CONFIG,
AGENTIC_TAGS,
NEWLINE_SEPARATOR,
TURN_LIMIT_MESSAGE,
LLM_ERROR_BLOCK_START,
@@ -193,17 +195,6 @@ class AgenticStore {
async runAgenticFlow(params: AgenticFlowParams): Promise<AgenticFlowResult> {
const { conversationId, messages, options = {}, callbacks, signal, perChatOverrides } = params;
const {
onChunk,
onReasoningChunk,
onToolCallChunk,
onAttachments,
onModel,
onComplete,
onError,
onTimings,
onTurnComplete
} = callbacks;
const agenticConfig = this.getConfig(config(), perChatOverrides);
if (!agenticConfig.enabled) return { handled: false };
@@ -253,24 +244,14 @@ class AgenticStore {
options,
tools,
agenticConfig,
callbacks: {
onChunk,
onReasoningChunk,
onToolCallChunk,
onAttachments,
onModel,
onComplete,
onError,
onTimings,
onTurnComplete
},
callbacks,
signal
});
return { handled: true };
} catch (error) {
const normalizedError = error instanceof Error ? error : new Error(String(error));
this.updateSession(conversationId, { lastError: normalizedError });
onError?.(normalizedError);
callbacks.onError?.(normalizedError);
return { handled: true, error: normalizedError };
} finally {
this.updateSession(conversationId, { isRunning: false });
@@ -295,17 +276,20 @@ class AgenticStore {
const {
onChunk,
onReasoningChunk,
onToolCallChunk,
onToolCallsStreaming,
onAttachments,
onModel,
onComplete,
onAssistantTurnComplete,
createToolResultMessage,
createAssistantMessage,
onFlowComplete,
onTimings,
onTurnComplete
} = callbacks;
const sessionMessages: AgenticMessage[] = toAgenticMessages(messages);
const allToolCalls: ApiChatCompletionToolCall[] = [];
let capturedTimings: ChatMessageTimings | undefined;
let totalToolCallCount = 0;
const agenticTimings: ChatMessageAgenticTimings = {
turns: 0,
@@ -316,12 +300,7 @@ class AgenticStore {
llm: { predicted_n: 0, predicted_ms: 0, prompt_n: 0, prompt_ms: 0 }
};
const maxTurns = agenticConfig.maxTurns;
const maxToolPreviewLines = agenticConfig.maxToolPreviewLines;
// Resolve effective model for vision capability checks.
// In ROUTER mode, options.model is always set by the caller.
// In MODEL mode, options.model is undefined; use the single loaded model
// which carries modalities bridged from /props.
const effectiveModel = options.model || modelsStore.models[0]?.model || '';
for (let turn = 0; turn < maxTurns; turn++) {
@@ -329,23 +308,20 @@ class AgenticStore {
agenticTimings.turns = turn + 1;
if (signal?.aborted) {
onComplete?.(
'',
undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
return;
}
// For turns > 0, create a new assistant message via callback
if (turn > 0 && createAssistantMessage) {
await createAssistantMessage();
}
let turnContent = '';
let turnReasoningContent = '';
let turnToolCalls: ApiChatCompletionToolCall[] = [];
let lastStreamingToolCallName = '';
let lastStreamingToolCallArgsLength = 0;
const emittedToolCallStates = new SvelteMap<
number,
{ emittedOnce: boolean; lastArgs: string }
>();
let turnTimings: ChatMessageTimings | undefined;
const turnStats: ChatMessageAgenticTurnStats = {
@@ -366,30 +342,15 @@ class AgenticStore {
turnContent += chunk;
onChunk?.(chunk);
},
onReasoningChunk,
onReasoningChunk: (chunk: string) => {
turnReasoningContent += chunk;
onReasoningChunk?.(chunk);
},
onToolCallChunk: (serialized: string) => {
try {
turnToolCalls = JSON.parse(serialized) as ApiChatCompletionToolCall[];
for (let i = 0; i < turnToolCalls.length; i++) {
const toolCall = turnToolCalls[i];
const toolName = toolCall.function?.name ?? '';
const toolArgs = toolCall.function?.arguments ?? '';
const state = emittedToolCallStates.get(i) || {
emittedOnce: false,
lastArgs: ''
};
if (!state.emittedOnce) {
const output = `\n\n${AGENTIC_TAGS.TOOL_CALL_START}\n${AGENTIC_TAGS.TOOL_NAME_PREFIX}${toolName}${AGENTIC_TAGS.TAG_SUFFIX}\n${AGENTIC_TAGS.TOOL_ARGS_START}\n${toolArgs}`;
onChunk?.(output);
state.emittedOnce = true;
state.lastArgs = toolArgs;
emittedToolCallStates.set(i, state);
} else if (toolArgs.length > state.lastArgs.length) {
onChunk?.(toolArgs.slice(state.lastArgs.length));
state.lastArgs = toolArgs;
emittedToolCallStates.set(i, state);
}
}
onToolCallsStreaming?.(turnToolCalls);
if (turnToolCalls.length > 0 && turnToolCalls[0]?.function) {
const name = turnToolCalls[0].function.name || '';
const args = turnToolCalls[0].function.arguments || '';
@@ -442,77 +403,84 @@ class AgenticStore {
}
} catch (error) {
if (signal?.aborted) {
onComplete?.(
'',
undefined,
// Save whatever we have for this turn before exiting
await onAssistantTurnComplete?.(
turnContent,
turnReasoningContent || undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
return;
}
const normalizedError = error instanceof Error ? error : new Error('LLM stream error');
// Save error as content in the current turn
onChunk?.(`${LLM_ERROR_BLOCK_START}${normalizedError.message}${LLM_ERROR_BLOCK_END}`);
onComplete?.(
'',
undefined,
await onAssistantTurnComplete?.(
turnContent + `${LLM_ERROR_BLOCK_START}${normalizedError.message}${LLM_ERROR_BLOCK_END}`,
turnReasoningContent || undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
throw normalizedError;
}
// No tool calls = final turn, save and complete
if (turnToolCalls.length === 0) {
agenticTimings.perTurn!.push(turnStats);
onComplete?.(
'',
undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
const finalTimings = this.buildFinalTimings(capturedTimings, agenticTimings);
await onAssistantTurnComplete?.(
turnContent,
turnReasoningContent || undefined,
finalTimings,
undefined
);
if (finalTimings) onTurnComplete?.(finalTimings);
onFlowComplete?.(finalTimings);
return;
}
// Normalize and save assistant turn with tool calls
const normalizedCalls = this.normalizeToolCalls(turnToolCalls);
if (normalizedCalls.length === 0) {
onComplete?.(
'',
undefined,
await onAssistantTurnComplete?.(
turnContent,
turnReasoningContent || undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
return;
}
for (const call of normalizedCalls) {
allToolCalls.push({
id: call.id,
type: call.type,
function: call.function ? { ...call.function } : undefined
});
}
totalToolCallCount += normalizedCalls.length;
this.updateSession(conversationId, { totalToolCalls: totalToolCallCount });
this.updateSession(conversationId, { totalToolCalls: allToolCalls.length });
onToolCallChunk?.(JSON.stringify(allToolCalls));
// Save the assistant message with its tool calls
await onAssistantTurnComplete?.(
turnContent,
turnReasoningContent || undefined,
turnTimings,
normalizedCalls
);
// Add assistant message to session history
sessionMessages.push({
role: MessageRole.ASSISTANT,
content: turnContent || undefined,
tool_calls: normalizedCalls
});
// Execute each tool call and create result messages
for (const toolCall of normalizedCalls) {
if (signal?.aborted) {
onComplete?.(
'',
undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
return;
}
@@ -530,13 +498,7 @@ class AgenticStore {
result = executionResult.content;
} catch (error) {
if (isAbortError(error)) {
onComplete?.(
'',
undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
return;
}
result = `Error: ${error instanceof Error ? error.message : String(error)}`;
@@ -557,21 +519,27 @@ class AgenticStore {
turnStats.toolsMs += Math.round(toolDurationMs);
if (signal?.aborted) {
onComplete?.(
'',
undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
return;
}
const { cleanedResult, attachments } = this.extractBase64Attachments(result);
if (attachments.length > 0) onAttachments?.(attachments);
this.emitToolCallResult(cleanedResult, maxToolPreviewLines, onChunk);
// Create the tool result message in the DB
let toolResultMessage: DatabaseMessage | undefined;
if (createToolResultMessage) {
toolResultMessage = await createToolResultMessage(
toolCall.id,
cleanedResult,
attachments.length > 0 ? attachments : undefined
);
}
if (attachments.length > 0 && toolResultMessage) {
onAttachments?.(toolResultMessage.id, attachments);
}
// Build content parts for session history (including images for vision models)
const contentParts: ApiChatMessageContentPart[] = [
{ type: ContentPartType.TEXT, text: cleanedResult }
];
@@ -605,8 +573,15 @@ class AgenticStore {
}
}
// Turn limit reached
onChunk?.(TURN_LIMIT_MESSAGE);
onComplete?.('', undefined, this.buildFinalTimings(capturedTimings, agenticTimings), undefined);
await onAssistantTurnComplete?.(
TURN_LIMIT_MESSAGE,
undefined,
this.buildFinalTimings(capturedTimings, agenticTimings),
undefined
);
onFlowComplete?.(this.buildFinalTimings(capturedTimings, agenticTimings));
}
private buildFinalTimings(
@@ -633,23 +608,6 @@ class AgenticStore {
}));
}
private emitToolCallResult(
result: string,
maxLines: number,
emit?: (chunk: string) => void
): void {
if (!emit) {
return;
}
let output = `${NEWLINE_SEPARATOR}${AGENTIC_TAGS.TOOL_ARGS_END}`;
const lines = result.split(NEWLINE_SEPARATOR);
const trimmedLines = lines.length > maxLines ? lines.slice(-maxLines) : lines;
output += `${NEWLINE_SEPARATOR}${trimmedLines.join(NEWLINE_SEPARATOR)}${NEWLINE_SEPARATOR}${AGENTIC_TAGS.TOOL_CALL_END}${NEWLINE_SEPARATOR}`;
emit(output);
}
private extractBase64Attachments(result: string): {
cleanedResult: string;
attachments: DatabaseMessageExtra[];

View File

@@ -12,7 +12,8 @@
*/
import { SvelteMap } from 'svelte/reactivity';
import { DatabaseService, ChatService } from '$lib/services';
import { DatabaseService } from '$lib/services/database.service';
import { ChatService } from '$lib/services/chat.service';
import { conversationsStore } from '$lib/stores/conversations.svelte';
import { config } from '$lib/stores/settings.svelte';
import { agenticStore } from '$lib/stores/agentic.svelte';
@@ -34,7 +35,6 @@ import {
import {
MAX_INACTIVE_CONVERSATION_STATES,
INACTIVE_CONVERSATION_STATE_MAX_AGE_MS,
REASONING_TAGS,
SYSTEM_MESSAGE_PLACEHOLDER
} from '$lib/constants';
import type {
@@ -50,15 +50,6 @@ interface ConversationStateEntry {
lastAccessed: number;
}
const countOccurrences = (source: string, token: string): number =>
source ? source.split(token).length - 1 : 0;
const hasUnclosedReasoningTag = (content: string): boolean =>
countOccurrences(content, REASONING_TAGS.START) > countOccurrences(content, REASONING_TAGS.END);
const wrapReasoningContent = (content: string, reasoningContent?: string): string => {
if (!reasoningContent) return content;
return `${REASONING_TAGS.START}${reasoningContent}${REASONING_TAGS.END}${content}`;
};
class ChatStore {
activeProcessingState = $state<ApiProcessingState | null>(null);
currentResponse = $state('');
@@ -557,83 +548,76 @@ class ChatStore {
await modelsStore.fetchModelProps(effectiveModel);
}
let streamedContent = '',
streamedToolCallContent = '',
isReasoningOpen = false,
hasStreamedChunks = false,
resolvedModel: string | null = null,
modelPersisted = false;
let streamedExtras: DatabaseMessageExtra[] = assistantMessage.extra
? JSON.parse(JSON.stringify(assistantMessage.extra))
: [];
// Mutable state for the current message being streamed
let currentMessageId = assistantMessage.id;
let streamedContent = '';
let streamedReasoningContent = '';
let resolvedModel: string | null = null;
let modelPersisted = false;
const convId = assistantMessage.convId;
const recordModel = (modelName: string | null | undefined, persistImmediately = true): void => {
if (!modelName) return;
const n = normalizeModelName(modelName);
if (!n || n === resolvedModel) return;
resolvedModel = n;
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
const idx = conversationsStore.findMessageIndex(currentMessageId);
conversationsStore.updateMessageAtIndex(idx, { model: n });
if (persistImmediately && !modelPersisted) {
modelPersisted = true;
DatabaseService.updateMessage(assistantMessage.id, { model: n }).catch(() => {
DatabaseService.updateMessage(currentMessageId, { model: n }).catch(() => {
modelPersisted = false;
resolvedModel = null;
});
}
};
const updateStreamingContent = () => {
this.setChatStreaming(assistantMessage.convId, streamedContent, assistantMessage.id);
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
const updateStreamingUI = () => {
this.setChatStreaming(convId, streamedContent, currentMessageId);
const idx = conversationsStore.findMessageIndex(currentMessageId);
conversationsStore.updateMessageAtIndex(idx, { content: streamedContent });
};
const appendContentChunk = (chunk: string) => {
if (isReasoningOpen) {
streamedContent += REASONING_TAGS.END;
isReasoningOpen = false;
}
streamedContent += chunk;
hasStreamedChunks = true;
updateStreamingContent();
};
const appendReasoningChunk = (chunk: string) => {
if (!isReasoningOpen) {
streamedContent += REASONING_TAGS.START;
isReasoningOpen = true;
}
streamedContent += chunk;
hasStreamedChunks = true;
updateStreamingContent();
};
const finalizeReasoning = () => {
if (isReasoningOpen) {
streamedContent += REASONING_TAGS.END;
isReasoningOpen = false;
}
const cleanupStreamingState = () => {
this.setStreamingActive(false);
this.setChatLoading(convId, false);
this.clearChatStreaming(convId);
this.setProcessingState(convId, null);
};
this.setStreamingActive(true);
this.setActiveProcessingConversation(assistantMessage.convId);
const abortController = this.getOrCreateAbortController(assistantMessage.convId);
this.setActiveProcessingConversation(convId);
const abortController = this.getOrCreateAbortController(convId);
const streamCallbacks: ChatStreamCallbacks = {
onChunk: (chunk: string) => appendContentChunk(chunk),
onReasoningChunk: (chunk: string) => appendReasoningChunk(chunk),
onToolCallChunk: (chunk: string) => {
const c = chunk.trim();
if (!c) return;
streamedToolCallContent = c;
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
conversationsStore.updateMessageAtIndex(idx, { toolCalls: streamedToolCallContent });
onChunk: (chunk: string) => {
streamedContent += chunk;
updateStreamingUI();
},
onAttachments: (extras: DatabaseMessageExtra[]) => {
onReasoningChunk: (chunk: string) => {
streamedReasoningContent += chunk;
// Update UI to show reasoning is being received
const idx = conversationsStore.findMessageIndex(currentMessageId);
conversationsStore.updateMessageAtIndex(idx, {
reasoningContent: streamedReasoningContent
});
},
onToolCallsStreaming: (toolCalls) => {
const idx = conversationsStore.findMessageIndex(currentMessageId);
conversationsStore.updateMessageAtIndex(idx, { toolCalls: JSON.stringify(toolCalls) });
},
onAttachments: (messageId: string, extras: DatabaseMessageExtra[]) => {
if (!extras.length) return;
streamedExtras = [...streamedExtras, ...extras];
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
conversationsStore.updateMessageAtIndex(idx, { extra: streamedExtras });
DatabaseService.updateMessage(assistantMessage.id, { extra: streamedExtras }).catch(
console.error
);
const idx = conversationsStore.findMessageIndex(messageId);
if (idx === -1) return;
const msg = conversationsStore.activeMessages[idx];
const updatedExtras = [...(msg.extra || []), ...extras];
conversationsStore.updateMessageAtIndex(idx, { extra: updatedExtras });
DatabaseService.updateMessage(messageId, { extra: updatedExtras }).catch(console.error);
},
onModel: (modelName: string) => recordModel(modelName),
onTurnComplete: (intermediateTimings: ChatMessageTimings) => {
// Update the first assistant message with cumulative agentic timings
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
conversationsStore.updateMessageAtIndex(idx, { timings: intermediateTimings });
},
@@ -651,56 +635,104 @@ class ChatStore {
cache_n: timings?.cache_n || 0,
prompt_progress: promptProgress
},
assistantMessage.convId
convId
);
},
onComplete: async (
finalContent?: string,
reasoningContent?: string,
timings?: ChatMessageTimings,
toolCallContent?: string
onAssistantTurnComplete: async (
content: string,
reasoningContent: string | undefined,
timings: ChatMessageTimings | undefined,
toolCalls: import('$lib/types/api').ApiChatCompletionToolCall[] | undefined
) => {
this.setStreamingActive(false);
finalizeReasoning();
const combinedContent = hasStreamedChunks
? streamedContent
: wrapReasoningContent(finalContent || '', reasoningContent);
const updateData: Record<string, unknown> = {
content: combinedContent,
toolCalls: toolCallContent || streamedToolCallContent,
content,
reasoningContent: reasoningContent || undefined,
toolCalls: toolCalls ? JSON.stringify(toolCalls) : '',
timings
};
if (streamedExtras.length > 0) updateData.extra = streamedExtras;
if (resolvedModel && !modelPersisted) updateData.model = resolvedModel;
await DatabaseService.updateMessage(assistantMessage.id, updateData);
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
await DatabaseService.updateMessage(currentMessageId, updateData);
const idx = conversationsStore.findMessageIndex(currentMessageId);
const uiUpdate: Partial<DatabaseMessage> = {
content: combinedContent,
toolCalls: updateData.toolCalls as string
content,
reasoningContent: reasoningContent || undefined,
toolCalls: toolCalls ? JSON.stringify(toolCalls) : ''
};
if (streamedExtras.length > 0) uiUpdate.extra = streamedExtras;
if (timings) uiUpdate.timings = timings;
if (resolvedModel) uiUpdate.model = resolvedModel;
conversationsStore.updateMessageAtIndex(idx, uiUpdate);
await conversationsStore.updateCurrentNode(assistantMessage.id);
if (onComplete) await onComplete(combinedContent);
this.setChatLoading(assistantMessage.convId, false);
this.clearChatStreaming(assistantMessage.convId);
this.setProcessingState(assistantMessage.convId, null);
await conversationsStore.updateCurrentNode(currentMessageId);
},
createToolResultMessage: async (
toolCallId: string,
content: string,
extras?: DatabaseMessageExtra[]
) => {
const msg = await DatabaseService.createMessageBranch(
{
convId,
type: MessageType.TEXT,
role: MessageRole.TOOL,
content,
toolCallId,
timestamp: Date.now(),
toolCalls: '',
children: [],
extra: extras
},
currentMessageId
);
conversationsStore.addMessageToActive(msg);
await conversationsStore.updateCurrentNode(msg.id);
return msg;
},
createAssistantMessage: async () => {
// Reset streaming state for new message
streamedContent = '';
streamedReasoningContent = '';
const lastMsg =
conversationsStore.activeMessages[conversationsStore.activeMessages.length - 1];
const msg = await DatabaseService.createMessageBranch(
{
convId,
type: MessageType.TEXT,
role: MessageRole.ASSISTANT,
content: '',
timestamp: Date.now(),
toolCalls: '',
children: [],
model: resolvedModel
},
lastMsg.id
);
conversationsStore.addMessageToActive(msg);
currentMessageId = msg.id;
return msg;
},
onFlowComplete: (finalTimings?: ChatMessageTimings) => {
if (finalTimings) {
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
conversationsStore.updateMessageAtIndex(idx, { timings: finalTimings });
DatabaseService.updateMessage(assistantMessage.id, { timings: finalTimings }).catch(
console.error
);
}
cleanupStreamingState();
if (onComplete) onComplete(streamedContent);
if (isRouterMode()) modelsStore.fetchRouterModels().catch(console.error);
},
onError: (error: Error) => {
this.setStreamingActive(false);
if (isAbortError(error)) {
this.setChatLoading(assistantMessage.convId, false);
this.clearChatStreaming(assistantMessage.convId);
this.setProcessingState(assistantMessage.convId, null);
cleanupStreamingState();
return;
}
console.error('Streaming error:', error);
this.setChatLoading(assistantMessage.convId, false);
this.clearChatStreaming(assistantMessage.convId);
this.setProcessingState(assistantMessage.convId, null);
cleanupStreamingState();
const idx = conversationsStore.findMessageIndex(assistantMessage.id);
if (idx !== -1) {
const failedMessage = conversationsStore.removeMessageAtIndex(idx);
@@ -717,12 +749,13 @@ class ChatStore {
if (onError) onError(error);
}
};
const perChatOverrides = conversationsStore.activeConversation?.mcpServerOverrides;
const agenticConfig = agenticStore.getConfig(config(), perChatOverrides);
if (agenticConfig.enabled) {
const agenticResult = await agenticStore.runAgenticFlow({
conversationId: assistantMessage.convId,
conversationId: convId,
messages: allMessages,
options: { ...this.getApiOptions(), ...(effectiveModel ? { model: effectiveModel } : {}) },
callbacks: streamCallbacks,
@@ -732,16 +765,50 @@ class ChatStore {
if (agenticResult.handled) return;
}
const completionOptions = {
...this.getApiOptions(),
...(effectiveModel ? { model: effectiveModel } : {}),
...streamCallbacks
};
// Non-agentic path: direct streaming into the single assistant message
await ChatService.sendMessage(
allMessages,
completionOptions,
assistantMessage.convId,
{
...this.getApiOptions(),
...(effectiveModel ? { model: effectiveModel } : {}),
stream: true,
onChunk: streamCallbacks.onChunk,
onReasoningChunk: streamCallbacks.onReasoningChunk,
onModel: streamCallbacks.onModel,
onTimings: streamCallbacks.onTimings,
onComplete: async (
finalContent?: string,
reasoningContent?: string,
timings?: ChatMessageTimings,
toolCalls?: string
) => {
const content = streamedContent || finalContent || '';
const reasoning = streamedReasoningContent || reasoningContent;
const updateData: Record<string, unknown> = {
content,
reasoningContent: reasoning || undefined,
toolCalls: toolCalls || '',
timings
};
if (resolvedModel && !modelPersisted) updateData.model = resolvedModel;
await DatabaseService.updateMessage(currentMessageId, updateData);
const idx = conversationsStore.findMessageIndex(currentMessageId);
const uiUpdate: Partial<DatabaseMessage> = {
content,
reasoningContent: reasoning || undefined,
toolCalls: toolCalls || ''
};
if (timings) uiUpdate.timings = timings;
if (resolvedModel) uiUpdate.model = resolvedModel;
conversationsStore.updateMessageAtIndex(idx, uiUpdate);
await conversationsStore.updateCurrentNode(currentMessageId);
cleanupStreamingState();
if (onComplete) await onComplete(content);
if (isRouterMode()) modelsStore.fetchRouterModels().catch(console.error);
},
onError: streamCallbacks.onError
},
convId,
abortController.signal
);
}
@@ -1033,56 +1100,40 @@ class ChatStore {
}
const originalContent = dbMessage.content;
const originalReasoning = dbMessage.reasoningContent || '';
const conversationContext = conversationsStore.activeMessages.slice(0, idx);
const contextWithContinue = [
...conversationContext,
{ role: MessageRole.ASSISTANT as const, content: originalContent }
];
let appendedContent = '',
hasReceivedContent = false,
isReasoningOpen = hasUnclosedReasoningTag(originalContent);
let appendedContent = '';
let appendedReasoning = '';
let hasReceivedContent = false;
const updateStreamingContent = (fullContent: string) => {
this.setChatStreaming(msg.convId, fullContent, msg.id);
conversationsStore.updateMessageAtIndex(idx, { content: fullContent });
};
const appendContentChunk = (chunk: string) => {
if (isReasoningOpen) {
appendedContent += REASONING_TAGS.END;
isReasoningOpen = false;
}
appendedContent += chunk;
hasReceivedContent = true;
updateStreamingContent(originalContent + appendedContent);
};
const appendReasoningChunk = (chunk: string) => {
if (!isReasoningOpen) {
appendedContent += REASONING_TAGS.START;
isReasoningOpen = true;
}
appendedContent += chunk;
hasReceivedContent = true;
updateStreamingContent(originalContent + appendedContent);
};
const finalizeReasoning = () => {
if (isReasoningOpen) {
appendedContent += REASONING_TAGS.END;
isReasoningOpen = false;
}
};
const abortController = this.getOrCreateAbortController(msg.convId);
await ChatService.sendMessage(
contextWithContinue,
{
...this.getApiOptions(),
onChunk: (chunk: string) => appendContentChunk(chunk),
onReasoningChunk: (chunk: string) => appendReasoningChunk(chunk),
onChunk: (chunk: string) => {
appendedContent += chunk;
hasReceivedContent = true;
updateStreamingContent(originalContent + appendedContent);
},
onReasoningChunk: (chunk: string) => {
appendedReasoning += chunk;
hasReceivedContent = true;
conversationsStore.updateMessageAtIndex(idx, {
reasoningContent: originalReasoning + appendedReasoning
});
},
onTimings: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => {
const tokensPerSecond =
timings?.predicted_ms && timings?.predicted_n
@@ -1105,21 +1156,23 @@ class ChatStore {
reasoningContent?: string,
timings?: ChatMessageTimings
) => {
finalizeReasoning();
const appendedFromCompletion = hasReceivedContent
? appendedContent
: wrapReasoningContent(finalContent || '', reasoningContent);
const fullContent = originalContent + appendedFromCompletion;
const finalAppendedContent = hasReceivedContent ? appendedContent : finalContent || '';
const finalAppendedReasoning = hasReceivedContent
? appendedReasoning
: reasoningContent || '';
const fullContent = originalContent + finalAppendedContent;
const fullReasoning = originalReasoning + finalAppendedReasoning || undefined;
await DatabaseService.updateMessage(msg.id, {
content: fullContent,
reasoningContent: fullReasoning,
timestamp: Date.now(),
timings
});
conversationsStore.updateMessageAtIndex(idx, {
content: fullContent,
reasoningContent: fullReasoning,
timestamp: Date.now(),
timings
});
@@ -1135,11 +1188,13 @@ class ChatStore {
if (hasReceivedContent && appendedContent) {
await DatabaseService.updateMessage(msg.id, {
content: originalContent + appendedContent,
reasoningContent: originalReasoning + appendedReasoning || undefined,
timestamp: Date.now()
});
conversationsStore.updateMessageAtIndex(idx, {
content: originalContent + appendedContent,
reasoningContent: originalReasoning + appendedReasoning || undefined,
timestamp: Date.now()
});
}

View File

@@ -23,7 +23,7 @@ import { browser } from '$app/environment';
import { toast } from 'svelte-sonner';
import { DatabaseService } from '$lib/services/database.service';
import { config } from '$lib/stores/settings.svelte';
import { filterByLeafNodeId, findLeafNode } from '$lib/utils';
import { filterByLeafNodeId, findLeafNode, runLegacyMigration } from '$lib/utils';
import type { McpServerOverride } from '$lib/types/database';
import { MessageRole } from '$lib/enums';
import {
@@ -128,6 +128,10 @@ class ConversationsStore {
if (this.isInitialized) return;
try {
// @deprecated Legacy migration for old marker-based messages.
// Remove once all users have migrated to the structured format.
await runLegacyMigration();
await this.loadConversations();
this.isInitialized = true;
} catch (error) {

View File

@@ -2,6 +2,7 @@ import type { MessageRole } from '$lib/enums';
import { ToolCallType } from '$lib/enums';
import type {
ApiChatCompletionRequest,
ApiChatCompletionToolCall,
ApiChatMessageContentPart,
ApiChatMessageData
} from './api';
@@ -70,22 +71,48 @@ export interface AgenticSession {
}
/**
* Callbacks for agentic flow execution
* Callbacks for agentic flow execution.
*
* The agentic loop creates separate DB messages for each turn:
* - assistant messages (one per LLM turn, with tool_calls if any)
* - tool result messages (one per tool call execution)
*
* The first assistant message is created by the caller before starting the flow.
* Subsequent messages are created via createToolResultMessage / createAssistantMessage.
*/
export interface AgenticFlowCallbacks {
/** Content chunk for the current assistant message */
onChunk?: (chunk: string) => void;
/** Reasoning content chunk for the current assistant message */
onReasoningChunk?: (chunk: string) => void;
onToolCallChunk?: (serializedToolCalls: string) => void;
onAttachments?: (extras: DatabaseMessageExtra[]) => void;
/** Tool calls being streamed (partial, accumulating) for the current turn */
onToolCallsStreaming?: (toolCalls: ApiChatCompletionToolCall[]) => void;
/** Attachments extracted from tool results */
onAttachments?: (messageId: string, extras: DatabaseMessageExtra[]) => void;
/** Model name detected from response */
onModel?: (model: string) => void;
onComplete?: (
/** Current assistant turn's streaming is complete - save to DB */
onAssistantTurnComplete?: (
content: string,
reasoningContent?: string,
timings?: ChatMessageTimings,
toolCalls?: string
) => void;
reasoningContent: string | undefined,
timings: ChatMessageTimings | undefined,
toolCalls: ApiChatCompletionToolCall[] | undefined
) => Promise<void>;
/** Create a tool result message in the DB tree */
createToolResultMessage?: (
toolCallId: string,
content: string,
extras?: DatabaseMessageExtra[]
) => Promise<DatabaseMessage>;
/** Create a new assistant message for the next agentic turn */
createAssistantMessage?: () => Promise<DatabaseMessage>;
/** Entire agentic flow is complete */
onFlowComplete?: (timings?: ChatMessageTimings) => void;
/** Error during flow */
onError?: (error: Error) => void;
/** Timing updates during streaming */
onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
/** An agentic turn (LLM + tool execution) completed - intermediate timing update */
onTurnComplete?: (intermediateTimings: ChatMessageTimings) => void;
}

View File

@@ -1,5 +1,6 @@
import type { ErrorDialogType } from '$lib/enums';
import type { DatabaseMessageExtra } from './database';
import type { ApiChatCompletionToolCall } from './api';
import type { DatabaseMessage, DatabaseMessageExtra } from './database';
export interface ChatUploadedFile {
id: string;
@@ -99,21 +100,28 @@ export interface ChatMessageToolCallTiming {
}
/**
* Callbacks for streaming chat responses
* Callbacks for streaming chat responses (used by both agentic and non-agentic paths)
*/
export interface ChatStreamCallbacks {
onChunk?: (chunk: string) => void;
onReasoningChunk?: (chunk: string) => void;
onToolCallChunk?: (chunk: string) => void;
onAttachments?: (extras: DatabaseMessageExtra[]) => void;
onToolCallsStreaming?: (toolCalls: ApiChatCompletionToolCall[]) => void;
onAttachments?: (messageId: string, extras: DatabaseMessageExtra[]) => void;
onModel?: (model: string) => void;
onTimings?: (timings?: ChatMessageTimings, promptProgress?: ChatMessagePromptProgress) => void;
onComplete?: (
content?: string,
reasoningContent?: string,
timings?: ChatMessageTimings,
toolCallContent?: string
) => void;
onAssistantTurnComplete?: (
content: string,
reasoningContent: string | undefined,
timings: ChatMessageTimings | undefined,
toolCalls: ApiChatCompletionToolCall[] | undefined
) => Promise<void>;
createToolResultMessage?: (
toolCallId: string,
content: string,
extras?: DatabaseMessageExtra[]
) => Promise<DatabaseMessage>;
createAssistantMessage?: () => Promise<DatabaseMessage>;
onFlowComplete?: (timings?: ChatMessageTimings) => void;
onError?: (error: Error) => void;
onTurnComplete?: (intermediateTimings: ChatMessageTimings) => void;
}

View File

@@ -92,6 +92,8 @@ export interface DatabaseMessage {
* @deprecated - left for backward compatibility
*/
thinking?: string;
/** Reasoning content produced by the model (separate from visible content) */
reasoningContent?: string;
/** Serialized JSON array of tool calls made by assistant messages */
toolCalls?: string;
/** Tool call ID for tool result messages (role: 'tool') */

View File

@@ -1,8 +1,15 @@
import { AgenticSectionType } from '$lib/enums';
import { AGENTIC_TAGS, AGENTIC_REGEX, REASONING_TAGS, TRIM_NEWLINES_REGEX } from '$lib/constants';
import { AgenticSectionType, MessageRole } from '$lib/enums';
import { ATTACHMENT_SAVED_REGEX, NEWLINE_SEPARATOR } from '$lib/constants';
import type { ApiChatCompletionToolCall } from '$lib/types/api';
import type {
DatabaseMessage,
DatabaseMessageExtra,
DatabaseMessageExtraImageFile
} from '$lib/types/database';
import { AttachmentType } from '$lib/enums';
/**
* Represents a parsed section of agentic content
* Represents a parsed section of agentic content for display
*/
export interface AgenticSection {
type: AgenticSectionType;
@@ -10,63 +17,70 @@ export interface AgenticSection {
toolName?: string;
toolArgs?: string;
toolResult?: string;
toolResultExtras?: DatabaseMessageExtra[];
}
/**
* Represents a segment of content that may contain reasoning blocks
* Represents a tool result line that may reference an image attachment
*/
type ReasoningSegment = {
type:
| AgenticSectionType.TEXT
| AgenticSectionType.REASONING
| AgenticSectionType.REASONING_PENDING;
content: string;
export type ToolResultLine = {
text: string;
image?: DatabaseMessageExtraImageFile;
};
/**
* Parses agentic content into structured sections
* Derives display sections from a single assistant message and its direct tool results.
*
* Main parsing function that processes content containing:
* - Tool calls (completed, pending, or streaming)
* - Reasoning blocks (completed or streaming)
* - Regular text content
*
* The parser handles chronological display of agentic flow output, maintaining
* the order of operations and properly identifying different states of tool calls
* and reasoning blocks during streaming.
*
* @param rawContent - The raw content string to parse
* @returns Array of structured agentic sections ready for display
*
* @example
* ```typescript
* const content = "Some text <<<AGENTIC_TOOL_CALL>>>tool_name...";
* const sections = parseAgenticContent(content);
* // Returns: [{ type: 'text', content: 'Some text' }, { type: 'tool_call_streaming', ... }]
* ```
* @param message - The assistant message
* @param toolMessages - Tool result messages for this assistant's tool_calls
* @param streamingToolCalls - Partial tool calls during streaming (not yet persisted)
*/
export function parseAgenticContent(rawContent: string): AgenticSection[] {
if (!rawContent) return [];
const segments = splitReasoningSegments(rawContent);
function deriveSingleTurnSections(
message: DatabaseMessage,
toolMessages: DatabaseMessage[] = [],
streamingToolCalls: ApiChatCompletionToolCall[] = []
): AgenticSection[] {
const sections: AgenticSection[] = [];
for (const segment of segments) {
if (segment.type === AgenticSectionType.TEXT) {
sections.push(...parseToolCallContent(segment.content));
continue;
}
if (segment.type === AgenticSectionType.REASONING) {
if (segment.content.trim()) {
sections.push({ type: AgenticSectionType.REASONING, content: segment.content });
}
continue;
}
// 1. Reasoning content (from dedicated field)
if (message.reasoningContent) {
sections.push({
type: AgenticSectionType.REASONING_PENDING,
content: segment.content
type: AgenticSectionType.REASONING,
content: message.reasoningContent
});
}
// 2. Text content
if (message.content?.trim()) {
sections.push({
type: AgenticSectionType.TEXT,
content: message.content
});
}
// 3. Persisted tool calls (from message.toolCalls field)
const toolCalls = parseToolCalls(message.toolCalls);
for (const tc of toolCalls) {
const resultMsg = toolMessages.find((m) => m.toolCallId === tc.id);
sections.push({
type: resultMsg ? AgenticSectionType.TOOL_CALL : AgenticSectionType.TOOL_CALL_PENDING,
content: resultMsg?.content || '',
toolName: tc.function?.name,
toolArgs: tc.function?.arguments,
toolResult: resultMsg?.content,
toolResultExtras: resultMsg?.extra
});
}
// 4. Streaming tool calls (not yet persisted - currently being received)
for (const tc of streamingToolCalls) {
// Skip if already in persisted tool calls
if (tc.id && toolCalls.find((t) => t.id === tc.id)) continue;
sections.push({
type: AgenticSectionType.TOOL_CALL_STREAMING,
content: '',
toolName: tc.function?.name,
toolArgs: tc.function?.arguments
});
}
@@ -74,211 +88,123 @@ export function parseAgenticContent(rawContent: string): AgenticSection[] {
}
/**
* Parses content containing tool call markers
* Derives display sections from structured message data.
*
* Identifies and extracts tool calls from content, handling:
* - Completed tool calls with name, arguments, and results
* - Pending tool calls (execution in progress)
* - Streaming tool calls (arguments being received)
* - Early-stage tool calls (just started)
* Handles both single-turn (one assistant + its tool results) and multi-turn
* agentic sessions (multiple assistant + tool messages grouped together).
*
* @param rawContent - The raw content string to parse
* @returns Array of agentic sections representing tool calls and text
* When `toolMessages` contains continuation assistant messages (from multi-turn
* agentic flows), they are processed in order to produce sections across all turns.
*
* @param message - The first/anchor assistant message
* @param toolMessages - Tool result messages and continuation assistant messages
* @param streamingToolCalls - Partial tool calls during streaming (not yet persisted)
* @param isStreaming - Whether the message is currently being streamed
*/
function parseToolCallContent(rawContent: string): AgenticSection[] {
if (!rawContent) return [];
export function deriveAgenticSections(
message: DatabaseMessage,
toolMessages: DatabaseMessage[] = [],
streamingToolCalls: ApiChatCompletionToolCall[] = []
): AgenticSection[] {
const hasAssistantContinuations = toolMessages.some((m) => m.role === MessageRole.ASSISTANT);
if (!hasAssistantContinuations) {
return deriveSingleTurnSections(message, toolMessages, streamingToolCalls);
}
const sections: AgenticSection[] = [];
const completedToolCallRegex = new RegExp(AGENTIC_REGEX.COMPLETED_TOOL_CALL.source, 'g');
const firstTurnToolMsgs = collectToolMessages(toolMessages, 0);
sections.push(...deriveSingleTurnSections(message, firstTurnToolMsgs));
let lastIndex = 0;
let match;
let i = firstTurnToolMsgs.length;
while ((match = completedToolCallRegex.exec(rawContent)) !== null) {
if (match.index > lastIndex) {
const textBefore = rawContent.slice(lastIndex, match.index).trim();
if (textBefore) {
sections.push({ type: AgenticSectionType.TEXT, content: textBefore });
}
while (i < toolMessages.length) {
const msg = toolMessages[i];
if (msg.role === MessageRole.ASSISTANT) {
const turnToolMsgs = collectToolMessages(toolMessages, i + 1);
const isLastTurn = i + 1 + turnToolMsgs.length >= toolMessages.length;
sections.push(
...deriveSingleTurnSections(msg, turnToolMsgs, isLastTurn ? streamingToolCalls : [])
);
i += 1 + turnToolMsgs.length;
} else {
i++;
}
const toolName = match[1];
const toolArgs = match[2];
const toolResult = match[3].replace(TRIM_NEWLINES_REGEX, '');
sections.push({
type: AgenticSectionType.TOOL_CALL,
content: toolResult,
toolName,
toolArgs,
toolResult
});
lastIndex = match.index + match[0].length;
}
const remainingContent = rawContent.slice(lastIndex);
const pendingMatch = remainingContent.match(AGENTIC_REGEX.PENDING_TOOL_CALL);
const partialWithNameMatch = remainingContent.match(AGENTIC_REGEX.PARTIAL_WITH_NAME);
const earlyMatch = remainingContent.match(AGENTIC_REGEX.EARLY_MATCH);
if (pendingMatch) {
const pendingIndex = remainingContent.indexOf(AGENTIC_TAGS.TOOL_CALL_START);
if (pendingIndex > 0) {
const textBefore = remainingContent.slice(0, pendingIndex).trim();
if (textBefore) {
sections.push({ type: AgenticSectionType.TEXT, content: textBefore });
}
}
const toolName = pendingMatch[1];
const toolArgs = pendingMatch[2];
const streamingResult = (pendingMatch[3] || '').replace(TRIM_NEWLINES_REGEX, '');
sections.push({
type: AgenticSectionType.TOOL_CALL_PENDING,
content: streamingResult,
toolName,
toolArgs,
toolResult: streamingResult || undefined
});
} else if (partialWithNameMatch) {
const pendingIndex = remainingContent.indexOf(AGENTIC_TAGS.TOOL_CALL_START);
if (pendingIndex > 0) {
const textBefore = remainingContent.slice(0, pendingIndex).trim();
if (textBefore) {
sections.push({ type: AgenticSectionType.TEXT, content: textBefore });
}
}
const partialArgs = partialWithNameMatch[2] || '';
sections.push({
type: AgenticSectionType.TOOL_CALL_STREAMING,
content: '',
toolName: partialWithNameMatch[1],
toolArgs: partialArgs || undefined,
toolResult: undefined
});
} else if (earlyMatch) {
const pendingIndex = remainingContent.indexOf(AGENTIC_TAGS.TOOL_CALL_START);
if (pendingIndex > 0) {
const textBefore = remainingContent.slice(0, pendingIndex).trim();
if (textBefore) {
sections.push({ type: AgenticSectionType.TEXT, content: textBefore });
}
}
const nameMatch = earlyMatch[1]?.match(AGENTIC_REGEX.TOOL_NAME_EXTRACT);
sections.push({
type: AgenticSectionType.TOOL_CALL_STREAMING,
content: '',
toolName: nameMatch?.[1],
toolArgs: undefined,
toolResult: undefined
});
} else if (lastIndex < rawContent.length) {
let remainingText = rawContent.slice(lastIndex).trim();
const partialMarkerMatch = remainingText.match(AGENTIC_REGEX.PARTIAL_MARKER);
if (partialMarkerMatch) {
remainingText = remainingText.slice(0, partialMarkerMatch.index).trim();
}
if (remainingText) {
sections.push({ type: AgenticSectionType.TEXT, content: remainingText });
}
}
if (sections.length === 0 && rawContent.trim()) {
sections.push({ type: AgenticSectionType.TEXT, content: rawContent });
}
return sections;
}
/**
* Strips partial marker from text content
*
* Removes incomplete agentic markers (e.g., "<<<", "<<<AGENTIC") that may appear
* at the end of streaming content.
*
* @param text - The text content to process
* @returns Text with partial markers removed
* Collect consecutive tool messages starting at `startIndex`.
*/
function stripPartialMarker(text: string): string {
const partialMarkerMatch = text.match(AGENTIC_REGEX.PARTIAL_MARKER);
function collectToolMessages(messages: DatabaseMessage[], startIndex: number): DatabaseMessage[] {
const result: DatabaseMessage[] = [];
if (partialMarkerMatch) {
return text.slice(0, partialMarkerMatch.index).trim();
for (let i = startIndex; i < messages.length; i++) {
if (messages[i].role === MessageRole.TOOL) {
result.push(messages[i]);
} else {
break;
}
}
return text;
return result;
}
/**
* Splits raw content into segments based on reasoning blocks
*
* Identifies and extracts reasoning content wrapped in REASONING_TAGS.START/END markers,
* separating it from regular text content. Handles both complete and incomplete
* (streaming) reasoning blocks.
*
* @param rawContent - The raw content string to parse
* @returns Array of reasoning segments with their types and content
* Parse tool result text into lines, matching image attachments by name.
*/
function splitReasoningSegments(rawContent: string): ReasoningSegment[] {
if (!rawContent) return [];
export function parseToolResultWithImages(
toolResult: string,
extras?: DatabaseMessageExtra[]
): ToolResultLine[] {
const lines = toolResult.split(NEWLINE_SEPARATOR);
return lines.map((line) => {
const match = line.match(ATTACHMENT_SAVED_REGEX);
if (!match || !extras) return { text: line };
const segments: ReasoningSegment[] = [];
let cursor = 0;
const attachmentName = match[1];
const image = extras.find(
(e): e is DatabaseMessageExtraImageFile =>
e.type === AttachmentType.IMAGE && e.name === attachmentName
);
while (cursor < rawContent.length) {
const startIndex = rawContent.indexOf(REASONING_TAGS.START, cursor);
return { text: line, image };
});
}
if (startIndex === -1) {
const remainingText = rawContent.slice(cursor);
/**
* Safely parse the toolCalls JSON string from a DatabaseMessage.
*/
function parseToolCalls(toolCallsJson?: string): ApiChatCompletionToolCall[] {
if (!toolCallsJson) return [];
if (remainingText) {
segments.push({ type: AgenticSectionType.TEXT, content: remainingText });
}
try {
const parsed = JSON.parse(toolCallsJson);
break;
}
return Array.isArray(parsed) ? parsed : [];
} catch {
return [];
}
}
if (startIndex > cursor) {
const textBefore = rawContent.slice(cursor, startIndex);
/**
* Check if a message has agentic content (tool calls or is part of an agentic flow).
*/
export function hasAgenticContent(
message: DatabaseMessage,
toolMessages: DatabaseMessage[] = []
): boolean {
if (message.toolCalls) {
const tc = parseToolCalls(message.toolCalls);
if (textBefore) {
segments.push({ type: AgenticSectionType.TEXT, content: textBefore });
}
}
const contentStart = startIndex + REASONING_TAGS.START.length;
const endIndex = rawContent.indexOf(REASONING_TAGS.END, contentStart);
if (endIndex === -1) {
const pendingContent = rawContent.slice(contentStart);
segments.push({
type: AgenticSectionType.REASONING_PENDING,
content: stripPartialMarker(pendingContent)
});
break;
}
const reasoningContent = rawContent.slice(contentStart, endIndex);
segments.push({ type: AgenticSectionType.REASONING, content: reasoningContent });
cursor = endIndex + REASONING_TAGS.END.length;
if (tc.length > 0) return true;
}
return segments;
return toolMessages.length > 0;
}

View File

@@ -28,7 +28,7 @@ export function buildProxiedHeaders(headers: Record<string, string>): Record<str
const proxiedHeaders: Record<string, string> = {};
for (const [key, value] of Object.entries(headers)) {
proxiedHeaders[`X-Proxy-Header-${key}`] = value;
proxiedHeaders[`x-proxy-header-${key}`] = value;
}
return proxiedHeaders;

View File

@@ -149,8 +149,17 @@ export { parseHeadersToArray, serializeHeaders } from './headers';
// Favicon utilities
export { getFaviconUrl } from './favicon';
// Agentic content parsing utilities
export { parseAgenticContent, type AgenticSection } from './agentic';
// Agentic content utilities (structured section derivation)
export {
deriveAgenticSections,
parseToolResultWithImages,
hasAgenticContent,
type AgenticSection,
type ToolResultLine
} from './agentic';
// Legacy migration utilities
export { runLegacyMigration, isMigrationNeeded } from './legacy-migration';
// Cache utilities
export { TTLCache, ReactiveTTLMap, type TTLCacheOptions } from './cache-ttl';

View File

@@ -0,0 +1,345 @@
/**
* @deprecated Legacy migration utility — remove at some point in the future once all users have migrated to the new structured agentic message format.
*
* Converts old marker-based agentic messages to the new structured format
* with separate messages per turn.
*
* Old format: Single assistant message with markers in content:
* <<<reasoning_content_start>>>...<<<reasoning_content_end>>>
* <<<AGENTIC_TOOL_CALL_START>>>...<<<AGENTIC_TOOL_CALL_END>>>
*
* New format: Separate messages per turn:
* - assistant (content + reasoningContent + toolCalls)
* - tool (toolCallId + content)
* - assistant (next turn)
* - ...
*/
import { LEGACY_AGENTIC_REGEX, LEGACY_REASONING_TAGS } from '$lib/constants';
import { DatabaseService } from '$lib/services/database.service';
import { MessageRole, MessageType } from '$lib/enums';
import type { DatabaseMessage } from '$lib/types/database';
const MIGRATION_DONE_KEY = 'llama-webui-migration-v2-done';
/**
* @deprecated Part of legacy migration — remove with the migration module.
* Check if migration has been performed.
*/
export function isMigrationNeeded(): boolean {
	// When localStorage is unavailable (privacy mode, SSR), report "no migration"
	// rather than throwing during app startup.
	try {
		const doneMarker = localStorage.getItem(MIGRATION_DONE_KEY);
		return doneMarker === null || doneMarker === '';
	} catch {
		return false;
	}
}
/**
* Mark migration as done.
*/
function markMigrationDone(): void {
	// Record the completion time; the value itself is unused — only its presence matters.
	const completedAt = String(Date.now());
	try {
		localStorage.setItem(MIGRATION_DONE_KEY, completedAt);
	} catch {
		// Swallow storage failures (private browsing, quota) — worst case the
		// migration check runs again on next startup.
	}
}
/**
* Check if a message has legacy markers in its content.
*/
function hasLegacyMarkers(message: DatabaseMessage): boolean {
	// Messages without content cannot carry markers.
	const text = message.content;
	if (!text) return false;
	return LEGACY_AGENTIC_REGEX.HAS_LEGACY_MARKERS.test(text);
}
/**
* Extract reasoning content from legacy marker format.
*/
function extractLegacyReasoning(content: string): { reasoning: string; cleanContent: string } {
	// Collect the inner text of every complete reasoning block.
	// A fresh 'g'-flagged regex is built so exec() iteration starts at index 0.
	const extractor = new RegExp(LEGACY_AGENTIC_REGEX.REASONING_EXTRACT.source, 'g');
	const pieces: string[] = [];
	for (let m = extractor.exec(content); m !== null; m = extractor.exec(content)) {
		pieces.push(m[1]);
	}
	// Strip complete reasoning blocks first, then any dangling open marker
	// left over from an interrupted stream.
	const cleanContent = content
		.replace(new RegExp(LEGACY_AGENTIC_REGEX.REASONING_BLOCK.source, 'g'), '')
		.replace(LEGACY_AGENTIC_REGEX.REASONING_OPEN, '');
	return { reasoning: pieces.join(''), cleanContent };
}
/**
* Parse legacy content with tool call markers into structured turns.
*/
interface ParsedTurn {
	// Assistant text emitted before this turn's tool calls.
	textBefore: string;
	// Completed tool calls of this turn (name, raw args string, result text).
	toolCalls: Array<{
		name: string;
		args: string;
		result: string;
	}>;
}
// Splits legacy marker-embedded content into assistant turns. A "turn" is any
// text plus the tool calls that follow it; text appearing AFTER a tool result
// signals that a new LLM turn began.
function parseLegacyToolCalls(content: string): ParsedTurn[] {
	const turns: ParsedTurn[] = [];
	// Fresh 'g'-flagged copy so exec() iteration starts from index 0.
	const regex = new RegExp(LEGACY_AGENTIC_REGEX.COMPLETED_TOOL_CALL.source, 'g');
	let lastIndex = 0;
	let currentTurn: ParsedTurn = { textBefore: '', toolCalls: [] };
	let match;
	while ((match = regex.exec(content)) !== null) {
		const textBefore = content.slice(lastIndex, match.index).trim();
		// If there's text between tool calls and we already have tool calls,
		// that means a new turn started (text after tool results = new LLM turn)
		if (textBefore && currentTurn.toolCalls.length > 0) {
			turns.push(currentTurn);
			currentTurn = { textBefore, toolCalls: [] };
		} else if (textBefore && currentTurn.toolCalls.length === 0) {
			currentTurn.textBefore = textBefore;
		}
		// Capture groups: [1]=tool name, [2]=arguments, [3]=result
		// (result has leading/trailing newlines trimmed).
		currentTurn.toolCalls.push({
			name: match[1],
			args: match[2],
			result: match[3].replace(/^\n+|\n+$/g, '')
		});
		lastIndex = match.index + match[0].length;
	}
	// Any remaining text after the last tool call
	const remainingText = content.slice(lastIndex).trim();
	if (currentTurn.toolCalls.length > 0) {
		turns.push(currentTurn);
	}
	// If there's text after all tool calls, it's the final assistant response
	if (remainingText) {
		// Remove any partial/open markers
		const cleanRemaining = remainingText
			.replace(LEGACY_AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '')
			.trim();
		if (cleanRemaining) {
			turns.push({ textBefore: cleanRemaining, toolCalls: [] });
		}
	}
	// If no tool calls found at all, return the original content as a single turn
	if (turns.length === 0) {
		turns.push({ textBefore: content.trim(), toolCalls: [] });
	}
	return turns;
}
/**
* Migrate a single conversation's messages from legacy format to new format.
*/
async function migrateConversation(convId: string): Promise<number> {
	const allMessages = await DatabaseService.getConversationMessages(convId);
	let migratedCount = 0;
	// Only assistant messages can carry legacy markers / tool calls.
	for (const message of allMessages) {
		if (message.role !== MessageRole.ASSISTANT) continue;
		if (!hasLegacyMarkers(message)) {
			// Still check for reasoning-only markers (no tool calls)
			if (message.content?.includes(LEGACY_REASONING_TAGS.START)) {
				const { reasoning, cleanContent } = extractLegacyReasoning(message.content);
				await DatabaseService.updateMessage(message.id, {
					content: cleanContent.trim(),
					reasoningContent: reasoning || undefined
				});
				migratedCount++;
			}
			continue;
		}
		// Has agentic markers - full migration needed
		// (hasLegacyMarkers() returning true guarantees message.content is non-empty here)
		const { reasoning, cleanContent } = extractLegacyReasoning(message.content);
		const turns = parseLegacyToolCalls(cleanContent);
		// Parse existing toolCalls JSON to try to match IDs
		let existingToolCalls: Array<{ id: string; function?: { name: string; arguments: string } }> =
			[];
		if (message.toolCalls) {
			try {
				existingToolCalls = JSON.parse(message.toolCalls);
			} catch {
				// Ignore
			}
		}
		// First turn uses the existing message
		const firstTurn = turns[0];
		if (!firstTurn) continue;
		// Match tool calls from the first turn to existing IDs
		// (match by function name first, fall back to positional index, then a synthetic id)
		const firstTurnToolCalls = firstTurn.toolCalls.map((tc, i) => {
			const existing =
				existingToolCalls.find((e) => e.function?.name === tc.name) || existingToolCalls[i];
			return {
				id: existing?.id || `legacy_tool_${i}`,
				type: 'function' as const,
				function: { name: tc.name, arguments: tc.args }
			};
		});
		// Update the existing message for the first turn
		await DatabaseService.updateMessage(message.id, {
			content: firstTurn.textBefore,
			reasoningContent: reasoning || undefined,
			toolCalls: firstTurnToolCalls.length > 0 ? JSON.stringify(firstTurnToolCalls) : ''
		});
		// currentParentId tracks the tail of the chain as new messages are appended.
		let currentParentId = message.id;
		let toolCallIdCounter = existingToolCalls.length;
		// Create tool result messages for the first turn
		for (let i = 0; i < firstTurn.toolCalls.length; i++) {
			const tc = firstTurn.toolCalls[i];
			const toolCallId = firstTurnToolCalls[i]?.id || `legacy_tool_${i}`;
			const toolMsg = await DatabaseService.createMessageBranch(
				{
					convId,
					type: MessageType.TEXT,
					role: MessageRole.TOOL,
					content: tc.result,
					toolCallId,
					// +i+1 keeps tool results ordered after the assistant message.
					timestamp: message.timestamp + i + 1,
					toolCalls: '',
					children: []
				},
				currentParentId
			);
			currentParentId = toolMsg.id;
		}
		// Create messages for subsequent turns
		for (let turnIdx = 1; turnIdx < turns.length; turnIdx++) {
			const turn = turns[turnIdx];
			const turnToolCalls = turn.toolCalls.map((tc, i) => {
				const idx = toolCallIdCounter + i;
				const existing = existingToolCalls[idx];
				return {
					id: existing?.id || `legacy_tool_${idx}`,
					type: 'function' as const,
					function: { name: tc.name, arguments: tc.args }
				};
			});
			toolCallIdCounter += turn.toolCalls.length;
			// Create assistant message for this turn
			const assistantMsg = await DatabaseService.createMessageBranch(
				{
					convId,
					type: MessageType.TEXT,
					role: MessageRole.ASSISTANT,
					content: turn.textBefore,
					// *100 leaves room for up to ~99 tool results per turn while
					// keeping timestamps monotonically increasing across turns.
					timestamp: message.timestamp + turnIdx * 100,
					toolCalls: turnToolCalls.length > 0 ? JSON.stringify(turnToolCalls) : '',
					children: [],
					model: message.model
				},
				currentParentId
			);
			currentParentId = assistantMsg.id;
			// Create tool result messages for this turn
			for (let i = 0; i < turn.toolCalls.length; i++) {
				const tc = turn.toolCalls[i];
				const toolCallId = turnToolCalls[i]?.id || `legacy_tool_${toolCallIdCounter + i}`;
				const toolMsg = await DatabaseService.createMessageBranch(
					{
						convId,
						type: MessageType.TEXT,
						role: MessageRole.TOOL,
						content: tc.result,
						toolCallId,
						timestamp: message.timestamp + turnIdx * 100 + i + 1,
						toolCalls: '',
						children: []
					},
					currentParentId
				);
				currentParentId = toolMsg.id;
			}
		}
		// Re-parent any children of the original message to the last created message
		// (the original message's children list was the next user message or similar)
		if (message.children.length > 0 && currentParentId !== message.id) {
			for (const childId of message.children) {
				// Skip children we just created (they were already properly parented)
				const child = allMessages.find((m) => m.id === childId);
				if (!child) continue;
				// Only re-parent non-tool messages that were original children
				if (child.role !== MessageRole.TOOL) {
					await DatabaseService.updateMessage(childId, { parent: currentParentId });
					// Add to new parent's children
					// (re-fetch so we see children added earlier in this loop)
					const newParent = await DatabaseService.getConversationMessages(convId).then((msgs) =>
						msgs.find((m) => m.id === currentParentId)
					);
					if (newParent && !newParent.children.includes(childId)) {
						await DatabaseService.updateMessage(currentParentId, {
							children: [...newParent.children, childId]
						});
					}
				}
			}
			// Clear re-parented children from the original message
			await DatabaseService.updateMessage(message.id, { children: [] });
		}
		migratedCount++;
	}
	return migratedCount;
}
/**
* @deprecated Part of legacy migration — remove with the migration module.
* Run the full migration across all conversations.
* This should be called once at app startup if migration is needed.
*/
export async function runLegacyMigration(): Promise<void> {
	// Skip entirely once the done-marker is present.
	if (!isMigrationNeeded()) return;
	console.log('[Migration] Starting legacy message format migration...');
	try {
		const conversations = await DatabaseService.getAllConversations();
		// Migrate conversations sequentially and tally how many messages changed.
		let totalMigrated = 0;
		for (const conv of conversations) {
			totalMigrated += await migrateConversation(conv.id);
		}
		if (totalMigrated > 0) {
			console.log(
				`[Migration] Migrated ${totalMigrated} messages across ${conversations.length} conversations`
			);
		} else {
			console.log('[Migration] No legacy messages found, marking as done');
		}
		markMigrationDone();
	} catch (error) {
		console.error('[Migration] Failed to migrate legacy messages:', error);
		// Still mark as done to avoid infinite retry loops
		markMigrationDone();
	}
}

View File

@@ -22,7 +22,7 @@ const config = {
strict: true
}),
output: {
bundleStrategy: 'inline'
bundleStrategy: 'single'
},
alias: {
$styles: 'src/styles'

View File

@@ -2,5 +2,5 @@ import { expect, test } from '@playwright/test';
test('home page has expected h1', async ({ page }) => {
await page.goto('/');
await expect(page.locator('h1')).toBeVisible();
await expect(page.locator('h1').first()).toBeVisible();
});

View File

@@ -0,0 +1,211 @@
import { describe, it, expect } from 'vitest';
import { deriveAgenticSections, hasAgenticContent } from '$lib/utils/agentic';
import { AgenticSectionType, MessageRole } from '$lib/enums';
import type { DatabaseMessage } from '$lib/types/database';
import type { ApiChatCompletionToolCall } from '$lib/types/api';
// Build a minimal assistant DatabaseMessage; any field can be overridden.
// The trailing spread wins, so overrides replace defaults key-for-key.
function makeAssistant(overrides: Partial<DatabaseMessage> = {}): DatabaseMessage {
	const defaults = {
		id: 'ast-1',
		convId: 'conv-1',
		type: 'text',
		timestamp: Date.now(),
		role: MessageRole.ASSISTANT,
		content: '',
		parent: null,
		children: []
	};
	return { ...defaults, ...overrides } as DatabaseMessage;
}
// Build a minimal tool-result DatabaseMessage; any field can be overridden.
function makeToolMsg(overrides: Partial<DatabaseMessage> = {}): DatabaseMessage {
	const defaults = {
		id: 'tool-1',
		convId: 'conv-1',
		type: 'text',
		timestamp: Date.now(),
		role: MessageRole.TOOL,
		content: 'tool result',
		parent: null,
		children: [],
		toolCallId: 'call_1'
	};
	return { ...defaults, ...overrides } as DatabaseMessage;
}
// Covers section derivation for single-turn, multi-turn, pending and
// streaming tool-call scenarios.
describe('deriveAgenticSections', () => {
	it('returns empty array for assistant with no content', () => {
		const msg = makeAssistant({ content: '' });
		const sections = deriveAgenticSections(msg);
		expect(sections).toEqual([]);
	});
	it('returns text section for simple assistant message', () => {
		const msg = makeAssistant({ content: 'Hello world' });
		const sections = deriveAgenticSections(msg);
		expect(sections).toHaveLength(1);
		expect(sections[0].type).toBe(AgenticSectionType.TEXT);
		expect(sections[0].content).toBe('Hello world');
	});
	it('returns reasoning + text for message with reasoning', () => {
		// Reasoning lives in the dedicated reasoningContent field, not in content.
		const msg = makeAssistant({
			content: 'Answer is 4.',
			reasoningContent: 'Let me think...'
		});
		const sections = deriveAgenticSections(msg);
		expect(sections).toHaveLength(2);
		expect(sections[0].type).toBe(AgenticSectionType.REASONING);
		expect(sections[0].content).toBe('Let me think...');
		expect(sections[1].type).toBe(AgenticSectionType.TEXT);
	});
	it('single turn: assistant with tool calls and results', () => {
		const msg = makeAssistant({
			content: 'Let me check.',
			toolCalls: JSON.stringify([
				{ id: 'call_1', type: 'function', function: { name: 'search', arguments: '{"q":"test"}' } }
			])
		});
		// Tool result is matched to the call via toolCallId.
		const toolResult = makeToolMsg({
			toolCallId: 'call_1',
			content: 'Found 3 results'
		});
		const sections = deriveAgenticSections(msg, [toolResult]);
		expect(sections).toHaveLength(2);
		expect(sections[0].type).toBe(AgenticSectionType.TEXT);
		expect(sections[1].type).toBe(AgenticSectionType.TOOL_CALL);
		expect(sections[1].toolName).toBe('search');
		expect(sections[1].toolResult).toBe('Found 3 results');
	});
	it('single turn: pending tool call without result', () => {
		// No tool message yet -> the call is shown as pending.
		const msg = makeAssistant({
			toolCalls: JSON.stringify([
				{ id: 'call_1', type: 'function', function: { name: 'bash', arguments: '{}' } }
			])
		});
		const sections = deriveAgenticSections(msg, []);
		expect(sections).toHaveLength(1);
		expect(sections[0].type).toBe(AgenticSectionType.TOOL_CALL_PENDING);
		expect(sections[0].toolName).toBe('bash');
	});
	it('multi-turn: two assistant turns grouped as one session', () => {
		const assistant1 = makeAssistant({
			id: 'ast-1',
			content: 'Turn 1 text',
			toolCalls: JSON.stringify([
				{ id: 'call_1', type: 'function', function: { name: 'search', arguments: '{"q":"foo"}' } }
			])
		});
		const tool1 = makeToolMsg({ id: 'tool-1', toolCallId: 'call_1', content: 'result 1' });
		const assistant2 = makeAssistant({
			id: 'ast-2',
			content: 'Final answer based on results.'
		});
		// toolMessages contains both tool result and continuation assistant
		const sections = deriveAgenticSections(assistant1, [tool1, assistant2]);
		expect(sections).toHaveLength(3);
		// Turn 1
		expect(sections[0].type).toBe(AgenticSectionType.TEXT);
		expect(sections[0].content).toBe('Turn 1 text');
		expect(sections[1].type).toBe(AgenticSectionType.TOOL_CALL);
		expect(sections[1].toolName).toBe('search');
		expect(sections[1].toolResult).toBe('result 1');
		// Turn 2 (final)
		expect(sections[2].type).toBe(AgenticSectionType.TEXT);
		expect(sections[2].content).toBe('Final answer based on results.');
	});
	it('multi-turn: three turns with tool calls', () => {
		const assistant1 = makeAssistant({
			id: 'ast-1',
			content: '',
			toolCalls: JSON.stringify([
				{ id: 'call_1', type: 'function', function: { name: 'list_files', arguments: '{}' } }
			])
		});
		const tool1 = makeToolMsg({ id: 'tool-1', toolCallId: 'call_1', content: 'file1 file2' });
		const assistant2 = makeAssistant({
			id: 'ast-2',
			content: 'Reading file1...',
			toolCalls: JSON.stringify([
				{
					id: 'call_2',
					type: 'function',
					function: { name: 'read_file', arguments: '{"path":"file1"}' }
				}
			])
		});
		const tool2 = makeToolMsg({ id: 'tool-2', toolCallId: 'call_2', content: 'contents of file1' });
		const assistant3 = makeAssistant({
			id: 'ast-3',
			content: 'Here is the analysis.',
			reasoningContent: 'The file contains...'
		});
		const sections = deriveAgenticSections(assistant1, [tool1, assistant2, tool2, assistant3]);
		// Turn 1: tool_call (no text since content is empty)
		// Turn 2: text + tool_call
		// Turn 3: reasoning + text
		expect(sections).toHaveLength(5);
		expect(sections[0].type).toBe(AgenticSectionType.TOOL_CALL);
		expect(sections[0].toolName).toBe('list_files');
		expect(sections[1].type).toBe(AgenticSectionType.TEXT);
		expect(sections[1].content).toBe('Reading file1...');
		expect(sections[2].type).toBe(AgenticSectionType.TOOL_CALL);
		expect(sections[2].toolName).toBe('read_file');
		expect(sections[3].type).toBe(AgenticSectionType.REASONING);
		expect(sections[4].type).toBe(AgenticSectionType.TEXT);
		expect(sections[4].content).toBe('Here is the analysis.');
	});
	it('multi-turn: streaming tool calls on last turn', () => {
		const assistant1 = makeAssistant({
			toolCalls: JSON.stringify([
				{ id: 'call_1', type: 'function', function: { name: 'search', arguments: '{}' } }
			])
		});
		const tool1 = makeToolMsg({ toolCallId: 'call_1', content: 'result' });
		const assistant2 = makeAssistant({ id: 'ast-2', content: '' });
		// Arguments are intentionally truncated JSON — mid-stream state.
		const streamingToolCalls: ApiChatCompletionToolCall[] = [
			{ id: 'call_2', type: 'function', function: { name: 'write_file', arguments: '{"pa' } }
		];
		const sections = deriveAgenticSections(assistant1, [tool1, assistant2], streamingToolCalls);
		// Turn 1: tool_call
		// Turn 2 (streaming): streaming tool call
		expect(sections.some((s) => s.type === AgenticSectionType.TOOL_CALL)).toBe(true);
		expect(sections.some((s) => s.type === AgenticSectionType.TOOL_CALL_STREAMING)).toBe(true);
	});
});
// Covers agentic-content detection via the toolCalls field and via the
// presence of follow-up tool messages.
describe('hasAgenticContent', () => {
	it('returns false for plain assistant', () => {
		const msg = makeAssistant({ content: 'Just text' });
		expect(hasAgenticContent(msg)).toBe(false);
	});
	it('returns true when message has toolCalls', () => {
		const msg = makeAssistant({
			toolCalls: JSON.stringify([
				{ id: 'call_1', type: 'function', function: { name: 'test', arguments: '{}' } }
			])
		});
		expect(hasAgenticContent(msg)).toBe(true);
	});
	it('returns true when toolMessages are provided', () => {
		const msg = makeAssistant();
		const tool = makeToolMsg();
		expect(hasAgenticContent(msg, [tool])).toBe(true);
	});
	it('returns false for empty toolCalls JSON', () => {
		// '[]' parses to an empty array, which does not count as agentic content.
		const msg = makeAssistant({ toolCalls: '[]' });
		expect(hasAgenticContent(msg)).toBe(false);
	});
});

View File

@@ -1,17 +1,22 @@
import { describe, it, expect } from 'vitest';
import { AGENTIC_REGEX } from '$lib/constants/agentic';
import { LEGACY_AGENTIC_REGEX } from '$lib/constants/agentic';
// Mirror the logic in ChatService.stripReasoningContent so we can test it in isolation.
// The real function is private static, so we replicate the strip pipeline here.
function stripContextMarkers(content: string): string {
/**
* Tests for legacy marker stripping (used in migration).
* The new system does not embed markers in content - these tests verify
* the legacy regex patterns still work for the migration code.
*/
// Mirror the legacy stripping logic used during migration
function stripLegacyContextMarkers(content: string): string {
return content
.replace(AGENTIC_REGEX.REASONING_BLOCK, '')
.replace(AGENTIC_REGEX.REASONING_OPEN, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '');
.replace(new RegExp(LEGACY_AGENTIC_REGEX.REASONING_BLOCK.source, 'g'), '')
.replace(LEGACY_AGENTIC_REGEX.REASONING_OPEN, '')
.replace(new RegExp(LEGACY_AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK.source, 'g'), '')
.replace(LEGACY_AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '');
}
// A realistic complete tool call block as stored in message.content after a turn.
// A realistic complete tool call block as stored in old message.content
const COMPLETE_BLOCK =
'\n\n<<<AGENTIC_TOOL_CALL_START>>>\n' +
'<<<TOOL_NAME:bash_tool>>>\n' +
@@ -30,11 +35,10 @@ const OPEN_BLOCK =
'<<<TOOL_ARGS_END>>>\n' +
'partial output...';
describe('agentic marker stripping for context', () => {
describe('legacy agentic marker stripping (for migration)', () => {
it('strips a complete tool call block, leaving surrounding text', () => {
const input = 'Before.' + COMPLETE_BLOCK + 'After.';
const result = stripContextMarkers(input);
// markers gone; residual newlines between fragments are fine
const result = stripLegacyContextMarkers(input);
expect(result).not.toContain('<<<');
expect(result).toContain('Before.');
expect(result).toContain('After.');
@@ -42,7 +46,7 @@ describe('agentic marker stripping for context', () => {
it('strips multiple complete tool call blocks', () => {
const input = 'A' + COMPLETE_BLOCK + 'B' + COMPLETE_BLOCK + 'C';
const result = stripContextMarkers(input);
const result = stripLegacyContextMarkers(input);
expect(result).not.toContain('<<<');
expect(result).toContain('A');
expect(result).toContain('B');
@@ -51,19 +55,19 @@ describe('agentic marker stripping for context', () => {
it('strips an open/partial tool call block (no END marker)', () => {
const input = 'Lead text.' + OPEN_BLOCK;
const result = stripContextMarkers(input);
const result = stripLegacyContextMarkers(input);
expect(result).toBe('Lead text.');
expect(result).not.toContain('<<<');
});
it('does not alter content with no markers', () => {
const input = 'Just a normal assistant response.';
expect(stripContextMarkers(input)).toBe(input);
expect(stripLegacyContextMarkers(input)).toBe(input);
});
it('strips reasoning block independently', () => {
const input = '<<<reasoning_content_start>>>think hard<<<reasoning_content_end>>>Answer.';
expect(stripContextMarkers(input)).toBe('Answer.');
expect(stripLegacyContextMarkers(input)).toBe('Answer.');
});
it('strips both reasoning and agentic blocks together', () => {
@@ -71,11 +75,21 @@ describe('agentic marker stripping for context', () => {
'<<<reasoning_content_start>>>plan<<<reasoning_content_end>>>' +
'Some text.' +
COMPLETE_BLOCK;
expect(stripContextMarkers(input)).not.toContain('<<<');
expect(stripContextMarkers(input)).toContain('Some text.');
expect(stripLegacyContextMarkers(input)).not.toContain('<<<');
expect(stripLegacyContextMarkers(input)).toContain('Some text.');
});
it('empty string survives', () => {
expect(stripContextMarkers('')).toBe('');
expect(stripLegacyContextMarkers('')).toBe('');
});
it('detects legacy markers', () => {
expect(LEGACY_AGENTIC_REGEX.HAS_LEGACY_MARKERS.test('normal text')).toBe(false);
expect(
LEGACY_AGENTIC_REGEX.HAS_LEGACY_MARKERS.test('text<<<AGENTIC_TOOL_CALL_START>>>more')
).toBe(true);
expect(LEGACY_AGENTIC_REGEX.HAS_LEGACY_MARKERS.test('<<<reasoning_content_start>>>think')).toBe(
true
);
});
});

View File

@@ -1,196 +1,89 @@
import { describe, it, expect } from 'vitest';
import { AGENTIC_REGEX, REASONING_TAGS } from '$lib/constants/agentic';
import { ContentPartType } from '$lib/enums';
import { MessageRole } from '$lib/enums';
// Replicate ChatService.extractReasoningFromContent (private static)
function extractReasoningFromContent(
content: string | Array<{ type: string; text?: string }> | null | undefined
): string | undefined {
if (!content) return undefined;
/**
* Tests for the new reasoning content handling.
* In the new architecture, reasoning content is stored in a dedicated
* `reasoningContent` field on DatabaseMessage, not embedded in content with tags.
* The API sends it as `reasoning_content` on ApiChatMessageData.
*/
const extractFromString = (text: string): string => {
const parts: string[] = [];
const re = new RegExp(AGENTIC_REGEX.REASONING_EXTRACT.source);
let match = re.exec(text);
while (match) {
parts.push(match[1]);
text = text.slice(match.index + match[0].length);
match = re.exec(text);
}
return parts.join('');
};
if (typeof content === 'string') {
const result = extractFromString(content);
return result || undefined;
}
if (!Array.isArray(content)) return undefined;
const parts: string[] = [];
for (const part of content) {
if (part.type === ContentPartType.TEXT && part.text) {
const result = extractFromString(part.text);
if (result) parts.push(result);
}
}
return parts.length > 0 ? parts.join('') : undefined;
}
// Replicate ChatService.stripReasoningContent (private static)
function stripReasoningContent(
content: string | Array<{ type: string; text?: string }> | null | undefined
): typeof content {
if (!content) return content;
if (typeof content === 'string') {
return content
.replace(AGENTIC_REGEX.REASONING_BLOCK, '')
.replace(AGENTIC_REGEX.REASONING_OPEN, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '');
}
if (!Array.isArray(content)) return content;
return content.map((part) => {
if (part.type !== ContentPartType.TEXT || !part.text) return part;
return {
...part,
text: part.text
.replace(AGENTIC_REGEX.REASONING_BLOCK, '')
.replace(AGENTIC_REGEX.REASONING_OPEN, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_BLOCK, '')
.replace(AGENTIC_REGEX.AGENTIC_TOOL_CALL_OPEN, '')
describe('reasoning content in new structured format', () => {
it('reasoning is stored as separate field, not in content', () => {
// Simulate what the new chat store does
const message = {
content: 'The answer is 4.',
reasoningContent: 'Let me think: 2+2=4, basic arithmetic.'
};
});
}
// Simulate the message mapping logic from ChatService.sendMessage
function buildApiMessage(
content: string,
excludeReasoningFromContext: boolean
): { role: string; content: string; reasoning_content?: string } {
const cleaned = stripReasoningContent(content) as string;
const mapped: { role: string; content: string; reasoning_content?: string } = {
role: 'assistant',
content: cleaned
};
if (!excludeReasoningFromContext) {
const reasoning = extractReasoningFromContent(content);
if (reasoning) {
mapped.reasoning_content = reasoning;
// Content should be clean
expect(message.content).not.toContain('<<<');
expect(message.content).toBe('The answer is 4.');
// Reasoning in dedicated field
expect(message.reasoningContent).toBe('Let me think: 2+2=4, basic arithmetic.');
});
it('convertDbMessageToApiChatMessageData includes reasoning_content', () => {
// Simulate the conversion logic
const dbMessage = {
role: MessageRole.ASSISTANT,
content: 'The answer is 4.',
reasoningContent: 'Let me think: 2+2=4, basic arithmetic.'
};
const apiMessage: Record<string, unknown> = {
role: dbMessage.role,
content: dbMessage.content
};
if (dbMessage.reasoningContent) {
apiMessage.reasoning_content = dbMessage.reasoningContent;
}
}
return mapped;
}
// Helper: wrap reasoning the same way the chat store does during streaming
function wrapReasoning(reasoning: string, content: string): string {
return `${REASONING_TAGS.START}${reasoning}${REASONING_TAGS.END}${content}`;
}
describe('reasoning content extraction', () => {
it('extracts reasoning from tagged string content', () => {
const input = wrapReasoning('step 1, step 2', 'The answer is 42.');
const result = extractReasoningFromContent(input);
expect(result).toBe('step 1, step 2');
expect(apiMessage.content).toBe('The answer is 4.');
expect(apiMessage.reasoning_content).toBe('Let me think: 2+2=4, basic arithmetic.');
// No internal tags leak into either field
expect(apiMessage.content).not.toContain('<<<');
expect(apiMessage.reasoning_content).not.toContain('<<<');
});
it('returns undefined when no reasoning tags present', () => {
expect(extractReasoningFromContent('Just a normal response.')).toBeUndefined();
it('API message excludes reasoning when excludeReasoningFromContext is true', () => {
const dbMessage = {
role: MessageRole.ASSISTANT,
content: 'The answer is 4.',
reasoningContent: 'internal thinking'
};
const excludeReasoningFromContext = true;
const apiMessage: Record<string, unknown> = {
role: dbMessage.role,
content: dbMessage.content
};
if (!excludeReasoningFromContext && dbMessage.reasoningContent) {
apiMessage.reasoning_content = dbMessage.reasoningContent;
}
expect(apiMessage.content).toBe('The answer is 4.');
expect(apiMessage.reasoning_content).toBeUndefined();
});
it('returns undefined for null/empty input', () => {
expect(extractReasoningFromContent(null)).toBeUndefined();
expect(extractReasoningFromContent(undefined)).toBeUndefined();
expect(extractReasoningFromContent('')).toBeUndefined();
});
it('handles messages with no reasoning', () => {
const dbMessage = {
role: MessageRole.ASSISTANT,
content: 'No reasoning here.',
reasoningContent: undefined
};
it('extracts reasoning from content part arrays', () => {
const input = [
{
type: ContentPartType.TEXT,
text: wrapReasoning('thinking hard', 'result')
}
];
expect(extractReasoningFromContent(input)).toBe('thinking hard');
});
const apiMessage: Record<string, unknown> = {
role: dbMessage.role,
content: dbMessage.content
};
if (dbMessage.reasoningContent) {
apiMessage.reasoning_content = dbMessage.reasoningContent;
}
it('handles multiple reasoning blocks', () => {
const input =
REASONING_TAGS.START +
'block1' +
REASONING_TAGS.END +
'middle' +
REASONING_TAGS.START +
'block2' +
REASONING_TAGS.END +
'end';
expect(extractReasoningFromContent(input)).toBe('block1block2');
});
it('ignores non-text content parts', () => {
const input = [{ type: 'image_url', text: wrapReasoning('hidden', 'img') }];
expect(extractReasoningFromContent(input)).toBeUndefined();
});
});
describe('strip reasoning content', () => {
it('removes reasoning tags from string content', () => {
const input = wrapReasoning('internal thoughts', 'visible answer');
expect(stripReasoningContent(input)).toBe('visible answer');
});
it('removes reasoning from content part arrays', () => {
const input = [
{
type: ContentPartType.TEXT,
text: wrapReasoning('thoughts', 'answer')
}
];
const result = stripReasoningContent(input) as Array<{ type: string; text?: string }>;
expect(result[0].text).toBe('answer');
});
});
describe('API message building with reasoning preservation', () => {
  // Stored assistant message combining hidden reasoning with visible text.
  const storedContent = wrapReasoning('Let me think: 2+2=4, basic arithmetic.', 'The answer is 4.');

  it('preserves reasoning_content when excludeReasoningFromContext is false', () => {
    const msg = buildApiMessage(storedContent, false);
    expect(msg.content).toBe('The answer is 4.');
    expect(msg.reasoning_content).toBe('Let me think: 2+2=4, basic arithmetic.');
    // no internal tags leak into either field
    expect(msg.content).not.toContain('<<<');
    expect(msg.reasoning_content).not.toContain('<<<');
  });

  it('strips reasoning_content when excludeReasoningFromContext is true', () => {
    const msg = buildApiMessage(storedContent, true);
    expect(msg.content).toBe('The answer is 4.');
    expect(msg.reasoning_content).toBeUndefined();
  });

  it('handles content with no reasoning in both modes', () => {
    const plain = 'No reasoning here.';
    const msgPreserve = buildApiMessage(plain, false);
    const msgExclude = buildApiMessage(plain, true);
    expect(msgPreserve.content).toBe(plain);
    expect(msgPreserve.reasoning_content).toBeUndefined();
    expect(msgExclude.content).toBe(plain);
    expect(msgExclude.reasoning_content).toBeUndefined();
  });

  it('cleans agentic tool call blocks from content even when preserving reasoning', () => {
    // Content mixing a reasoning wrapper with an agentic tool-call marker block;
    // buildApiMessage must scrub every '<<<' marker from the visible content.
    const input =
      wrapReasoning('plan', 'text') +
      '\n\n<<<AGENTIC_TOOL_CALL_START>>>\n' +
      '<<<TOOL_NAME:bash>>>\n' +
      '<<<TOOL_ARGS_START>>>\n{}\n<<<TOOL_ARGS_END>>>\nout\n' +
      '<<<AGENTIC_TOOL_CALL_END>>>\n';
    const msg = buildApiMessage(input, false);
    expect(msg.content).not.toContain('<<<');
    expect(msg.reasoning_content).toBe('plan');
    // NOTE(review): removed two stray expectations that referenced an undefined
    // `apiMessage` variable — a merge-conflict leftover duplicating the
    // "no reasoning" test above with the wrong identifier; they would have
    // thrown a ReferenceError at runtime.
  });
});

View File

@@ -1,7 +1,6 @@
import tailwindcss from '@tailwindcss/vite';
import { sveltekit } from '@sveltejs/kit/vite';
import * as fflate from 'fflate';
import { readFileSync, writeFileSync, existsSync } from 'fs';
import { readFileSync, writeFileSync, existsSync, readdirSync, copyFileSync } from 'fs';
import { dirname, resolve } from 'path';
import { fileURLToPath } from 'url';
@@ -20,15 +19,13 @@ const GUIDE_FOR_FRONTEND = `
-->
`.trim();
const MAX_BUNDLE_SIZE = 2 * 1024 * 1024;
/**
* the maximum size of an embedded asset in bytes,
* e.g. maximum size of embedded font (see node_modules/katex/dist/fonts/*.woff2)
*/
const MAX_ASSET_SIZE = 32000;
/** public/index.html.gz minified flag */
/** public/index.html minified flag */
const ENABLE_JS_MINIFICATION = true;
function llamaCppBuildPlugin() {
@@ -40,7 +37,6 @@ function llamaCppBuildPlugin() {
setTimeout(() => {
try {
const indexPath = resolve('../public/index.html');
const gzipPath = resolve('../public/index.html.gz');
if (!existsSync(indexPath)) {
return;
@@ -61,26 +57,35 @@ function llamaCppBuildPlugin() {
content = content.replace(/\r/g, '');
content = GUIDE_FOR_FRONTEND + '\n' + content;
content = content.replace(/\/_app\/immutable\/bundle\.[^"]+\.js/g, './bundle.js');
content = content.replace(
/\/_app\/immutable\/assets\/bundle\.[^"]+\.css/g,
'./bundle.css'
);
const compressed = fflate.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 });
writeFileSync(indexPath, content, 'utf-8');
console.log('✓ Updated index.html');
compressed[0x4] = 0;
compressed[0x5] = 0;
compressed[0x6] = 0;
compressed[0x7] = 0;
compressed[0x9] = 0;
if (compressed.byteLength > MAX_BUNDLE_SIZE) {
throw new Error(
`Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` +
`Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.ts.\n`
);
// Copy bundle.*.js -> ../public/bundle.js
const immutableDir = resolve('../public/_app/immutable');
const bundleDir = resolve('../public/_app/immutable/assets');
if (existsSync(immutableDir)) {
const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/));
if (jsFiles.length > 0) {
copyFileSync(resolve(immutableDir, jsFiles[0]), resolve('../public/bundle.js'));
console.log(`✓ Copied ${jsFiles[0]} -> bundle.js`);
}
}
// Copy bundle.*.css -> ../public/bundle.css
if (existsSync(bundleDir)) {
const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/));
if (cssFiles.length > 0) {
copyFileSync(resolve(bundleDir, cssFiles[0]), resolve('../public/bundle.css'));
console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`);
}
}
writeFileSync(gzipPath, compressed);
console.log('✓ Created index.html.gz');
} catch (error) {
console.error('Failed to create gzip file:', error);
console.error('Failed to update index.html:', error);
}
}, 100);
}

View File

@@ -551,6 +551,8 @@ int main(int argc, char ** argv) {
params.sampling.top_k = 4;
params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, };
common_init();
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
return 1;
}
@@ -558,8 +560,6 @@ int main(int argc, char ** argv) {
const int n_parallel = params.n_parallel;
const int n_predict = params.n_predict;
common_init();
// init LLM
llama_backend_init();

View File

@@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL)
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
set(BORINGSSL_VERSION "0.20260211.0" CACHE STRING "BoringSSL version")
set(BORINGSSL_VERSION "0.20260327.0" CACHE STRING "BoringSSL version")
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")