chat : fix hunyuan auto-detection (#15114)

Signed-off-by: stevenkuang <stevenkuang@tencent.com>
CANN: add support for ACL Graph (#15065)
2026-05-04 00:34:21 +02:00 · 2025-08-06 11:48:30 +02:00 · 2025-08-06 14:12:42 +08:00 · 2025-08-05 16:26:38 -07:00
6 changed files with 281 additions and 50 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -179,6 +179,7 @@ jobs:
      - name: Test
        id: cmake_test
        run: |
+          export LLAMA_SET_ROWS=0
          cd build
          ctest -L main --verbose --timeout 900

@@ -437,6 +438,7 @@ jobs:
      - name: Test
        id: cmake_test
        run: |
+          export LLAMA_SET_ROWS=0
          cd build
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 3600
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -31,6 +31,13 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
 set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
 string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
 message(STATUS "CANN: SOC_VERSION =  ${SOC_VERSION}")
+option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)
+
+if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
+    message(FATAL_ERROR
+        "CANN Graph (ACL graph mode) is not supported on 310P devices. "
+        "Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
+endif()

 if (CANN_INSTALL_DIR)
    # Only Support Linux.
@@ -68,6 +75,13 @@ if (CANN_INSTALL_DIR)

    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")

+    if (USE_ACL_GRAPH)
+        target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
+        message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
+    else()
+        message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
+    endif()
+
    message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
    message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
 else()
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -337,6 +337,29 @@ private:
    int32_t device_;
 };

+#ifdef USE_ACL_GRAPH
+struct ggml_graph_node_properties {  // Snapshot of one ggml graph node, compared against later graphs.
+    void * node_address;  // node->data when the snapshot was taken
+    ggml_op node_op;  // node->op
+    int64_t ne[GGML_MAX_DIMS];  // copy of node->ne
+    size_t nb[GGML_MAX_DIMS];  // copy of node->nb
+    void * src_address[GGML_MAX_SRC];  // node->src[i]->data, or nullptr where src[i] is null
+    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];  // byte copy of node->op_params
+};
+
+struct ggml_cann_graph {  // Holds a captured ACL graph handle and the node snapshot used to validate it.
+    ~ggml_cann_graph() {  // NOTE(review): copy operations are not deleted; a copy would destroy the handle twice
+        if (graph != nullptr) {
+            aclmdlRIDestroy(graph);  // return status is not checked here
+        }
+    }
+
+    aclmdlRI graph = nullptr;  // nullptr while no captured graph is held
+
+    std::vector<ggml_graph_node_properties> ggml_graph_properties;  // indexed by cgraph node index
+};
+#endif  // USE_ACL_GRAPH
+
 /**
 * @brief Context for managing CANN backend operations.
 */
@@ -345,8 +368,13 @@ struct ggml_backend_cann_context {
    std::string name;                /**< Name of the device. */
    std::string description;         /**< Description of the device. */
    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
+#ifdef USE_ACL_GRAPH
+    /// Cached CANN ACL graph used for executing the current ggml computation graph.
+    std::unique_ptr<ggml_cann_graph> cann_graph;
+#endif
    cann_task_queue task_queue;
    bool async_mode;
+    bool support_set_rows;

    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */

@@ -362,6 +390,14 @@ struct ggml_backend_cann_context {
        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
            device, async_mode ? "ON" : "OFF");
+
+        support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
+        GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");
+
+        if (!support_set_rows) {
+            GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
+                    "Falling back to eager mode.\n", __func__);
+        }
    }

    /**
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2075,6 +2075,160 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
    ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
 }

+#ifdef USE_ACL_GRAPH
+/**
+ * @brief Populate the internal CANN graph node properties from the ggml computation graph.
+ *
+ * Copies each node's data address, op, ne/nb, source data addresses and op_params into the
+ * cached entries. Entries are indexed directly and never resized here (the caller sizes them).
+ *
+ * @param cann_ctx  The CANN backend context; cann_ctx->cann_graph is dereferenced unchecked.
+ * @param cgraph    The ggml computational graph.
+ */
+static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
+    for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
+        ggml_tensor * node = cgraph->nodes[node_idx];
+        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;  // no bounds check on node_idx
+        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
+
+        for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
+            cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
+            cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
+        }
+        for (int src = 0; src < GGML_MAX_SRC; src++) {
+            cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
+                node->src[src] ? node->src[src]->data : nullptr;  // absent sources are stored as nullptr
+        }
+        memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
+    }
+}
+
+/**
+ * @brief Check if a ggml tensor node matches a previously captured CANN graph node.
+ *
+ * Compares op, ne/nb, data address and source data addresses. Address checks are skipped for
+ * GGML_OP_VIEW nodes, and op_params are compared only for GGML_OP_SCALE nodes.
+ *
+ * @param node                  The current ggml tensor node.
+ * @param graph_node_properties The stored properties of a CANN graph node.
+ * @return true if every compared field matches; false otherwise.
+ */
+static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if (node->data != graph_node_properties->node_address &&
+           node->op != GGML_OP_VIEW) {  // VIEW nodes are exempt from the data-address check
+        return false;
+    }
+    if (node->op != graph_node_properties->node_op) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (node->ne[i] != graph_node_properties->ne[i]) {
+            return false;
+        }
+        if (node->nb[i] != graph_node_properties->nb[i]) {
+            return false;
+        }
+    }
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (node->src[i] &&  // null sources are skipped, so a source that became null is not detected here
+            node->src[i]->data != graph_node_properties->src_address[i] &&
+            node->op != GGML_OP_VIEW
+        ) {
+            return false;
+        }
+    }
+    if (node->op == GGML_OP_SCALE &&  // op_params of other ops are not compared
+        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
+        return false;
+    }
+    return true;
+}
+
+/**
+ * @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
+ *
+ * Returns true when the node count differs from the cached snapshot or any node fails the
+ * per-node comparison. Side effect: on a count mismatch the snapshot vector is resized.
+ *
+ * @param cann_ctx  The CANN backend context; cann_ctx->cann_graph is dereferenced unchecked.
+ * @param cgraph    The current ggml computation graph.
+ * @return true if an update is required; false otherwise.
+ */
+static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
+    // The number of nodes is different, so the graph needs to be reconstructed.
+    if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+        cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);  // sized here so entries can be indexed by node
+        return true;
+    }
+
+    // The number of nodes is the same; iterate over each node to check whether they match.
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        bool has_matching_properties = ggml_graph_node_has_matching_properties(
+            cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
+        if(!has_matching_properties) {
+            return true;  // first mismatch is enough; remaining nodes are not inspected
+        }
+    }
+    return false;
+}
+#endif  // USE_ACL_GRAPH
+
+/**
+ * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
+ *
+ * Graph mode on, update required: any previous graph is destroyed, the nodes are dispatched
+ * between aclmdlRICaptureBegin and aclmdlRICaptureEnd, and the new graph is then launched.
+ * Graph mode on, no update required: only the cached graph is launched.
+ * Graph mode off: nodes are dispatched one by one via ggml_cann_compute_forward().
+ *
+ * @param cann_ctx                 The CANN backend context.
+ * @param cgraph                   The ggml computation graph.
+ * @param use_cann_graph           Whether to use CANN graph execution (only read here).
+ * @param cann_graph_update_required Whether graph capture is needed (only read here).
+ */
+static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
+    bool & use_cann_graph, bool & cann_graph_update_required) {
+#ifdef USE_ACL_GRAPH
+    if (use_cann_graph && cann_graph_update_required) {
+        if (cann_ctx->cann_graph->graph != nullptr) {
+            ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));  // drop the stale graph before re-capturing
+            cann_ctx->cann_graph->graph = nullptr;
+        }
+        ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
+    }
+#endif // USE_ACL_GRAPH
+
+    // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
+    // With the use of CANN graphs, the execution will be performed by the graph launch.
+    if (!use_cann_graph || cann_graph_update_required) {
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            ggml_tensor * node = cgraph->nodes[i];
+
+            if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+                continue;  // empty tensors and these ops are not dispatched
+            }
+
+            bool ok = ggml_cann_compute_forward(*cann_ctx, node);
+            if (!ok) {
+                GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+            }
+            GGML_ASSERT(ok);  // a failed dispatch is logged above, then asserted on
+        }
+    }
+
+#ifdef USE_ACL_GRAPH
+    if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
+        ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
+    }
+
+    if (use_cann_graph) {
+        // Execute graph
+        ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));  // asynchronous; no stream sync here
+    }
+#endif // USE_ACL_GRAPH
+}
+
+
 /**
 * @brief Computes a computational graph using a CANN backend.
 *
@@ -2091,27 +2245,38 @@ static enum ggml_status ggml_backend_cann_graph_compute(
    ggml_backend_t backend, ggml_cgraph* cgraph) {
    ggml_backend_cann_context* cann_ctx =
        (ggml_backend_cann_context*)backend->context;
-
    ggml_cann_set_device(cann_ctx->device);
-    //release temp buffer create by set tensor.
    release_nz_workspace();
+#ifdef USE_ACL_GRAPH
+    bool use_cann_graph = true;
+    bool cann_graph_update_required = false;

-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor* node = cgraph->nodes[i];
-
-        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
-            continue;
-        }
-
-        bool ok = ggml_cann_compute_forward(*cann_ctx, node);
-
-        if (!ok) {
-            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
-                    node->name, ggml_op_name(node->op));
-        }
-        GGML_ASSERT(ok);
+    // check environment LLAMA_SET_ROWS
+    if (!cann_ctx->support_set_rows) {
+        use_cann_graph = false;
    }

+    if (use_cann_graph) {
+        if (cann_ctx->cann_graph == nullptr) {
+            cann_ctx->cann_graph.reset(new ggml_cann_graph());
+            cann_graph_update_required = true;
+        }
+
+        cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
+        set_ggml_graph_node_properties(cann_ctx, cgraph);
+    }
+#else
+    bool use_cann_graph = false;
+    bool cann_graph_update_required = false;
+#endif  // USE_ACL_GRAPH
+
+    evaluate_and_capture_cann_graph(
+        cann_ctx,
+        cgraph,
+        use_cann_graph,
+        cann_graph_update_required
+    );
+
    return GGML_STATUS_SUCCESS;
 }

@@ -2226,12 +2391,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // only support F32 and F16.
                return false;
            }
-
-            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
-                // unsupport dst is not contiguous.
-                return false;
-            }
-
            return true;
        } break;
        case GGML_OP_CONT: {
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -118,8 +118,6 @@ struct webgpu_context_struct {
    wgpu::Limits   limits;

    std::recursive_mutex mutex;
-    std::mutex           get_tensor_mutex;
-    std::mutex           init_mutex;

    bool device_init = false;

@@ -139,6 +137,8 @@ struct webgpu_context_struct {

    // Parameter buffers associated with the staged command buffers
    std::vector<webgpu_param_bufs> staged_param_bufs;
+
+    std::vector<wgpu::FutureWaitInfo> callback_futures;
 };

 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -221,25 +221,39 @@ static void ggml_webgpu_create_buffer(wgpu::Device &    device,

 /** WebGPU Actions */

+// Wait on the tracked submission futures if any exist; otherwise wait on a fresh queue work-done future
 static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) {
-    // Wait for the queue to finish processing all commands
-    ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
-                              wgpu::CallbackMode::AllowSpontaneous,
-                              [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
-                                  if (status != wgpu::QueueWorkDoneStatus::Success) {
-                                      GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data);
-                                  }
-                              }),
-                          UINT64_MAX);
+    std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
+    if (ctx->callback_futures.empty()) {
+        // no existing callbacks, wait on queue submission
+        ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
+                                  wgpu::CallbackMode::AllowSpontaneous,
+                                  [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
+                                      if (status != wgpu::QueueWorkDoneStatus::Success) {
+                                          GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
+                                      }
+                                  }),
+                              UINT64_MAX);
+    } else {
+        // existing callbacks, wait on them
+        ctx->instance.WaitAny(ctx->callback_futures.size(), ctx->callback_futures.data(), UINT64_MAX);  // NOTE(review): presumably returns once any one future completes - confirm
+        ctx->callback_futures.clear();  // all tracked futures are dropped, whether or not each one completed
+    }
 }

 static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
    std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
+    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_submit_queue()");
+    if (ctx->staged_command_bufs.empty()) {
+        // Nothing to submit
+        return;
+    }
    ctx->queue.Submit(ctx->staged_command_bufs.size(), ctx->staged_command_bufs.data());
    ctx->staged_command_bufs.clear();
    std::vector<webgpu_param_bufs> staged_param_bufs = std::move(ctx->staged_param_bufs);
+
    // Free the staged parameter buffers once the submission completes
-    ctx->queue.OnSubmittedWorkDone(
+    wgpu::Future f = ctx->queue.OnSubmittedWorkDone(
        wgpu::CallbackMode::AllowSpontaneous,
        [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
            if (status != wgpu::QueueWorkDoneStatus::Success) {
@@ -248,6 +262,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) {
            // Free the staged parameter buffers
            ctx->param_buf_pool.free_bufs(staged_param_bufs);
        });
+    ctx->callback_futures.push_back({ f });
 }

 static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
@@ -273,7 +288,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
                                                  std::vector<uint32_t>             params,
                                                  std::vector<wgpu::BindGroupEntry> bind_group_entries,
                                                  uint32_t                          wg_x,
-                                                  bool                              submit_imm = false) {
+                                                  bool                              submit_and_wait = false) {
    webgpu_param_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();

    ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
@@ -304,17 +319,18 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
    pass.DispatchWorkgroups(wg_x, 1, 1);
    pass.End();
    wgpu::CommandBuffer commands = encoder.Finish();
-    if (submit_imm) {
-        // Submit immediately
+    if (submit_and_wait) {
+        // Submit and wait immediately
        ctx->queue.Submit(1, &commands);
-        ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
-                                       [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
-                                           if (status != wgpu::QueueWorkDoneStatus::Success) {
-                                               GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
-                                                              message.data);
-                                           }
-                                           ctx->param_buf_pool.free_bufs({ params_bufs });
-                                       });
+        ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(
+                                  wgpu::CallbackMode::AllowSpontaneous,
+                                  [ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
+                                      if (status != wgpu::QueueWorkDoneStatus::Success) {
+                                          GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data);
+                                      }
+                                      ctx->param_buf_pool.free_bufs({ params_bufs });
+                                  }),
+                              UINT64_MAX);
    } else {
        // Lock the context mutex when pushing to the staging vectors.
        std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
@@ -579,6 +595,9 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
        // memset the remaining bytes
        ggml_backend_webgpu_buffer_memset(
            webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size);
+    } else {
+        // wait for WriteBuffer to complete
+        ggml_backend_webgpu_wait_on_submission(webgpu_ctx);
    }
 }

@@ -602,7 +621,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
        final_size = size + (4 - (size % 4));
    }

-    std::lock_guard<std::mutex> lock(webgpu_ctx->get_tensor_mutex);
+    std::lock_guard<std::recursive_mutex> lock(webgpu_ctx->mutex);

    if (webgpu_ctx->get_tensor_staging_buf == nullptr || webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) {
        // Create a new staging buffer if it doesn't exist or is too small
@@ -768,10 +787,11 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co
    webgpu_context                       webgpu_ctx = dev_ctx->webgpu_ctx;

    // Multiple threads may try to initialize the device
-    std::lock_guard<std::mutex> lock(webgpu_ctx->init_mutex);
+    std::lock_guard<std::recursive_mutex> lock(webgpu_ctx->mutex);
    if (!webgpu_ctx->device_init) {
        // Initialize device
-        std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16, wgpu::FeatureName::ImplicitDeviceSynchronization };
+        std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
+                                                             wgpu::FeatureName::ImplicitDeviceSynchronization };
        wgpu::DeviceDescriptor         dev_desc;
        dev_desc.requiredLimits       = &webgpu_ctx->limits;
        dev_desc.requiredFeatures     = required_features.data();
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -193,11 +193,11 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_LLAMA4;
    } else if (tmpl_contains("<|endofuserprompt|>")) {
        return LLM_CHAT_TEMPLATE_DOTS1;
-    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+    } else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
-    } else if (tmpl_contains("<｜hy_place▁holder▁no▁2｜>") && tmpl_contains("<｜hy_place▁holder▁no▁3｜>")) {
+    } else if (tmpl_contains("<｜hy_Assistant｜>") && tmpl_contains("<｜hy_place▁holder▁no▁3｜>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
    } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
        return LLM_CHAT_TEMPLATE_KIMI_K2;