clean up logging and timing

simplify replay submission
fix queue debug utils label
2026-06-26 13:39:42 +02:00 · 2026-06-17 13:47:53 +02:00 · 2026-06-17 13:32:30 +02:00 · 2026-06-17 13:20:52 +02:00 · 2026-06-12 16:43:11 +02:00 · 2026-06-12 16:38:46 +02:00
1 changed files with 336 additions and 16 deletions
@@ -1754,6 +1754,55 @@ struct ggml_vk_garbage_collector {
    std::vector<vk_context> contexts;
 };

+// --- Vulkan graph caching (command buffer reuse) ---
+
+struct vk_graph_node_properties {
+    ggml_tensor node;
+    void * src_data_ptrs[GGML_MAX_SRC];
+    int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
+    size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
+};
+
+struct vk_cached_submission {
+    vk_command_buffer * cmd_buffer;
+    bool signal_almost_ready_fence;
+};
+
+struct vk_cached_graph {
+    uint64_t uid = 0;
+    bool warmup_complete = false;
+    bool disabled = false;
+    int64_t last_used_time = 0;
+
+    std::vector<vk_graph_node_properties> node_props;
+
+    vk_command_pool cached_cmd_pool;
+    std::vector<vk::DescriptorPool> cached_descriptor_pools;
+    std::vector<vk::DescriptorSet> cached_descriptor_sets;
+
+    vk_buffer prealloc_x_at_capture, prealloc_y_at_capture;
+    vk_buffer prealloc_split_k_at_capture;
+
+    std::vector<vk_cached_submission> submissions;
+
+    bool uses_transfer_queue = false;
+
+    void destroy(vk::Device & dev) {
+        cached_cmd_pool.destroy(dev);
+        for (auto & pool : cached_descriptor_pools) {
+            dev.destroyDescriptorPool(pool);
+        }
+        cached_descriptor_pools.clear();
+        cached_descriptor_sets.clear();
+        submissions.clear();
+        prealloc_x_at_capture = nullptr;
+        prealloc_y_at_capture = nullptr;
+        prealloc_split_k_at_capture = nullptr;
+        warmup_complete = false;
+        uid = 0;
+    }
+};
+
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
 static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested = nullptr);
 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
@@ -2021,6 +2070,34 @@ struct ggml_backend_vk_context {
    vk_command_pool compute_cmd_pool;
    vk_command_pool transfer_cmd_pool;

+    // --- Vulkan graph caching ---
+    std::unordered_map<const void *, std::unique_ptr<vk_cached_graph>> vk_graphs;
+    int64_t last_graph_eviction_sweep = 0;
+    vk_cached_graph * active_capture_graph = nullptr;
+
+    static const bool disable_vk_graphs_due_to_env;
+
+    vk_cached_graph * vk_graph(const void * first_node_ptr) {
+        const int64_t time_now = ggml_time_us();
+        if (time_now - last_graph_eviction_sweep >= 5'000'000) {
+            last_graph_eviction_sweep = time_now;
+            for (auto it = vk_graphs.begin(); it != vk_graphs.end(); ) {
+                if (time_now - it->second->last_used_time >= 10'000'000) {
+                    it->second->destroy(device->device);
+                    it = vk_graphs.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+        }
+        auto it = vk_graphs.find(first_node_ptr);
+        if (it == vk_graphs.end()) {
+            it = vk_graphs.emplace(first_node_ptr, std::make_unique<vk_cached_graph>()).first;
+        }
+        it->second->last_used_time = time_now;
+        return it->second.get();
+    }
+
    // number of additional consecutive nodes that are being fused with the
    // node currently being processed
    int num_additional_fused_ops {};
@@ -2042,6 +2119,8 @@ struct ggml_backend_vk_context {
    int32_t query_idx {};
 };

+const bool ggml_backend_vk_context::disable_vk_graphs_due_to_env = (getenv("GGML_VK_DISABLE_GRAPHS") != nullptr);
+
 static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT

 static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
@@ -2596,7 +2675,14 @@ static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx,

 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {

-    if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
+    auto & pools = ctx->active_capture_graph
+        ? ctx->active_capture_graph->cached_descriptor_pools
+        : ctx->descriptor_pools;
+    auto & sets = ctx->active_capture_graph
+        ? ctx->active_capture_graph->cached_descriptor_sets
+        : ctx->descriptor_sets;
+
+    if (sets.size() >= ctx->pipeline_descriptor_set_requirements) {
        // Enough descriptors are available
        return;
    }
@@ -2604,29 +2690,29 @@ static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx
    vk_device& device = ctx->device;

    // Grow by 50% to avoid frequent allocations
-    uint32_t needed = std::max(3 * ctx->descriptor_sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements});
-    uint32_t to_alloc = needed - ctx->descriptor_sets.size();
-    uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
-    uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+    uint32_t needed = std::max(3 * sets.size() / 2, size_t{ctx->pipeline_descriptor_set_requirements});
+    uint32_t to_alloc = needed - sets.size();
+    uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
+    uint32_t pool_idx = sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;

    while (to_alloc > 0) {
        const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
        to_alloc -= alloc_count;
        pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;

-        if (pool_idx >= ctx->descriptor_pools.size()) {
+        if (pool_idx >= pools.size()) {
            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
            vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
-            ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
+            pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
        }

        std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
        for (uint32_t i = 0; i < alloc_count; i++) {
            layouts[i] = device->dsl;
        }
-        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
-        std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
-        ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
+        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pools[pool_idx], alloc_count, layouts.data());
+        std::vector<vk::DescriptorSet> new_sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+        sets.insert(sets.end(), new_sets.begin(), new_sets.end());

        pool_idx++;
    }
@@ -7319,12 +7405,15 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
    GGML_ASSERT(wg0 <= ctx->device->properties.limits.maxComputeWorkGroupCount[0] &&
                wg1 <= ctx->device->properties.limits.maxComputeWorkGroupCount[1] &&
                wg2 <= ctx->device->properties.limits.maxComputeWorkGroupCount[2]);
-    GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
+    auto & ds_vec = ctx->active_capture_graph
+        ? ctx->active_capture_graph->cached_descriptor_sets
+        : ctx->descriptor_sets;
+    GGML_ASSERT(ctx->descriptor_set_idx < ds_vec.size());
    GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
    GGML_ASSERT(pipeline->parameter_count == descriptor_buffer_infos.size());
    GGML_ASSERT(pipeline->push_constant_size == push_constant_size(push_constants));

-    vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
+    vk::DescriptorSet& descriptor_set = ds_vec[ctx->descriptor_set_idx++];
    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
    ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});

@@ -7348,13 +7437,13 @@ static void ggml_vk_ctx_end(vk_context& ctx) {
    ctx->s = nullptr;
 }

-static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
+static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx, bool one_time = true) {
    VK_LOG_DEBUG("ggml_vk_ctx_begin(" << device->name << ")");
    if (subctx->s != nullptr) {
        ggml_vk_ctx_end(subctx);
    }

-    subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
+    subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p, one_time) });
    subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
 }

@@ -7363,10 +7452,14 @@ static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) {
    if (!ctx->compute_ctx.expired()) {
        result = ctx->compute_ctx.lock();
    } else {
-        result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
+        vk_command_pool & pool = ctx->active_capture_graph
+            ? ctx->active_capture_graph->cached_cmd_pool
+            : ctx->compute_cmd_pool;
+        bool one_time = (ctx->active_capture_graph == nullptr);
+        result = ggml_vk_create_context(ctx, pool);

        ctx->compute_ctx = result;
-        ggml_vk_ctx_begin(ctx->device, result);
+        ggml_vk_ctx_begin(ctx->device, result, one_time);
    }

    if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
@@ -14471,6 +14564,25 @@ static void ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
            memset(mset.dst, mset.val, mset.n);
        }

+        // During capture, save submissions and detect staging copies
+        if (ctx->active_capture_graph) {
+            bool has_staging = !subctx->in_memcpys.empty() || !subctx->out_memcpys.empty() || !subctx->memsets.empty();
+            if (has_staging) {
+                ctx->active_capture_graph->disabled = true;
+            } else {
+                bool signal_ar = (almost_ready && !ctx->almost_ready_fence_pending);
+                for (auto & seq : subctx->seqs) {
+                    for (auto & submission : seq) {
+                        ctx->active_capture_graph->submissions.push_back({
+                            submission.buffer,
+                            signal_ar,
+                        });
+                        signal_ar = false;
+                    }
+                }
+            }
+        }
+
        if (almost_ready && !ctx->almost_ready_fence_pending) {
            ggml_vk_submit(subctx, ctx->almost_ready_fence);
            ctx->almost_ready_fence_pending = true;
@@ -14567,6 +14679,11 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
    ctx->device->device.destroyFence(ctx->fence);
    ctx->device->device.destroyFence(ctx->almost_ready_fence);

+    for (auto & [key, graph] : ctx->vk_graphs) {
+        graph->destroy(ctx->device->device);
+    }
+    ctx->vk_graphs.clear();
+
    for (auto& pool : ctx->descriptor_pools) {
        ctx->device->device.destroyDescriptorPool(pool);
    }
@@ -15670,10 +15787,162 @@ static int32_t find_first_set(uint32_t x) {
    return ret;
 }

+// --- Vulkan graph caching helpers ---
+
+static const void * ggml_vk_graph_get_key(ggml_cgraph * cgraph) {
+    return cgraph->nodes[0];
+}
+
+static bool ggml_vk_graph_update_required(ggml_cgraph * cgraph, vk_cached_graph * graph) {
+    // UID fast path
+    if (cgraph->uid != 0 && cgraph->uid == graph->uid) {
+        GGML_ASSERT((int)graph->node_props.size() == cgraph->n_nodes);
+        return false;
+    }
+    graph->uid = cgraph->uid;
+
+    bool changed = false;
+
+    if ((int)graph->node_props.size() != cgraph->n_nodes) {
+        changed = true;
+        graph->node_props.resize(cgraph->n_nodes);
+    }
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        vk_graph_node_properties prop = {};
+        memcpy(&prop.node, cgraph->nodes[i], sizeof(ggml_tensor));
+        for (int j = 0; j < GGML_MAX_SRC; ++j) {
+            if (cgraph->nodes[i]->src[j]) {
+                prop.src_data_ptrs[j] = cgraph->nodes[i]->src[j]->data;
+                memcpy(prop.src_ne[j], cgraph->nodes[i]->src[j]->ne, sizeof(prop.src_ne[j]));
+                memcpy(prop.src_nb[j], cgraph->nodes[i]->src[j]->nb, sizeof(prop.src_nb[j]));
+            }
+        }
+        if (changed || memcmp(&graph->node_props[i], &prop, sizeof(prop)) != 0) {
+            graph->node_props[i] = prop;
+            changed = true;
+        }
+    }
+
+    return changed;
+}
+
+static bool ggml_vk_graph_replay(ggml_backend_vk_context * ctx, vk_cached_graph * cached) {
+    VK_LOG_DEBUG("ggml_vk_graph_replay()");
+
+    // Validate prealloc buffer handles haven't changed since capture
+    if (cached->prealloc_x_at_capture  != ctx->prealloc_x ||
+        cached->prealloc_y_at_capture  != ctx->prealloc_y ||
+        cached->prealloc_split_k_at_capture != ctx->prealloc_split_k) {
+        cached->warmup_complete = false;
+        cached->submissions.clear();
+        return false;
+    }
+
+    ggml_vk_submit_transfer_ctx(ctx);
+
+    // Handle add_rms_partials memset if needed
+    if (ctx->prealloc_size_add_rms_partials && ctx->prealloc_add_rms_partials) {
+        ggml_vk_preallocate_buffers(ctx, nullptr);
+        vk_context memset_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
+        ggml_vk_ctx_begin(ctx->device, memset_ctx);
+        ggml_vk_buffer_memset_async(memset_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials);
+        ggml_vk_sync_buffers(ctx, memset_ctx);
+        ggml_vk_ctx_end(memset_ctx);
+        ggml_vk_submit(memset_ctx, {});
+        ctx->submit_pending = true;
+    }
+
+    for (size_t i = 0; i < cached->submissions.size(); i++) {
+        auto & sub = cached->submissions[i];
+
+        vk::Semaphore wait_sem;
+        uint64_t wait_val = 0;
+        vk::PipelineStageFlags wait_stage;
+        uint32_t wait_count = 0;
+
+        // First submission may need to wait on transfer semaphore
+        if (i == 0 && ctx->device->async_use_transfer_queue &&
+            ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) {
+            wait_sem = ctx->transfer_semaphore.s;
+            wait_val = ctx->transfer_semaphore.value;
+            wait_stage = ctx->device->compute_queue.stage_flags;
+            wait_count = 1;
+            ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value;
+        }
+
+        vk::Fence fence = {};
+        if (sub.signal_almost_ready_fence && !ctx->almost_ready_fence_pending) {
+            fence = ctx->almost_ready_fence;
+            ctx->almost_ready_fence_pending = true;
+        }
+
+        vk::SubmitInfo si{
+            wait_count, &wait_sem, &wait_stage,
+            1, &sub.cmd_buffer->buf,
+            0, nullptr,
+        };
+
+        if (wait_count > 0) {
+            vk::TimelineSemaphoreSubmitInfo tl_info{
+                1, &wait_val,
+                0, nullptr,
+            };
+            si.setPNext(&tl_info);
+
+            std::lock_guard<std::mutex> guard(queue_mutex);
+            ctx->device->compute_queue.queue.submit({ si }, fence);
+        } else {
+            std::lock_guard<std::mutex> guard(queue_mutex);
+            ctx->device->compute_queue.queue.submit({ si }, fence);
+        }
+        ctx->submit_pending = true;
+    }
+
+    return true;
+}
+
 static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

+    // --- Vulkan graph caching ---
+    bool use_vk_graph = false;
+    bool vk_graph_capture = false;
+    vk_cached_graph * cached = nullptr;
+
+    if (cgraph->n_nodes > 0 && !ctx->disable_vk_graphs_due_to_env
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        && false
+#endif
+        && !vk_perf_logger_enabled) {
+
+        const void * graph_key = ggml_vk_graph_get_key(cgraph);
+        cached = ctx->vk_graph(graph_key);
+
+        if (!cached->disabled) {
+            bool properties_changed = ggml_vk_graph_update_required(cgraph, cached);
+
+            if (!cached->warmup_complete) {
+                if (!properties_changed) {
+                    cached->warmup_complete = true;
+                    use_vk_graph = true;
+                    vk_graph_capture = true;
+                } else {
+                    VK_LOG_DEBUG("ggml_vulkan: graph " << graph_key << " (" << cgraph->n_nodes << " nodes) - warmup");
+                }
+            } else {
+                if (properties_changed) {
+                    cached->destroy(ctx->device->device);
+                    VK_LOG_DEBUG("ggml_vulkan: graph " << graph_key << " (" << cgraph->n_nodes << " nodes) - invalidated");
+                } else if (!cached->submissions.empty()) {
+                    use_vk_graph = true;
+                    vk_graph_capture = false;
+                }
+            }
+        }
+    }
+
    if (vk_instance.debug_utils_support) {
        vk::DebugUtilsLabelEXT dul = {};
        dul.pLabelName = "ggml_backend_vk_graph_compute";
@@ -15681,6 +15950,37 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
        vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast<VkDebugUtilsLabelEXT*>(&dul));
    }

+    // --- REPLAY PATH ---
+    if (use_vk_graph && !vk_graph_capture && cached) {
+        ctx->prealloc_size_add_rms_partials_offset = 0;
+        ctx->do_add_rms_partials = false;
+        ctx->do_add_rms_partials_offset_calculation = false;
+
+        if (ggml_vk_graph_replay(ctx, cached)) {
+            VK_LOG_DEBUG("ggml_vulkan: graph (" << cgraph->n_nodes << " nodes, " << cached->submissions.size() << " submissions) - replayed");
+
+            if (!ctx->device->support_async) {
+                ggml_vk_synchronize(ctx);
+                ggml_vk_graph_cleanup(ctx);
+            }
+
+            if (vk_instance.debug_utils_support) {
+                vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT(ctx->device->compute_queue.queue);
+            }
+            return GGML_STATUS_SUCCESS;
+        }
+        VK_LOG_DEBUG("ggml_vulkan: graph " << cgraph->nodes[0] << " (" << cgraph->n_nodes << " nodes) - replay failed (prealloc changed)");
+        use_vk_graph = false;
+    }
+
+    // --- CAPTURE SETUP ---
+    if (use_vk_graph && vk_graph_capture && cached) {
+        cached->submissions.clear();
+        cached->cached_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
+        cached->uses_transfer_queue = ctx->device->async_use_transfer_queue;
+        ctx->active_capture_graph = cached;
+    }
+
    ctx->prealloc_size_add_rms_partials_offset = 0;
    ctx->do_add_rms_partials = false;
    ctx->do_add_rms_partials_offset_calculation = false;
@@ -16064,10 +16364,30 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
        ctx->perf_logger->print_timings();
    }

+    // --- CAPTURE FINALIZATION ---
+    if (ctx->active_capture_graph) {
+        vk_cached_graph * cap = ctx->active_capture_graph;
+        ctx->active_capture_graph = nullptr;
+
+        if (cap->disabled) {
+            cap->destroy(ctx->device->device);
+            VK_LOG_DEBUG("ggml_vulkan: graph (" << cgraph->n_nodes << " nodes) - capture disabled (staging copies)");
+        } else {
+            cap->prealloc_x_at_capture = ctx->prealloc_x;
+            cap->prealloc_y_at_capture = ctx->prealloc_y;
+            cap->prealloc_split_k_at_capture = ctx->prealloc_split_k;
+            VK_LOG_DEBUG("ggml_vulkan: graph (" << cgraph->n_nodes << " nodes) - captured");
+        }
+    }
+
    if (!ctx->device->support_async) {
        ggml_vk_synchronize(ctx);
    }

+    if (vk_instance.debug_utils_support) {
+        vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT(ctx->device->compute_queue.queue);
+    }
+
    return GGML_STATUS_SUCCESS;

    UNUSED(backend);
Author	SHA1	Message	Date
Ruben Ortlam	42874dfd8f	clean up logging and timing	2026-06-17 13:47:53 +02:00
Ruben Ortlam	71d9373b82	simplify replay submission	2026-06-17 13:32:30 +02:00
Ruben Ortlam	f10a92dd17	fix queue debug utils label	2026-06-17 13:20:52 +02:00
Ruben Ortlam	9c1d7406b6	Revert "submit only twice for graph reuse" This reverts commit `e218a39018`.	2026-06-12 16:43:11 +02:00
Ruben Ortlam	e218a39018	submit only twice for graph reuse	2026-06-12 16:38:46 +02:00
Ruben Ortlam	ccceabc031	vulkan: capture and replay command buffers where possible	2026-06-12 16:38:46 +02:00