server : update readme + return json for "meta" field

server : passthrough the /models endpoint during loading
cmake: simplify vulkan shader test logic (#13263 )
2026-05-08 02:34:14 +02:00 · 2025-05-14 15:30:12 +03:00 · 2025-05-14 14:17:20 +03:00 · 2025-05-14 07:53:57 -03:00 · 2025-05-14 11:55:26 +02:00 · 2025-05-14 10:26:12 +02:00
22 changed files with 918 additions and 982 deletions
--- a/docs/multimodal.md
+++ b/docs/multimodal.md
@@ -31,7 +31,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload

 ## Pre-quantized models

-These are ready-to-use models, most of them come with `Q4_K_M` quantization by default.
+These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/ggml-org

 Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server`

--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4358,7 +4358,7 @@ static bool ggml_metal_encode_node(
                // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
                //       for now avoiding mainly to keep the number of templates/kernels a bit lower
                //       these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
-                if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
+                if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
                    switch (src1->type) {
                        case GGML_TYPE_F16:
                            {
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -3887,6 +3887,11 @@ kernel void kernel_flash_attn_ext_vec(
                sm[tiisg] = pm[ic + tiisg];
            }

+            // skip -INF blocks
+            if (simd_max(sm[tiisg]) == -INFINITY) {
+                continue;
+            }
+
            // Q*K^T
            {
                // each simdgroup processes 1 query and NE (NW/NL) head elements
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -15,6 +15,32 @@ function(detect_host_compiler)
    set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE)
 endfunction()

+# Function to test shader extension support
+# Parameters:
+#  EXTENSION_NAME - Name of the extension to test (e.g., "GL_EXT_integer_dot_product")
+#  TEST_SHADER_FILE - Path to the test shader file
+#  RESULT_VARIABLE - Name of the variable to set (ON/OFF) based on test result
+function(test_shader_extension_support EXTENSION_NAME TEST_SHADER_FILE RESULT_VARIABLE)
+    execute_process(
+        COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${TEST_SHADER_FILE}"
+        OUTPUT_VARIABLE glslc_output
+        ERROR_VARIABLE glslc_error
+    )
+
+    if (${glslc_error} MATCHES ".*extension not supported: ${EXTENSION_NAME}.*")
+        message(STATUS "${EXTENSION_NAME} not supported by glslc")
+        set(${RESULT_VARIABLE} OFF PARENT_SCOPE)
+    else()
+        message(STATUS "${EXTENSION_NAME} supported by glslc")
+        set(${RESULT_VARIABLE} ON PARENT_SCOPE)
+        add_compile_definitions(${RESULT_VARIABLE})
+
+        # Ensure the extension support is forwarded to vulkan-shaders-gen
+        list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -D${RESULT_VARIABLE}=ON)
+        set(VULKAN_SHADER_GEN_CMAKE_ARGS "${VULKAN_SHADER_GEN_CMAKE_ARGS}" PARENT_SCOPE)
+    endif()
+endfunction()
+
 if (Vulkan_FOUND)
    message(STATUS "Vulkan found")

@@ -23,69 +49,35 @@ if (Vulkan_FOUND)
                             ../../include/ggml-vulkan.h
                            )

-    # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
+    set(VULKAN_SHADER_GEN_CMAKE_ARGS
+        -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
+        -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
+    )

-    if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*")
-        message(STATUS "GL_KHR_cooperative_matrix not supported by glslc")
-        set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_KHR_cooperative_matrix supported by glslc")
-        set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
-    endif()
+    # Test all shader extensions
+    test_shader_extension_support(
+        "GL_KHR_cooperative_matrix"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp"
+        "GGML_VULKAN_COOPMAT_GLSLC_SUPPORT"
+    )

-    # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
+    test_shader_extension_support(
+        "GL_NV_cooperative_matrix2"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
+        "GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT"
+    )

-    if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*")
-        message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc")
-        set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")
-        set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
-    endif()
+    test_shader_extension_support(
+        "GL_EXT_integer_dot_product"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_integer_dot_support.comp"
+        "GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT"
+    )

-    # Compile a test shader to determine whether GL_EXT_integer_dot_product is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_integer_dot_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
-
-    if (${glslc_error} MATCHES ".*extension not supported: GL_EXT_integer_dot_product.*")
-        message(STATUS "GL_EXT_integer_dot_product not supported by glslc")
-        set(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_EXT_integer_dot_product supported by glslc")
-        set(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
-    endif()
-
-    # Compile a test shader to determine whether GL_EXT_bfloat16 is supported.
-    # If it's not, there will be an error to stderr.
-    # If it's supported, set a define to indicate that we should compile those shaders
-    execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_bfloat16_support.comp"
-                    OUTPUT_VARIABLE glslc_output
-                    ERROR_VARIABLE glslc_error)
-
-    if (${glslc_error} MATCHES ".*extension not supported: GL_EXT_bfloat16.*")
-        message(STATUS "GL_EXT_bfloat16 not supported by glslc")
-        set(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT OFF)
-    else()
-        message(STATUS "GL_EXT_bfloat16 supported by glslc")
-        set(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT ON)
-        add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
-    endif()
+    test_shader_extension_support(
+        "GL_EXT_bfloat16"
+        "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_bfloat16_support.comp"
+        "GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT"
+    )

    target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
    target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
@@ -124,16 +116,8 @@ if (Vulkan_FOUND)
        add_compile_definitions(GGML_VULKAN_RUN_TESTS)
    endif()

-    if (NOT CMAKE_CROSSCOMPILING)
-        add_subdirectory(vulkan-shaders)
-        if (MSVC)
-            foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
-                string(TOUPPER ${CONFIG} CONFIG)
-                set_target_properties(vulkan-shaders-gen PROPERTIES
-                    RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
-            endforeach()
-        endif()
-    else()
+    # Set up toolchain for host compilation whether cross-compiling or not
+    if (CMAKE_CROSSCOMPILING)
        if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
            set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
        else()
@@ -146,25 +130,31 @@ if (Vulkan_FOUND)
            configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/host-toolchain.cmake.in ${CMAKE_BINARY_DIR}/host-toolchain.cmake @ONLY)
            set(HOST_CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/host-toolchain.cmake)
        endif()
-        message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
-
-        include(ExternalProject)
-        # Native build through ExternalProject_Add
-        ExternalProject_Add(
-            vulkan-shaders-gen
-            SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
-            CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE}
-                    -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
-                    -DGGML_VULKAN_COOPMAT_GLSLC_SUPPORT=${GGML_VULKAN_COOPMAT_GLSLC_SUPPORT}
-                    -DGGML_VULKAN_COOPMAT2_GLSLC_SUPPORT=${GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT}
-                    -DGGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT=${GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT}
-                    -DGGML_VULKAN_BFLOAT16_GLSLC_SUPPORT=${GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT}
-            BUILD_COMMAND ${CMAKE_COMMAND} --build .
-            INSTALL_COMMAND ${CMAKE_COMMAND} --install .
-            INSTALL_DIR ${CMAKE_BINARY_DIR}
-        )
-        ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)
+    else()
+        # For non-cross-compiling, use empty toolchain (use host compiler)
+        set(HOST_CMAKE_TOOLCHAIN_FILE "")
    endif()
+
+    # Always use ExternalProject_Add approach
+    include(ExternalProject)
+
+    # Add toolchain file if cross-compiling
+    if (CMAKE_CROSSCOMPILING)
+        list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE})
+        message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
+    endif()
+
+    # Native build through ExternalProject_Add
+    ExternalProject_Add(
+        vulkan-shaders-gen
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
+        CMAKE_ARGS ${VULKAN_SHADER_GEN_CMAKE_ARGS}
+        BUILD_COMMAND ${CMAKE_COMMAND} --build .
+        INSTALL_COMMAND ${CMAKE_COMMAND} --install .
+        INSTALL_DIR ${CMAKE_BINARY_DIR}
+    )
+    ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)
+
    set (_ggml_vk_host_suffix $<IF:$<STREQUAL:${CMAKE_HOST_SYSTEM_NAME},Windows>,.exe,>)
    set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix})
    set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
@@ -175,9 +165,8 @@ if (Vulkan_FOUND)
    file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
    set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen)

-    if (CMAKE_CROSSCOMPILING)
-        set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)
-    endif()
+    # Add build and install dependencies for all builds
+    set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)

    add_custom_command(
        OUTPUT ${_ggml_vk_header}
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -288,6 +288,9 @@ struct vk_device_struct {
    bool coopmat_acc_f32_support {};
    bool coopmat_acc_f16_support {};
    bool coopmat_bf16_support {};
+    bool coopmat_support_16x16x16_f16acc {};
+    bool coopmat_support_16x16x16_f32acc {};
+    bool coopmat1_fa_support {};
    uint32_t coopmat_m;
    uint32_t coopmat_n;
    uint32_t coopmat_k;
@@ -410,6 +413,13 @@ struct vk_device_struct {
    vk_pipeline pipeline_flash_attn_f32_f16_D128_cm2[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D256_cm2[GGML_TYPE_COUNT][2][2][2];

+    vk_pipeline pipeline_flash_attn_f32_f16_D64_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D80_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D96_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D112_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D128_cm1[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D256_cm1[GGML_TYPE_COUNT][2][2][2];
+
    vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2];
@@ -1588,19 +1598,36 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
    );
 }

+enum FaCodePath {
+    FA_SCALAR,
+    FA_COOPMAT1,
+    FA_COOPMAT2,
+};
+
 // number of rows/cols for flash attention shader
 static constexpr uint32_t flash_attention_num_small_rows = 32;
 static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
 static constexpr uint32_t scalar_flash_attention_num_large_rows = 8;

-static uint32_t get_fa_num_small_rows(bool scalar) {
-    return scalar ? scalar_flash_attention_num_small_rows : flash_attention_num_small_rows;
+// The FA coopmat1 shader assumes 16x16x16 matrix multiply support.
+// 128 threads split into four subgroups, each subgroup does 1/4
+// of the Bc dimension.
+static constexpr uint32_t coopmat1_flash_attention_num_large_rows = 16;
+static constexpr uint32_t scalar_flash_attention_Bc = 64;
+static constexpr uint32_t scalar_flash_attention_workgroup_size = 128;
+
+static uint32_t get_fa_num_small_rows(FaCodePath path) {
+    if (path == FA_COOPMAT2) {
+        return flash_attention_num_small_rows;
+    } else {
+        return scalar_flash_attention_num_small_rows;
+    }
 }

-static std::array<uint32_t, 2> fa_rows_cols(bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
+static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
    GGML_UNUSED(clamp);

-    if (scalar) {
+    if (path == FA_SCALAR) {
        if (small_rows) {
            return {scalar_flash_attention_num_small_rows, 64};
        } else {
@@ -1608,9 +1635,17 @@ static std::array<uint32_t, 2> fa_rows_cols(bool scalar, uint32_t D, uint32_t cl
        }
    }

+    if (path == FA_COOPMAT1) {
+        if (small_rows) {
+            return {scalar_flash_attention_num_small_rows, scalar_flash_attention_Bc};
+        } else {
+            return {coopmat1_flash_attention_num_large_rows, scalar_flash_attention_Bc};
+        }
+    }
+
    // small rows, large cols
    if (small_rows) {
-        return {get_fa_num_small_rows(scalar), 32};
+        return {get_fa_num_small_rows(FA_COOPMAT2), 32};
    }

    // small cols to reduce register count
@@ -1907,17 +1942,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
                                      parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
    };

-    auto const &fa_wg_denoms = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
-        return {fa_rows_cols(scalar, D, clamp, type, small_rows)[0], 1, 1};
+    auto const &fa_wg_denoms = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
+        return {fa_rows_cols(path, D, clamp, type, small_rows)[0], 1, 1};
    };

-    auto const &fa_spec_constants = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
+    auto const &fa_spec_constants = [&](FaCodePath path, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
        // For large number of rows, 128 invocations seems to work best.
        // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
        // can't use 256 for D==80.
        // For scalar, use 128 (arbitrary)
-        uint32_t wg_size = scalar ? 128 : ((small_rows && (D % 32) == 0) ? 256 : 128);
-        auto rows_cols = fa_rows_cols(scalar, D, clamp, type, small_rows);
+        uint32_t wg_size = (path == FA_SCALAR || path == FA_COOPMAT1)
+                            ? scalar_flash_attention_workgroup_size
+                            : ((small_rows && (D % 32) == 0) ? 256 : 128);
+        auto rows_cols = fa_rows_cols(path, D, clamp, type, small_rows);

        // D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it.
        // D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader.
@@ -1929,36 +1966,43 @@ static void ggml_vk_load_shaders(vk_device& device) {
        return {wg_size, rows_cols[0], rows_cols[1], (D), clamp, D_split};
    };

-#define CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, D) \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true);     \
+#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, D) \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,false), fa_spec_constants(FAPATH, D,1,TYPE,false), 1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,false), fa_spec_constants(FAPATH, D,0,TYPE,false), fa_rows_cols(FAPATH,D,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true),   1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true),   fa_rows_cols(FAPATH,D,0,TYPE,true)[1],  true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,1,TYPE,true), fa_spec_constants(FAPATH, D,1,TYPE,true),   1,                                      true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, D,0,TYPE,true), fa_spec_constants(FAPATH, D,0,TYPE,true),   fa_rows_cols(FAPATH,D,0,TYPE,true)[1],  true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0));     \

-#define CREATE_FA(TYPE, NAMELC, SCALAR, SUFFIX) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 64) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 80) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 96) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 112) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 128) \
-        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 256)
+#define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128) \
+        CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256)

-    CREATE_FA(GGML_TYPE_F16, f16, true, )
-    CREATE_FA(GGML_TYPE_Q4_0, q4_0, true, )
-    CREATE_FA(GGML_TYPE_Q8_0, q8_0, true, )
+    CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
+    CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
+    CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, )
+#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    if (device->coopmat1_fa_support) {
+        CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT1, _cm1)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT1, _cm1)
+    }
+#endif
 #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    if (device->coopmat2) {
-        CREATE_FA(GGML_TYPE_F16, f16, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q4_0, q4_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q4_1, q4_1, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q5_0, q5_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q5_1, q5_1, false, _cm2)
-        CREATE_FA(GGML_TYPE_Q8_0, q8_0, false, _cm2)
-        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, false, _cm2)
+        CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT2, _cm2)
+        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT2, _cm2)
    }
 #endif
 #undef CREATE_FA2
@@ -2041,17 +2085,17 @@ static void ggml_vk_load_shaders(vk_device& device) {
        // Create 6 variants, {s,m,l}x{unaligned,aligned}
 #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true);   \
        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true);   \
        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true);   \
        if (device->mul_mat ## ID ## _l[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true);   \
        if (device->mul_mat ## ID ## _m[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true);   \
        if (device->mul_mat ## ID ## _s[TYPE]) \
-            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true);   \
+            ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true);   \

        // Create 2 variants, {f16,f32} accumulator
 #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
@@ -3009,6 +3053,11 @@ static vk_device ggml_vk_get_device(size_t idx) {

 #if defined(VK_KHR_cooperative_matrix)
        device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;
+
+        // coopmat1 fa shader currently assumes 32 invocations per subgroup
+        device->coopmat1_fa_support = device->coopmat_support && device->subgroup_require_full_support &&
+                                      device->subgroup_size_control && device->subgroup_min_size <= 32 &&
+                                      device->subgroup_max_size >= 32;
 #endif

        if (coopmat2_support) {
@@ -3143,6 +3192,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                            // Only enable if shape is identical
                            device->coopmat_acc_f32_support = true;
                        }
+                        if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) {
+                            device->coopmat_support_16x16x16_f32acc = true;
+                        }
                    } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 &&
                               (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) {
                        // coopmat sizes not set yet
@@ -3155,6 +3207,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
                            // Only enable if shape is identical
                            device->coopmat_acc_f16_support = true;
                        }
+                        if (prop.MSize == 16 && prop.NSize == 16 && prop.KSize == 16) {
+                            device->coopmat_support_16x16x16_f16acc = true;
+                        }
                    }
                } else if ((vk::ComponentTypeKHR)prop.AType      == vk::ComponentTypeKHR::eSint8 &&
                           (vk::ComponentTypeKHR)prop.BType      == vk::ComponentTypeKHR::eSint8 &&
@@ -5688,6 +5743,36 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
    }
 }

+static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, const uint32_t D, bool f32acc) {
+    // Needs to be kept up to date on shader changes
+    const uint32_t wg_size = scalar_flash_attention_workgroup_size;
+    const uint32_t Br = scalar_flash_attention_num_large_rows;
+    const uint32_t Bc = scalar_flash_attention_Bc;
+
+    const uint32_t acctype = f32acc ? 4 : 2;
+    const uint32_t f16vec4 = 8;
+
+    const uint32_t tmpsh = wg_size * sizeof(float);
+    const uint32_t tmpshv4 = wg_size * 4 * acctype;
+
+    const uint32_t Qf = Br * (D / 4 + 2) * f16vec4;
+
+    const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br;
+    const uint32_t sfsh = Bc * sfshstride * acctype;
+
+    const uint32_t kshstride = D / 4 + 2;
+    const uint32_t ksh = Bc * kshstride * f16vec4;
+
+    const uint32_t slope = Br * sizeof(float);
+
+    const uint32_t total_size = tmpsh + tmpshv4 + Qf + sfsh + ksh + slope;
+    const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize;
+
+    VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(D=" << D << ", f32acc=" << f32acc << ", total_size=" << total_size << ", supported=" << supported);
+
+    return supported;
+}
+
 static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) {
    VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3];
    std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3];
@@ -5738,7 +5823,19 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    assert(q->type == GGML_TYPE_F32);
    assert(k->type == v->type);

-    bool scalar = !ctx->device->coopmat2;
+    FaCodePath path = ctx->device->coopmat2 ? FA_COOPMAT2 :
+                      ctx->device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR;
+
+    if (path == FA_COOPMAT1) {
+        const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) ||
+                                             (dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc);
+
+        const bool coopmat_shmem_supported = ggml_vk_flash_attn_coopmat_shmem_support(ctx->device, D, dst->op_params[3] == GGML_PREC_F32);
+
+        if (!coopmat_shape_supported || !coopmat_shmem_supported) {
+            path = FA_SCALAR;
+        }
+    }

    uint32_t gqa_ratio = 1;
    uint32_t qk_ratio = neq2 / nek2;
@@ -5746,9 +5843,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    uint32_t workgroups_y = (uint32_t)neq2;
    uint32_t workgroups_z = (uint32_t)neq3;

-    // For scalar FA, we can use the "large" size to accommodate qga.
-    // For coopmat FA, we always use the small size (which is still pretty large for gqa).
-    const uint32_t max_gqa = scalar ? scalar_flash_attention_num_large_rows : get_fa_num_small_rows(false);
+    // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
+    // For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
+    uint32_t max_gqa;
+    switch (path) {
+    case FA_SCALAR:
+    case FA_COOPMAT1:
+        // We may switch from coopmat1 to scalar, so use the scalar limit for both
+        max_gqa = scalar_flash_attention_num_large_rows;
+        break;
+    case FA_COOPMAT2:
+        max_gqa = get_fa_num_small_rows(FA_COOPMAT2);
+        break;
+    default:
+        GGML_ASSERT(0);
+    }

    if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa &&
        qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
@@ -5761,11 +5870,16 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    }

    vk_pipeline *pipelines;
-    // XXX TODO other backends may be changing accumulator precision to default to f32 soon
-    bool f32acc = scalar || dst->op_params[3] == GGML_PREC_F32;
-    bool small_rows = N <= get_fa_num_small_rows(scalar);
+    bool small_rows = N <= get_fa_num_small_rows(path);

-    if (scalar) {
+    if (small_rows && path == FA_COOPMAT1) {
+        path = FA_SCALAR;
+    }
+
+    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
+
+    switch (path) {
+    case FA_SCALAR:
        switch (D) {
        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break;
        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break;
@@ -5777,7 +5891,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
            GGML_ASSERT(!"unsupported D value");
            return;
        }
-    } else {
+        break;
+    case FA_COOPMAT1:
+        switch (D) {
+        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm1[k->type][f32acc][small_rows][0]; break;
+        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm1[k->type][f32acc][small_rows][0]; break;
+        case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm1[k->type][f32acc][small_rows][0]; break;
+        case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm1[k->type][f32acc][small_rows][0]; break;
+        case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm1[k->type][f32acc][small_rows][0]; break;
+        case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm1[k->type][f32acc][small_rows][0]; break;
+        default:
+            GGML_ASSERT(!"unsupported D value");
+            return;
+        }
+        break;
+    case FA_COOPMAT2:
        switch (D) {
        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm2[k->type][f32acc][small_rows][0]; break;
        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm2[k->type][f32acc][small_rows][0]; break;
@@ -5789,6 +5917,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
            GGML_ASSERT(!"unsupported D value");
            return;
        }
+        break;
+    default:
+        GGML_ASSERT(0);
    }
    assert(pipelines);

--- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -5,18 +5,35 @@ find_package (Threads REQUIRED)

 if (GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+    message(STATUS "Enabling coopmat glslc support")
 endif()
 if (GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    message(STATUS "Enabling coopmat2 glslc support")
 endif()
 if (GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
+    message(STATUS "Enabling dot glslc support")
 endif()
 if (GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
    add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
+    message(STATUS "Enabling bfloat16 glslc support")
 endif()
+
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
+
+# Configure output directories for MSVC builds
+if(MSVC)
+    # Get the main project's runtime output directory if possible
+    if(DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+        foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
+            string(TOUPPER ${CONFIG} CONFIG)
+            set_target_properties(${TARGET} PROPERTIES
+                RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+        endforeach()
+    endif()
+endif()
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
@@ -12,6 +12,7 @@

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

+layout (constant_id = 0) const uint32_t WorkGroupSize = 128;
 layout (constant_id = 1) const uint32_t Br = 1;
 layout (constant_id = 2) const uint32_t Bc = 32;
 layout (constant_id = 3) const uint32_t D = 32;
@@ -19,7 +20,7 @@ layout (constant_id = 3) const uint32_t D = 32;
 layout (constant_id = 5) const uint32_t D_split = 16;
 const uint32_t D_per_thread = D / D_split;

-const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split;
+const uint32_t cols_per_iter = WorkGroupSize / D_split;
 const uint32_t cols_per_thread = Bc / cols_per_iter;

 layout (push_constant) uniform parameter {
@@ -134,8 +135,8 @@ ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const i
    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
 }

-shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
-shared vec4 tmpshv4[gl_WorkGroupSize.x];
+shared FLOAT_TYPE tmpsh[WorkGroupSize];
+shared vec4 tmpshv4[WorkGroupSize];

 shared float masksh[Bc][Br];
 shared vec4 Qf[Br][D / 4];
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
@@ -0,0 +1,506 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_memory_scope_semantics : enable
+#extension GL_KHR_cooperative_matrix : enable
+
+#include "types.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (constant_id = 1) const uint32_t Br = 1;
+layout (constant_id = 2) const uint32_t Bc = 32;
+layout (constant_id = 3) const uint32_t D = 32;
+
+layout (constant_id = 5) const uint32_t D_split = 16;
+
+const uint32_t D_per_thread = D / D_split;
+const uint32_t row_split = 4;
+const uint32_t rows_per_thread = Br / row_split;
+const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split / row_split;
+const uint32_t cols_per_thread = Bc / cols_per_iter;
+
+layout (push_constant) uniform parameter {
+    uint32_t N;
+    uint32_t KV;
+
+    uint32_t ne1;
+    uint32_t ne2;
+    uint32_t ne3;
+
+    uint32_t neq2;
+    uint32_t neq3;
+    uint32_t nek2;
+    uint32_t nek3;
+    uint32_t nev2;
+    uint32_t nev3;
+    uint32_t nem1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+    uint32_t nb21;
+    uint32_t nb22;
+    uint32_t nb23;
+    uint32_t nb31;
+
+    float scale;
+    float max_bias;
+    float logit_softcap;
+
+    uint32_t mask;
+    uint32_t n_head_log2;
+    float m0;
+    float m1;
+
+    uint32_t gqa_ratio;
+    uint32_t split_kv;
+    uint32_t k_num;
+} p;
+
+layout (binding = 0) readonly buffer Q {float data_q[];};
+layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
+layout (binding = 1) readonly buffer K {float16_t data_k[];};
+layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
+layout (binding = 2) readonly buffer V {float16_t data_v[];};
+layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
+layout (binding = 3) readonly buffer M {float16_t data_m[];};
+layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
+
+#if defined(A_TYPE_PACKED16)
+#define BINDING_IDX_K 0
+#define BINDING_IDX_V 1
+layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
+#endif
+
+#if defined(DATA_A_Q4_0)
+#define BLOCK_BYTE_SIZE 18
+
+vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
+    uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
+    uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
+    uint shift = (iqs & 0x10) >> 2;
+    vui_lo >>= shift;
+    vui_hi >>= shift;
+
+    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
+}
+#endif
+
+#if defined(DATA_A_Q8_0)
+#define BLOCK_BYTE_SIZE 34
+vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
+    const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
+    const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
+
+    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
+}
+#endif
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// Store the output when doing grouped query attention.
+// Rows index by Q's dimension 2, and the first N rows are valid.
+D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+{
+    uint32_t offset = (iq2 + r) * D + c;
+    data_o[o_offset + offset] = D_TYPE(elem);
+    return elem;
+}
+
+// Store column zero. This is used to save per-row m and L values for split_k.
+ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+{
+    if (r < N && c == 0) {
+        uint32_t offset = iq2 + r;
+        data_o[o_offset + offset] = D_TYPE(elem);
+    }
+    return elem;
+}
+
+// Load the slope matrix, indexed by Q's dimension 2.
+ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
+{
+    const uint32_t h = iq2 + (r % p.gqa_ratio);
+
+    const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
+    const int      exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
+
+    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
+}
+
+// These need to be supported N,M values for a MatBc x MatBr x 16 coopmatmuladd
+const uint32_t MatBr = 16;
+const uint32_t MatBc = 16;
+
+shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
+shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x];
+
+const uint32_t qstride = D / 4 + 2; // in units of f16vec4
+shared f16vec4 Qf[Br * qstride];
+
+// Avoid padding for D==256 to make it fit in 48KB shmem.
+const uint32_t sfshstride = (D <= 128) ? (Br + 8) : Br;
+shared ACC_TYPE sfsh[Bc * sfshstride];
+
+const uint32_t kshstride = D / 4 + 2; // in units of f16vec4
+shared f16vec4 ksh[Bc * kshstride];
+
+shared float slope[Br];
+
+void main() {
+#ifdef NEEDS_INIT_IQ_SHMEM
+    init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+    const uint32_t tid = gl_LocalInvocationIndex;
+    const uint32_t N = p.N;
+    const uint32_t KV = p.KV;
+
+    const uint32_t threads_per_rowgroup = gl_WorkGroupSize.x / row_split;
+    const uint32_t row_tid = gl_LocalInvocationIndex / threads_per_rowgroup;
+    const uint32_t d_tid = gl_LocalInvocationIndex % D_split;
+    const uint32_t col_tid = (gl_LocalInvocationIndex % threads_per_rowgroup) / D_split;
+
+#define tile_row(r) (row_tid * rows_per_thread + (r))
+
+    uint32_t i = gl_WorkGroupID.x;
+    uint32_t split_k_index = 0;
+
+    if (p.k_num > 1) {
+        i = 0;
+        split_k_index = gl_WorkGroupID.x;
+    }
+
+    const uint32_t Tr = CEIL_DIV(N, Br);
+
+    const uint32_t start_j = split_k_index * p.split_kv / Bc;
+    const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
+
+    // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
+    // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
+    const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio;
+    const uint32_t iq3 = gl_WorkGroupID.z;
+
+    // broadcast factors
+    const uint32_t rk2 = p.neq2/p.nek2;
+    const uint32_t rk3 = p.neq3/p.nek3;
+
+    const uint32_t rv2 = p.neq2/p.nev2;
+    const uint32_t rv3 = p.neq3/p.nev3;
+
+    // k indices
+    const uint32_t ik3 = iq3 / rk3;
+    const uint32_t ik2 = iq2 / rk2;
+
+    // v indices
+    const uint32_t iv3 = iq3 / rv3;
+    const uint32_t iv2 = iq2 / rv2;
+
+    // nb?1 are already divided by the type size and are in units of elements.
+    // When using grouped query attention, Q is indexed by iq2, so the stride
+    // should be nb02 (which is in bytes).
+    uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
+    uint32_t k_stride = p.nb11;
+    uint32_t v_stride = p.nb21;
+    // When using grouped query attention, all rows use the same mask (stride 0).
+    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
+    // that prevents the compiler from folding the "&" through the select
+    // and breaking the alignment detection.
+    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
+
+    uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
+
+    [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) {
+        uint32_t d = (idx + tid) % (D / 4);
+        uint32_t r = (idx + tid) / (D / 4);
+        if (r < Br && d < D / 4 &&
+            i * Br + r < N) {
+            Qf[r * qstride + d] = f16vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d] * p.scale);
+        }
+    }
+    barrier();
+
+    ACC_TYPEV4 Of[rows_per_thread][D_per_thread / 4];
+    [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            Of[r][d] = ACC_TYPEV4(0.0);
+        }
+    }
+
+    float Lf[rows_per_thread], Mf[rows_per_thread];
+
+    // Use -FLT_MAX/2 rather than -inf to reduce the possibility of NaNs, e.g. when computing Mold-M.
+    const float NEG_FLT_MAX_OVER_2 = uintBitsToFloat(0xFEFFFFFF);
+
+    [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+        Lf[r] = 0;
+        Mf[r] = NEG_FLT_MAX_OVER_2;
+    }
+
+    // ALiBi
+    if (p.max_bias > 0.0f) {
+        if (tid < Br) {
+            uint r = tid;
+            slope[r] = perElemOpComputeSlope(r, col_tid, ACC_TYPE(0), iq2);
+        }
+        barrier();
+    } else {
+        if (tid < Br) {
+            uint r = tid;
+            slope[r] = 1.0;
+        }
+        barrier();
+    }
+
+#if BLOCK_SIZE > 1
+    uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / BLOCK_BYTE_SIZE;
+    uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / BLOCK_BYTE_SIZE;
+#else
+    uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
+    uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
+#endif
+
+    [[dont_unroll]]
+    for (uint32_t j = start_j; j < end_j; ++j) {
+
+        [[unroll]] for (uint32_t idx = 0; idx < Bc * D / 4; idx += gl_WorkGroupSize.x) {
+            uint32_t d = (idx + tid) % (D / 4);
+            uint32_t c = (idx + tid) / (D / 4);
+            if (c < Bc && d < D / 4) {
+#if BLOCK_SIZE > 1
+                uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d;
+                uint ib = coord / BLOCK_SIZE;
+                uint iqs = (coord % BLOCK_SIZE);
+                f16vec4 K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K));
+#else
+                f16vec4 K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
+#endif
+
+                ksh[c * kshstride + d] = K_Tf;
+            }
+        }
+        barrier();
+
+        // K * Q^T -> S^T: Bc x D * D x Br -> Bc x Br
+        // Bc split across workgroup (four subgroups), loop over D in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16
+        // This is written transposed in order to allow for N being 8 if implementations need it
+        coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator> SfMat = coopmat<ACC_TYPE, gl_ScopeSubgroup, MatBc, MatBr, gl_MatrixUseAccumulator>(0);
+        coopmat<float16_t, gl_ScopeSubgroup, MatBc, 16, gl_MatrixUseA> KMat;
+        coopmat<float16_t, gl_ScopeSubgroup, 16, MatBr, gl_MatrixUseB> QMat;
+
+        for (uint32_t d = 0; d < D / 16; ++d) {
+            coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor);
+
+            uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4;
+            coopMatLoad(KMat, ksh, coord, kshstride, gl_CooperativeMatrixLayoutRowMajor);
+
+            SfMat = coopMatMulAdd(KMat, QMat, SfMat);
+        }
+
+        uint coord = gl_SubgroupID * MatBc * sfshstride;
+        coopMatStore(SfMat, sfsh, coord, sfshstride, gl_CooperativeMatrixLayoutRowMajor);
+        barrier();
+
+        if (p.logit_softcap != 0.0f) {
+            [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
+                uint32_t c = (idx + tid) / Br;
+                uint32_t r = (idx + tid) % Br;
+                if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
+                    sfsh[c * sfshstride + r] = ACC_TYPE(p.logit_softcap * tanh(sfsh[c * sfshstride + r]));
+                }
+            }
+            barrier();
+        }
+
+        if (p.mask != 0) {
+            [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
+                uint32_t c = (idx + tid) % Bc;
+                uint32_t r = (idx + tid) / Bc;
+                if (idx + tid < Bc * Br || idx + gl_WorkGroupSize.x <= Bc * Br) {
+                    sfsh[c * sfshstride + r] += ACC_TYPE(slope[r] * float(data_m[(i * Br + r) * m_stride + (j * Bc + c)]));
+                }
+            }
+            barrier();
+        }
+
+        float eMf[rows_per_thread];
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            float rowmaxf = sfsh[tile_row(r) + (0 * cols_per_iter + col_tid) * sfshstride];
+            [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+                rowmaxf = max(rowmaxf, float(sfsh[tile_row(r) + (c * cols_per_iter + col_tid) * sfshstride]));
+            }
+            float Moldf = Mf[r];
+
+            // M = max(rowmax, Mold)
+            // P = e^(S - M)
+            // eM = e^(Mold - M)
+            Mf[r] = max(rowmaxf, Moldf);
+            eMf[r] = exp(Moldf - Mf[r]);
+        }
+
+        [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+            [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+                Of[r][d] = float16_t(eMf[r]) * Of[r][d];
+            }
+        }
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            Lf[r] = eMf[r]*Lf[r];
+        }
+
+        [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+            float Pf[rows_per_thread];
+            [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+                Pf[r] = exp(sfsh[tile_row(r) + (c * cols_per_iter + col_tid) * sfshstride] - Mf[r]);
+                Lf[r] += Pf[r];
+            }
+            [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+#if BLOCK_SIZE > 1
+                uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
+                uint ib = coord / BLOCK_SIZE;
+                uint iqs = (coord % BLOCK_SIZE);
+                vec4 Vf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
+#else
+                vec4 Vf = vec4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]);
+#endif
+                [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+                    Of[r][d] += float16_t(Pf[r]) * ACC_TYPEV4(Vf);
+                }
+            }
+        }
+
+        barrier();
+    }
+
+    // reduce across threads
+
+    float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread];
+    [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+        FLOAT_TYPE M = Mf[r];
+        tmpsh[tid] = M;
+        // Compute max across the row
+        barrier();
+        [[unroll]] for (int s = int(gl_WorkGroupSize.x / row_split) / 2; s >= D_split; s >>= 1) {
+            M = max(M, tmpsh[tid ^ s]);
+            barrier();
+            tmpsh[tid] = M;
+            barrier();
+        }
+        rowmaxf[r] = tmpsh[d_tid + row_tid * threads_per_rowgroup];
+        barrier();
+    }
+
+    [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+        Moldf[r] = Mf[r];
+
+        // M = max(rowmax, Mold)
+        // eM = e^(Mold - M)
+        Mf[r] = max(rowmaxf[r], Moldf[r]);
+        eMf[r] = exp(Moldf[r] - Mf[r]);
+
+        Lf[r] = eMf[r]*Lf[r];
+    }
+
+    [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+        FLOAT_TYPE L = Lf[r];
+        tmpsh[tid] = L;
+        // Compute sum across the row
+        barrier();
+        [[unroll]] for (int s = int(gl_WorkGroupSize.x / row_split) / 2; s >= D_split; s >>= 1) {
+            L += tmpsh[tid ^ s];
+            barrier();
+            tmpsh[tid] = L;
+            barrier();
+        }
+        Lf[r] = tmpsh[d_tid + row_tid * threads_per_rowgroup];
+        barrier();
+    }
+
+    [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+        [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+
+            Of[r][d] = float16_t(eMf[r]) * Of[r][d];
+            tmpshv4[tid] = Of[r][d];
+
+            barrier();
+            [[unroll]] for (int s = int(gl_WorkGroupSize.x / row_split) / 2; s >= D_split; s >>= 1) {
+                Of[r][d] += tmpshv4[tid ^ s];
+                barrier();
+                tmpshv4[tid] = Of[r][d];
+                barrier();
+            }
+            Of[r][d] = tmpshv4[d_tid + row_tid * threads_per_rowgroup];
+            barrier();
+        }
+    }
+
+    // If there is split_k, then the split_k resolve shader does the final
+    // division by L. Store the intermediate O value and per-row m and L values.
+    if (p.k_num > 1) {
+        uint32_t o_offset = D * p.ne1 * split_k_index;
+
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            if (tile_row(r) < N) {
+                [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+                    [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
+                        perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N);
+                    }
+                }
+            }
+        }
+
+        o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            if (tile_row(r) < N) {
+                perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
+                perElemOpStoreCol0(tile_row(r), 0u, ACC_TYPE(Mf[r]), o_offset + p.ne1, iq2, N);
+            }
+        }
+
+        return;
+    }
+
+    float Lfrcp[rows_per_thread];
+    [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+        Lfrcp[r] = 1.0 / Lf[r];
+    }
+
+    [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            Of[r][d] *= float16_t(Lfrcp[r]);
+        }
+    }
+
+    uint32_t o_offset = iq3*p.ne2*p.ne1;
+
+    if (p.gqa_ratio > 1) {
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            if (tile_row(r) < N) {
+                [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+                    [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
+                        perElemOpGqaStore(tile_row(r), 4*(d * D_split + d_tid) + comp, float(Of[r][d][comp]), o_offset, iq2, N);
+                    }
+                }
+            }
+        }
+    } else {
+        [[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
+            if (i * Br + tile_row(r) < N) {
+                [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+                    [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
+                        data_o[o_offset + iq2 * D + (i * Br + tile_row(r)) * p.ne1 * D + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
+                    }
+                }
+            }
+        }
+    }
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -215,7 +215,7 @@ static std::mutex compile_count_mutex;
 static std::condition_variable compile_count_cond;

 void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) {
-    std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat ? "_coopmat" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32"));
+    std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat ? "_cm1" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32"));
    std::string out_fname = join_paths(output_dir, name + ".spv");
    std::string in_path = join_paths(input_dir, in_fname);

@@ -424,6 +424,7 @@ void process_shaders() {
    // flash attention
    for (const auto& f16acc : {false, true}) {
        std::string acctype = f16acc ? "float16_t" : "float";
+        std::string acctypev4 = f16acc ? "f16vec4" : "vec4";

        for (const auto& tname : type_names) {
            if (tname == "f32") {
@@ -440,6 +441,16 @@ void process_shaders() {
                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
                    merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc);
            }
+#endif
+#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
+            if (tname == "f16") {
+                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
+                    merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"ACC_TYPEV4", acctypev4}, {"COOPMAT", "1"}}), true, true, false, f16acc);
+            } else if (tname == "q4_0" || tname == "q8_0") {
+                std::string data_a_key = "DATA_A_" + to_uppercase(tname);
+                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
+                    merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"ACC_TYPEV4", acctypev4}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname)}, {"COOPMAT", "1"}}), true, true, false, f16acc);
+            }
 #endif
            if (tname == "f16") {
                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -113,7 +113,7 @@ parser.add_argument("-o", "--output", help=help_o, default="pipe")
 help_s = (
    "Columns to add to the table. "
    "Accepts a comma-separated list of values. "
-    f"Legal values: {', '.join(KEY_PROPERTIES[:-2])}. "
+    f"Legal values: {', '.join(KEY_PROPERTIES[:-3])}. "
    "Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
    "plus any column where not all data points are the same. "
    "If the columns are manually specified, then the results for each unique combination of the "
@@ -505,7 +505,7 @@ if known_args.show is not None:
    show = known_args.show.split(",")
    unknown_cols = []
    for prop in show:
-        if prop not in KEY_PROPERTIES[:-2]:  # Last two values are n_prompt, n_gen.
+        if prop not in KEY_PROPERTIES[:-3]:  # Last three values are n_prompt, n_gen, n_depth.
            unknown_cols.append(prop)
    if unknown_cols:
        logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>

+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
        {}
 };

-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                // unless the user specifies a type
                if (params->tensor_types) {
                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    const std::string tensor_name(tensor->name);
                    for (const auto & [tname, qtype] : tensor_types) {
-                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
-                            if (qtype != new_type) {
-                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                            if  (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                new_type = qtype;
+                                break; // if two or more types are specified for the tensor, first match wins
                            }
-                            new_type = qtype;
-                            break;
                        }
                    }
                }
            }
+
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2309,14 +2309,6 @@ struct clip_model_loader {
    }
 };

-// read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity) {
-    return clip_init(fname, clip_context_params{
-        /* use_gpu */   true,
-        /* verbosity */ static_cast<ggml_log_level>(verbosity),
-    });
-}
-
 struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
    g_logger_state.verbosity_thold = ctx_params.verbosity;
    clip_ctx * ctx_clip = nullptr;
@@ -3085,19 +3077,6 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }

-// deprecated
-int clip_n_patches(const struct clip_ctx * ctx) {
-    clip_image_f32 img;
-    img.nx = ctx->vision_model.hparams.image_size;
-    img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_output_tokens(ctx, &img);
-}
-
-// deprecated
-int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
-    return clip_n_output_tokens(ctx, img);
-}
-
 int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->vision_model.hparams;
    const int n_total = clip_n_output_tokens(ctx, img);
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -1,28 +1,9 @@
-#ifndef CLIP_H
-#define CLIP_H
+#pragma once

 #include "ggml.h"
 #include <stddef.h>
 #include <stdint.h>

-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define CLIP_API __declspec(dllexport)
-#        else
-#            define CLIP_API __declspec(dllimport)
-#        endif
-#    else
-#        define CLIP_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define CLIP_API
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 struct clip_ctx;

 struct clip_image_size {
@@ -39,97 +20,80 @@ struct clip_context_params {
    enum ggml_log_level verbosity;
 };

-// deprecated, use clip_init
-CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);

-CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
+void clip_free(struct clip_ctx * ctx);

-CLIP_API void clip_free(struct clip_ctx * ctx);
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);

-CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
-
-CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+int32_t clip_get_image_size (const struct clip_ctx * ctx);
+int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx);

 // TODO: should be enum, not string
-CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+const char * clip_patch_merge_type(const struct clip_ctx * ctx);

-CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
-CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
+const int32_t * clip_image_grid(const struct clip_ctx * ctx);
+size_t get_clip_image_grid_size(const struct clip_ctx * ctx);

-GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
-    "use clip_n_output_tokens instead");
-GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
-    "use clip_n_output_tokens instead");
-
-CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);

 // for M-RoPE, this will be the number of token positions in X and Y directions
 // for other models, X will be the total number of tokens and Y will be 1
-CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);

 // this should be equal to the embedding dimension of the text model
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);

-CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
-CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
-CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

-CLIP_API struct clip_image_size      * clip_image_size_init(void);
-CLIP_API struct clip_image_u8        * clip_image_u8_init (void);
-CLIP_API struct clip_image_f32       * clip_image_f32_init(void);
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+struct clip_image_size      * clip_image_size_init(void);
+struct clip_image_u8        * clip_image_u8_init (void);
+struct clip_image_f32       * clip_image_f32_init(void);
+struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava

 // nx, ny are the output image dimensions
-CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);

-CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
-CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
-CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
-CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
-CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+void clip_image_size_free (struct clip_image_size * img_size);
+void clip_image_u8_free (struct clip_image_u8  * img);
+void clip_image_f32_free(struct clip_image_f32 * img);
+void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

 // use for accessing underlay data of clip_image_f32_batch
-CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
-CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
-CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

 /**
 * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
 * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
 */
-CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);

-CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
-CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

 /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
-CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
+bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

-CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

-CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
-CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+int clip_is_minicpmv(const struct clip_ctx * ctx);
+bool clip_is_glm(const struct clip_ctx * ctx);
+bool clip_is_qwen2vl(const struct clip_ctx * ctx);
+bool clip_is_llava(const struct clip_ctx * ctx);
+bool clip_is_gemma3(const struct clip_ctx * ctx);

-CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
-CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
-CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
-CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
-CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
-
-CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // CLIP_H
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
--- a/tools/mtmd/qwen2vl-test.cpp
+++ b/tools/mtmd/qwen2vl-test.cpp
@@ -1,636 +0,0 @@
-#include "arg.h"
-#include "base64.hpp"
-#include "log.h"
-#include "common.h"
-#include "sampling.h"
-#include "clip.h"
-#include "llava.h"
-#include "llama.h"
-#include "ggml.h"
-
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-#ifdef NDEBUG
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-#endif
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <limits>
-#include <cassert>
-#include <cmath>
-
-// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
-// IT IS NOT A PRODUCTION CODE
-
-static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
-                                     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
-    int n_embd  = llama_model_n_embd(llama_get_model(ctx_llama));
-    const int patch_size = 14 * 2;
-    const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
-    const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
-    auto img_tokens = image_embed->n_image_pos;
-    // llama_pos mrope_pos[img_tokens * 4];
-    std::vector<llama_pos> mrope_pos;
-    mrope_pos.resize(img_tokens * 4);
-
-    for (int y = 0; y < ph; y++)
-    {
-        for (int x = 0; x < pw; x++)
-        {
-            int i = y * pw + x;
-            mrope_pos[i] = *st_pos_id;
-            mrope_pos[i + img_tokens] = *st_pos_id + y;
-            mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
-            mrope_pos[i + img_tokens * 3] = 0;
-        }
-    }
-    *st_pos_id += std::max(pw, ph);
-
-    int processed = 0;
-    std::vector<llama_pos> batch_mrope_pos;
-    batch_mrope_pos.resize(img_tokens * 4);
-
-    for (int i = 0; i < img_tokens; i += n_batch) {
-        int n_eval = img_tokens - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-
-        // llama_pos batch_mrope_pos[n_eval * 4];
-        std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
-        memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
-        memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
-        memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
-        memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));
-
-        llama_batch batch = {
-            int32_t(n_eval),                // n_tokens
-            nullptr,                        // token
-            (image_embed->embed+i*n_embd),  // embed
-            batch_mrope_pos.data(),         // pos
-            nullptr,  // n_seq_id
-            nullptr,  // seq_id
-            nullptr,  // logits
-        };
-
-        if (llama_decode(ctx_llama, batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-        processed += n_eval;
-    }
-    return true;
-}
-
-
-static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
-    int N = (int) tokens.size();
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        auto batch = llama_batch_get_one(&tokens[i], n_eval);
-
-        if (llama_decode(ctx_llama, batch)) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
-            return false;
-        }
-        *n_past += n_eval;
-        *st_pos_id += n_eval;
-    }
-    return true;
-}
-
-static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
-}
-
-static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
-    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
-    return true;
-}
-
-static const char * sample(struct common_sampler * smpl,
-                           struct llama_context * ctx_llama,
-                           int * n_past, int * st_pos_id) {
-    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
-    common_sampler_accept(smpl, id, true);
-
-    const llama_model * model = llama_get_model(ctx_llama);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    static std::string ret;
-    if (llama_vocab_is_eog(vocab, id)) {
-        ret = "</s>";
-    } else {
-        ret = common_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past, st_pos_id);
-    return ret.c_str();
-}
-
-static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
-static const char* IMG_BASE64_TAG_END = "\">";
-
-static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
-    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
-    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
-}
-
-static bool prompt_contains_image(const std::string& prompt) {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    return (begin != std::string::npos);
-}
-
-// replaces the base64 image tag in the prompt with `replacement`
-static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
-    size_t img_base64_str_start, img_base64_str_end;
-    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
-    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
-        return NULL;
-    }
-
-    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
-    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
-    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
-
-    auto required_bytes = base64::required_encode_size(base64_str.size());
-    auto img_bytes = std::vector<unsigned char>(required_bytes);
-    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
-
-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
-    if (!embed) {
-        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
-        return NULL;
-    }
-
-    return embed;
-}
-
-static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    if (begin == std::string::npos || end == std::string::npos) {
-        return prompt;
-    }
-    auto pre = prompt.substr(0, begin);
-    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
-    return pre + replacement + post;
-}
-
-struct llava_context {
-    struct clip_ctx * ctx_clip = NULL;
-    struct llama_context * ctx_llama = NULL;
-    struct llama_model * model = NULL;
-};
-
-static void print_usage(int, char ** argv) {
-    LOG("\n example usage:\n");
-    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {
-
-    // load and preprocess the image
-    llava_image_embed * embed = NULL;
-    auto prompt = params->prompt;
-    if (prompt_contains_image(prompt)) {
-        if (!params->image.empty()) {
-            LOG_INF("using base64 encoded image instead of command line image path\n");
-        }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
-        if (!embed) {
-            LOG_ERR("%s: can't load image from prompt\n", __func__);
-            return NULL;
-        }
-        params->prompt = remove_image_from_prompt(prompt);
-    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
-        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
-            return NULL;
-        }
-    }
-
-    return embed;
-}
-
-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
-    int n_past = 0;
-    int cur_pos_id = 0;
-
-    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-
-    std::string system_prompt, user_prompt;
-    size_t image_pos = prompt.find("<|vision_start|>");
-    if (image_pos != std::string::npos) {
-        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-        system_prompt = prompt.substr(0, image_pos);
-        user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
-        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    } else {
-        // llava-1.5 native mode
-        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
-        user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
-        if (params->verbose_prompt) {
-            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
-            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
-            }
-        }
-    }
-
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
-    if (image_embed != nullptr) {
-        auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
-        qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
-    }
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);
-
-    // generate the response
-
-    LOG("\n");
-
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
-    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
-    std::string response = "";
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
-        response += tmp;
-        if (strcmp(tmp, "</s>") == 0) break;
-        if (strstr(tmp, "###")) break; // Yi-VL behavior
-        LOG("%s", tmp);
-        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
-        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
-        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
-
-        fflush(stdout);
-    }
-
-    common_sampler_free(smpl);
-    LOG("\n");
-}
-
-static struct llama_model * llava_init(common_params * params) {
-    llama_backend_init();
-    llama_numa_init(params->numa);
-
-    llama_model_params model_params = common_model_params_to_llama(*params);
-
-    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
-    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
-        return NULL;
-    }
-    return model;
-}
-
-static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.path.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
-
-    llama_context_params ctx_params = common_context_params_to_llama(*params);
-    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
-
-    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
-        return NULL;
-    }
-
-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
-
-    ctx_llava->ctx_llama = ctx_llama;
-    ctx_llava->ctx_clip = ctx_clip;
-    ctx_llava->model = model;
-    return ctx_llava;
-}
-
-static void llava_free(struct llava_context * ctx_llava) {
-    if (ctx_llava->ctx_clip) {
-        clip_free(ctx_llava->ctx_clip);
-        ctx_llava->ctx_clip = NULL;
-    }
-
-    llama_free(ctx_llava->ctx_llama);
-    llama_model_free(ctx_llava->model);
-    llama_backend_free();
-}
-
-#ifndef NDEBUG
-
-static void debug_test_mrope_2d() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-// #ifdef GGML_USE_CUDA
-//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-//     backend = ggml_backend_cuda_init(0); // init device 0
-//     backend_name = "cuda";
-//     if (!backend) {
-//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-//     }
-// #endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 2 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
-    ggml_set_name(pos, "pos");
-    ggml_set_input(pos);
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(128 * 12 * 30);
-    std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    std::vector<int> pos_id;
-    pos_id.resize(30 * 4);
-    for (int i = 0; i < 30; i ++) {
-        pos_id[i] = i;
-        pos_id[i + 30] = i + 10;
-        pos_id[i + 60] = i + 20;
-        pos_id[i + 90] = i + 30;
-    }
-    int sections[4] = {32, 32, 0, 0};
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx_cgraph = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx_cgraph = ggml_init(params0);
-    gf = ggml_new_graph(ctx_cgraph);
-
-    struct ggml_tensor * result0 = ggml_rope_multi(
-        ctx_cgraph, inp_raw, pos, nullptr,
-        128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
-        0, 1, 32, 1);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, result0);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = result0;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "mrope_2d_" + backend_name +".bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx_cgraph);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-enum model_output_type {
-    conv3d,
-    patch_embed,
-    patch_win_attn_scatter,
-    first_attn_layer,
-    last_attn_layer,
-    attn_softmax,
-    final_layer,
-};
-
-static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) {
-    constexpr int ih = 140;
-    constexpr int iw = 196;
-    // constexpr int ih = 56;
-    // constexpr int iw = 56;
-    // int n_embd  = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
-    int n_embd  = 1280;
-    int merge = 1;
-    if (output_type == model_output_type::final_layer) {
-        n_embd  = 2048;
-        merge = 2;
-    }
-    else if (output_type == model_output_type::attn_softmax) {
-        merge = 1;
-        n_embd = (ih/14/merge) * (iw/14/merge) * 16;
-    }
-
-    int ne = (ih/14/merge) * (iw/14/merge) * n_embd;
-    float vals[iw * ih * 3];
-    // float embd[ne];
-    std::vector<float> embd;
-    embd.resize(ne);
-
-    for (int i = 0; i < iw*ih; i++)
-    {
-        for (int c = 0; c < 3; c++)
-            vals[i * 3 + c] = (float)i / (iw*ih);
-    }
-
-    clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data());
-
-    std::string file_postfix = "";
-    switch (output_type)
-    {
-    case model_output_type::conv3d:
-        file_postfix = "conv3d";
-        break;
-    case model_output_type::patch_embed:
-        file_postfix = "patch_embed";
-        break;
-    case model_output_type::patch_win_attn_scatter:
-        file_postfix = "scatter";
-        break;
-    case model_output_type::first_attn_layer:
-        file_postfix = "first_attn";
-        break;
-    case model_output_type::last_attn_layer:
-        file_postfix = "last_attn";
-        break;
-    case model_output_type::attn_softmax:
-        file_postfix = "attn_softmax";
-        break;
-    case model_output_type::final_layer:
-        file_postfix = "final";
-        break;
-    default:
-        break;
-    }
-    auto output_path = "img_embed_" + file_postfix + ".bin";
-
-    std::ofstream outFile(output_path, std::ios::binary);
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));
-
-        outFile.close();
-        std::cout << "Data successfully written to ::[ " << output_path << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-}
-
-#endif
-
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
-        return 1;
-    }
-
-    common_init();
-
-    if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv);
-        return 1;
-    }
-
-    auto * model = llava_init(&params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
-        return 1;
-    }
-
-    if (prompt_contains_image(params.prompt)) {
-        auto * ctx_llava = llava_init_context(&params, model);
-
-        auto * image_embed = load_image(ctx_llava, &params, "");
-
-        // process the prompt
-        process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        llava_image_embed_free(image_embed);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-#ifndef NDEBUG
-    } else if (params.image[0].empty()) {
-        auto ctx_llava = llava_init_context(&params, model);
-
-        // debug_test_mrope_2d();
-        debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
-        // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-#endif
-    } else {
-        for (auto & image : params.image) {
-            auto * ctx_llava = llava_init_context(&params, model);
-
-            auto * image_embed = load_image(ctx_llava, &params, image);
-            if (!image_embed) {
-                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
-                return 1;
-            }
-
-            // process the prompt
-            process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-            llama_perf_context_print(ctx_llava->ctx_llama);
-            llava_image_embed_free(image_embed);
-            ctx_llava->model = NULL;
-            llava_free(ctx_llava);
-        }
-    }
-
-    llama_model_free(model);
-
-    return 0;
-}
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -57,6 +57,12 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
    { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing",  },
 };

+// Quantization types. Changes to this struct must be replicated in llama-quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
@@ -244,56 +250,10 @@ static ggml_type parse_ggml_type(const char * arg) {
            return type;
        }
    }
-    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg);
    return GGML_TYPE_COUNT;
 }

-// Allowed tensors for arbitrary quantization with --tensor-type option
-static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-    "attn_k",
-    "attn_kv_a_mqa",
-    "attn_kv_b",
-    "attn_o",
-    "attn_output",
-    "attn_q",
-    "attn_q_a",
-    "attn_q_b",
-    "attn_qkv",
-    "attn_v",
-    "channel_mix_key",
-    "channel_mix_receptance",
-    "channel_mix_value",
-    "cls",
-    "cls.output",
-    "cross_attn_k",
-    "cross_attn_o",
-    "cross_attn_q",
-    "cross_attn_v",
-    "ffn_act",
-    "ffn_down",
-    "ffn_down_exps",
-    "ffn_down_shexp",
-    "ffn_gate",
-    "ffn_gate_exps",
-    "ffn_gate_shexp",
-    "ffn_up",
-    "ffn_up_exps",
-    "ffn_up_shexp",
-    "ssm_in",
-    "ssm_out",
-    "time_mix_gate",
-    "time_mix_key",
-    "time_mix_output",
-    "time_mix_receptance",
-    "time_mix_value",
-};
-
-// changes to this struct must be replicated in llama-quant.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr) {
@@ -306,7 +266,6 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
        printf("\n%s: missing tensor name\n\n", __func__);
        return false;
    }
-
    if (const size_t qt_len = strlen(sep); qt_len == 1) {
        printf("\n%s: missing quantization type\n\n", __func__);
        return false;
@@ -315,37 +274,15 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
    std::string tn(data, tn_len);
    std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
    sep++;
-    const std::string qt(sep);
-
-    bool found = false;
-    for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
-        std::string tensor;
-        tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
-        // handle special case of cls.output
-        std::string cls_output = "cls.output";
-        if (tn.find(cls_output) != std::string::npos) {
-            tensor = "cls.output";
-        }
-        // check if an allowed tensor exists and it's at the end of the kv string
-        if (tensor == allowed) {
-            found = true;
-            break;
-        }
-    }
-    if (!found) {
-        printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
-        return false;
-    }
-
-    if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
-        printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
-        return false;
-    }
-
    tensor_quantization tqz;
    tqz.name = tn;
-    tqz.quant = parse_ggml_type(qt.c_str());
+    tqz.quant = parse_ggml_type(sep);
    tensor_type.emplace_back(std::move(tqz));
+    if (tqz.quant == GGML_TYPE_COUNT) {
+        printf("\n%s: invalid quantization type '%s'\n\n", __func__, sep);
+        return false;
+    }
+
    return true;
 }

--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1040,7 +1040,7 @@ To know the `id` of the adapter, use GET `/lora-adapters`

 Returns information about the loaded model. See [OpenAI Models API documentation](https://platform.openai.com/docs/api-reference/models).

-The returned list always has one single element.
+The returned list always has one single element. The `meta` field can be `null` (for example, while the model is still loading).

 By default, model `id` field is the path to model file, specified via `-m`. You can set a custom value for model `id` field via `--alias` argument. For example, `--alias gpt-4o-mini`.

--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3705,6 +3705,9 @@ int main(int argc, char ** argv) {
            if (req.path == "/" || tmp.back() == "html") {
                res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
                res.status = 503;
+            } else if (req.path == "/models" || req.path == "/v1/models") {
+                // allow the models endpoint to be accessed during loading
+                return true;
            } else {
                res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
            }
@@ -4363,7 +4366,13 @@ int main(int argc, char ** argv) {
        res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
    };

-    const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
+    const auto handle_models = [&params, &ctx_server, &state, &res_ok](const httplib::Request &, httplib::Response & res) {
+        server_state current_state = state.load();
+        json model_meta = nullptr;
+        if (current_state == SERVER_STATE_READY) {
+            model_meta = ctx_server.model_meta();
+        }
+
        json models = {
            {"object", "list"},
            {"data", {
@@ -4372,7 +4381,7 @@ int main(int argc, char ** argv) {
                    {"object",   "model"},
                    {"created",  std::time(0)},
                    {"owned_by", "llamacpp"},
-                    {"meta",     ctx_server.model_meta()}
+                    {"meta",     model_meta},
                },
             }}
        };
--- a/tools/server/webui/package-lock.json
+++ b/tools/server/webui/package-lock.json
@@ -44,6 +44,7 @@
        "eslint": "^9.17.0",
        "eslint-plugin-react-hooks": "^5.0.0",
        "eslint-plugin-react-refresh": "^0.4.16",
+        "fflate": "^0.8.2",
        "globals": "^15.14.0",
        "prettier": "^3.4.2",
        "sass-embedded": "^1.83.4",
@@ -2802,6 +2803,13 @@
        "reusify": "^1.0.4"
      }
    },
+    "node_modules/fflate": {
+      "version": "0.8.2",
+      "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.2.tgz",
+      "integrity": "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A==",
+      "dev": true,
+      "license": "MIT"
+    },
    "node_modules/file-entry-cache": {
      "version": "8.0.0",
      "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz",
--- a/tools/server/webui/package.json
+++ b/tools/server/webui/package.json
@@ -5,7 +5,7 @@
  "type": "module",
  "scripts": {
    "dev": "vite",
-    "build": "tsc -b && vite build",
+    "build": "npm run format && tsc -b && vite build",
    "format": "eslint . && prettier --write .",
    "lint": "eslint .",
    "preview": "vite preview"
@@ -47,6 +47,7 @@
    "eslint": "^9.17.0",
    "eslint-plugin-react-hooks": "^5.0.0",
    "eslint-plugin-react-refresh": "^0.4.16",
+    "fflate": "^0.8.2",
    "globals": "^15.14.0",
    "prettier": "^3.4.2",
    "sass-embedded": "^1.83.4",
--- a/tools/server/webui/src/components/ChatScreen.tsx
+++ b/tools/server/webui/src/components/ChatScreen.tsx
@@ -1,4 +1,4 @@
-import { useEffect, useMemo, useRef, useState } from 'react';
+import { ClipboardEvent, useEffect, useMemo, useRef, useState } from 'react';
 import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
 import ChatMessage from './ChatMessage';
 import { CanvasType, Message, PendingMessage } from '../utils/types';
@@ -328,6 +328,17 @@ function ChatInput({
        {({ getRootProps, getInputProps }) => (
          <div
            className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
+            onPasteCapture={(e: ClipboardEvent<HTMLInputElement>) => {
+              const files = Array.from(e.clipboardData.items)
+                .filter((item) => item.kind === 'file')
+                .map((item) => item.getAsFile())
+                .filter((file) => file !== null);
+
+              if (files.length > 0) {
+                e.preventDefault();
+                extraContext.onFileAdded(files);
+              }
+            }}
            {...getRootProps()}
          >
            {!isGenerating && (
--- a/tools/server/webui/vite.config.ts
+++ b/tools/server/webui/vite.config.ts
@@ -3,7 +3,7 @@ import react from '@vitejs/plugin-react';
 import { viteSingleFile } from 'vite-plugin-singlefile';
 import path from 'node:path';
 import fs from 'node:fs';
-import zlib from 'node:zlib';
+import * as fflate from 'fflate';

 /* eslint-disable */

@@ -33,9 +33,10 @@ const BUILD_PLUGINS = [
      },
      writeBundle() {
        const outputIndexHtml = path.join(config.build.outDir, 'index.html');
-        const content =
+        let content =
          GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8');
-        const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), {
+        content = content.replace(/\r/g, ''); // remove windows-style line endings
+        const compressed = fflate.gzipSync(Buffer.from(content, 'utf-8'), {
          level: 9,
        });
Author	SHA1	Message	Date
Georgi Gerganov	237acc7cd5	server : update readme + return json for "meta" field	2025-05-14 15:30:12 +03:00
Georgi Gerganov	6190e1c1c9	server : passthrough the /models endpoint during loading	2025-05-14 14:17:20 +03:00
bandoti	09d13d94fb	cmake: simplify vulkan shader test logic (#13263 )	2025-05-14 07:53:57 -03:00
Jeff Bolz	24e86cae72	vulkan: KHR_coopmat flash attention (#13506 ) This shader uses coopmat1 to do the QK^T multiply. The PV multiply is more difficult for various reasons so I haven't done it. Performance for this shader is around 2.5x better than for the scalar shader when doing prompt processing. Some of the benefit may be from other optimizations like staging through shared memory, or splitting by rows.	2025-05-14 11:55:26 +02:00
Xuan-Son Nguyen	bb1681fbd5	webui : use fflate for more deterministic gzip compress (#13525 ) * webui : use pako for more deterministic gzip compress * simpler code * use fflate instead of pako	2025-05-14 10:26:12 +02:00
Luca Stefani	d486dd3e8e	webui: Allow pasting file from clipboard (#13526 ) * server: Allow pasting file from clipboard * server: Prevent default action on file paste * update build * format then build combined --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>	2025-05-14 10:07:31 +02:00
ddpasa	21ca987fba	docs: Update link to ggml-org in multimodal.md (#13513 ) * Update multimodal.md Minor change to include the huggingface link * Update docs/multimodal.md --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>	2025-05-14 09:59:12 +02:00
Sigbjørn Skjæret	be1d4a13db	scripts : fix compare-llama-bench.py show parameter (#13514 )	2025-05-14 08:41:01 +02:00
Jeff Bolz	ab3971f2a0	vulkan: workaround FA compile failures on macos (#13517 )	2025-05-14 06:15:50 +02:00
Ed Addario	e5c834f718	quantize : improve tensor-type pattern matching (#13033 )	2025-05-13 19:12:31 +02:00
Xuan-Son Nguyen	71bdbdb587	clip : clip.h become private API (⚠️ breaking change) (#13510 )	2025-05-13 17:07:21 +02:00
Georgi Gerganov	f0995d28ce	metal : use FA-vec kernel up to batch size 20 (#13496 ) * batched-bench : fix pp batch contents * metal : optimize multi-sequence FA vec kernel ggml-ci * metal : use FA-vec kernel up to batch size 20 ggml-ci	2025-05-13 18:04:39 +03:00
Georgi Gerganov	c252e0c409	metal : optimize multi-sequence FA vec kernel (#13493 ) * batched-bench : fix pp batch contents * metal : optimize multi-sequence FA vec kernel ggml-ci	2025-05-13 18:04:00 +03:00