Compare commits

...

4 Commits
b6080 ... b6084

Author SHA1 Message Date
Sigbjørn Skjæret
2721257e3e quantize : fix confusing error message if ftype is invalid (#15071) 2025-08-04 18:11:02 +02:00
Reese Levine
587d0118f5 ggml: WebGPU backend host improvements and style fixing (#14978)
* Add parameter buffer pool, batching of submissions, refactor command building/submission

* Add header for linux builds

* Free staged parameter buffers at once

* Format with clang-format

* Fix thread-safe implementation

* Use device implicit synchronization

* Update workflow to use custom release

* Remove testing branch workflow
2025-08-04 08:52:43 -07:00
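The bullets above mention a parameter buffer pool, batched command submission, and a thread-safe implementation. As a rough illustration only, here is a minimal C++ sketch of that general pool-and-release-per-batch idea; all names and structure are hypothetical and do not come from the actual ggml WebGPU backend code:

```cpp
// Hypothetical sketch of a "parameter buffer pool" with batch-wise release.
// Illustrative only; not the ggml WebGPU backend implementation.
#include <cstddef>
#include <memory>
#include <mutex>
#include <vector>

struct ParamBuffer {
    std::vector<std::byte> data; // stand-in for a GPU-visible uniform/storage buffer
    bool in_use = false;
};

class ParamBufferPool {
public:
    // Acquire a buffer of at least `size` bytes, reusing a free one from the pool when possible.
    ParamBuffer * acquire(std::size_t size) {
        std::lock_guard<std::mutex> lock(mtx_); // keep pool access thread-safe
        for (auto & buf : pool_) {
            if (!buf->in_use) {
                buf->in_use = true;
                buf->data.resize(size);
                return buf.get();
            }
        }
        pool_.push_back(std::make_unique<ParamBuffer>());
        pool_.back()->data.resize(size);
        pool_.back()->in_use = true;
        return pool_.back().get();
    }

    // After a batch of command submissions has completed, release all staged buffers at once.
    void free_staged() {
        std::lock_guard<std::mutex> lock(mtx_);
        for (auto & buf : pool_) {
            buf->in_use = false;
        }
    }

private:
    std::mutex mtx_;
    std::vector<std::unique_ptr<ParamBuffer>> pool_;
};
```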
Jeff Bolz
5aa1105da2 vulkan: fix build when using glslang that does not support coopmat2 (#15062) 2025-08-04 07:09:19 +02:00
compilade
d31192b4ee imatrix : use GGUF by default (#14842)
* imatrix : use GGUF by default

* imatrix : use GGUF regardless of the output filename

The legacy format can only be produced with --output-format dat
2025-08-03 22:00:05 +02:00
9 changed files with 531 additions and 448 deletions


@@ -159,31 +159,15 @@ jobs:
- name: Dawn Dependency
id: dawn-depends
run: |
ARTIFACTS_JSON=$(curl -s -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/google/dawn/actions/artifacts")
echo "Finding latest macos-latest-Release artifact..."
DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
| sort_by(.created_at)
| reverse
| map(select(.name | test("macos-latest-Release$")))
| .[0].archive_download_url')
if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
echo "No suitable Dawn artifact found!"
exit 1
fi
echo "Downloading from: $DOWNLOAD_URL"
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-o artifact.zip "$DOWNLOAD_URL"
unzip artifact.zip
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar_file=$(find . -name '*.tar.gz' | head -n 1)
echo "Extracting: $tar_file"
tar -xvf "$tar_file" -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -433,31 +417,15 @@ jobs:
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
ARTIFACTS_JSON=$(curl -s -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/google/dawn/actions/artifacts")
echo "Finding latest ubuntu-latest-Release artifact..."
DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
| sort_by(.created_at)
| reverse
| map(select(.name | test("ubuntu-latest-Release$")))
| .[0].archive_download_url')
if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
echo "No suitable Dawn artifact found!"
exit 1
fi
echo "Downloading from: $DOWNLOAD_URL"
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-o artifact.zip "$DOWNLOAD_URL"
unzip artifact.zip
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar_file=$(find . -name '*.tar.gz' | head -n 1)
echo "Extracting: $tar_file"
tar -xvf "$tar_file" -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build


@@ -2647,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_out_freq = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--output-format"}, "{gguf,dat}",
string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
[](common_params & params, const std::string & value) {
/**/ if (value == "gguf") { params.imat_dat = false; }
else if (value == "dat") { params.imat_dat = true; }
else { throw std::invalid_argument("invalid output format"); }
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--save-frequency"}, "N",
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),


@@ -439,6 +439,7 @@ struct common_params {
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
bool imat_dat = false; // whether the legacy imatrix.dat format should be output
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity


@@ -3096,9 +3096,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
uint32_t conv2d_SHMEM_PAD = 4;
bool conv2d_UNROLL = true;
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
conv2d_SHMEM_PAD = 8; // 8 float16_t
}
#endif
if (device->vendor_id == VK_VENDOR_ID_INTEL) {
conv2d_SHMEM_PAD = 0;
@@ -3158,6 +3160,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_cm2_len, conv2d_f32_cm2_data, "main", 3,
@@ -3165,7 +3168,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_cm2_len, conv2d_f16_f32_cm2_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
} else if (conv2d_UNROLL) {
} else
#endif
if (conv2d_UNROLL) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
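The hunk above fixes builds where glslang lacks coopmat2 support: the coopmat2-only branch is wrapped in `GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT`, and `} else if (conv2d_UNROLL) {` is split into `} else` / `#endif` / `if (conv2d_UNROLL) {` so the fallback branch parses whether or not the macro is defined. A minimal, self-contained C++ sketch of the same pattern (all names hypothetical, not from ggml):

```cpp
// Standalone illustration of combining a compile-time feature guard with a runtime
// capability check; without the macro the code collapses to a plain if/else and still builds.
#include <cstdio>

#define FEATURE_X_SUPPORTED 1 // pretend the optional feature is available at build time

static bool have_feature_x()    { return true; }              // runtime device capability check
static void run_feature_path()  { std::puts("feature path"); }
static void run_unrolled_path() { std::puts("unrolled path"); }
static void run_generic_path()  { std::puts("generic path"); }

static void dispatch(bool use_unrolled) {
#if defined(FEATURE_X_SUPPORTED)
    if (have_feature_x()) {
        run_feature_path();
    } else
#endif
    // the fallback condition is evaluated whether or not the macro is defined
    if (use_unrolled) {
        run_unrolled_path();
    } else {
        run_generic_path();
    }
}

int main() {
    dispatch(true);
    return 0;
}
```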


@@ -661,8 +661,10 @@ void process_shaders() {
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
#endif
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));

File diff suppressed because it is too large.


@@ -7,7 +7,7 @@ More information is available in <https://github.com/ggml-org/llama.cpp/pull/486
```
./llama-imatrix \
-m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \
-m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \
[--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
[--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
[--show-statistics] [...]
@@ -20,6 +20,7 @@ The parameters in square brackets are optional and have the following meaning:
* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf".
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
* `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
@@ -45,14 +46,19 @@ Recent versions of `llama-imatrix` store data in GGUF format by default. For the
```bash
# generate and save the imatrix using legacy format
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -o imatrix-legcy-format.dat -ngl 99
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legcy-format.dat -ngl 99
```
```bash
# covert legacy (binary) imatrix format to new (GGUF) format
# convert legacy (binary) imatrix format to new (GGUF) format
./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
```
```bash
# convert new (GGUF) imatrix format to legacy (binary) format
./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat
```
```bash
# combine existing imatrices
./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf


@@ -26,7 +26,7 @@
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
" [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
" [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
" [--show-statistics] [...]\n" , argv[0]);
@@ -506,13 +506,13 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
auto fname = m_params.out_file;
bool use_legacy_format = m_params.imat_dat;
// TODO: use the new format in more cases
if (!string_ends_with(fname, ".gguf")) {
LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
if (use_legacy_format) {
this->save_imatrix_legacy(n_chunk);
return;
}
// else, default to GGUF imatrix
if (n_chunk > 0) {
fname += ".at_";


@@ -611,7 +611,7 @@ int main(int argc, char ** argv) {
return 1;
}
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[arg_idx]);
return 1;
}
if (ftype_str == "COPY") {