Compare commits

...

4 Commits

Author SHA1 Message Date
Georgi Gerganov
457e2288c9 sync : ggml 2026-05-02 07:22:35 +03:00
Georgi Gerganov
e8ec7ab058 ggml : try fix win32 build (whisper/0) 2026-05-02 07:22:35 +03:00
Yiwei Shao
1a03cf47f6 hexagon: hmx flash attention (#22347)
* hmx: extract shared interleave headers and unify matmul batched

* hmx: add HMX-accelerated flash attention for prefill

* hmx: replace asm wrappers with Q6_ intrinsics in hmx-utils.h

Switches three single-instruction helpers from inline asm to the
corresponding Q6_ intrinsics, matching the style established by aizip
f8737609a and used in the upstream PR #21554 hmx-matmul-ops.c rewrite:

  hmx_set_output_scales       asm "bias=mxmem2"  -> Q6_bias_mxmem2_A
  hmx_load_tile_pair_fp16     asm packet         -> Q6_activation_hf_mxmem_RR
                                                    + Q6_weight_hf_mxmem_RR
  hmx_consume_accumulator_fp16 asm "mxmem=acc"   -> Q6_mxmem_AR_after_hf

hmx_load_tiles_fp16 stays on inline asm: it uses ":deep" activation
streaming, and the mixed Q6_activation_hf_mxmem_RR_deep + non-deep
Q6_weight_hf_mxmem_RR pair fails the HMX backend constraint check
("activate weight pair (1) exceeds limit (1)"). The asm bundle keeps
both halves in one VLIW packet and avoids the diagnostic.

Functionally equivalent — same instructions emitted; the Q6_ intrinsics
just give the compiler more visibility for scheduling.

* hmx: drop the duplicate interleave_fp16_weight_chunk_to_tiles

* hmx: apply upstream optimization to hmx-flash-attn-ops.c
Apply restrict, __builtin_assume, and pointer accumulation to the three HMX workers (qk_dot, o_update, o_norm) and to the matching inline HMX loops in op_hmx_flash_attn_ext.

* hmx: unify interleave helper

* hmx: multi-thread Q load / O store and enable prefill FA dispatch

Extract inline Q-load and O-store loops into worker_pool-parallel helpers
(fa_phase_q_load, fa_phase_o_store) so HVX threads split the F32↔F16
conversion work across row ranges. Also relax the softmax threading
gate from n_row_vec_cnt >= n_threads to >= 2; the old gate unnecessarily
forced a single-thread fallback when n_rows_g < 512.
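
A rough sketch of the row-range split (the struct, function name, and signature here are hypothetical, not the actual fa_phase_q_load/worker_pool interface): each worker converts only its own slice of Q rows, so the F32 to F16 conversion is spread across HVX threads.

  #include <stddef.h>

  struct q_load_job {
      const float * q_f32;    // source Q rows (F32)
      __fp16      * q_f16;    // destination rows in VTCM (F16)
      int           n_rows;   // total Q rows
      int           row_len;  // elements per row
  };

  // Convert rows [r0, r1) assigned to this worker; other workers take other ranges.
  static void q_load_worker(const struct q_load_job * job, int thread_idx, int n_threads) {
      const int per_thread = (job->n_rows + n_threads - 1) / n_threads;
      const int r0 = thread_idx * per_thread;
      const int r1 = (r0 + per_thread < job->n_rows) ? r0 + per_thread : job->n_rows;
      for (int r = r0; r < r1; ++r) {
          for (int c = 0; c < job->row_len; ++c) {
              job->q_f16[(size_t) r * job->row_len + c] =
                  (__fp16) job->q_f32[(size_t) r * job->row_len + c];
          }
      }
  }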

On the dispatch side, remove the ne[2] != 1 guard that blocked multi-head
(prefill) FA from reaching the HTP backend — GQA is already handled
internally by both the HMX and HVX flash-attention paths.

* hmx: relax matmul pipeline gate to cover k > n shapes (e.g. FFN_down)

* hmx: optimize FA softmax mask phase (no-ALiBi fast path + GQA dedup)

* hmx: Add an asm memory clobber at the phase boundary to prevent reorder bug

* [experimental]: fp16 softmax (EXP2_HF) to accelerate fa

Bake log2(e) into qk_scale and use hvx_exp2_hf directly for P and m_diff
(base-2 consistent, matches htp-ops-lib). ~22 ALU ops for 64 lanes vs
~44 for the F32 round-trip path.
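
A minimal scalar sketch of the identity behind this: exp(x) == exp2(x * log2(e)), so multiplying qk_scale by log2(e) once up front lets the softmax evaluate a base-2 exponential (hvx_exp2_hf in the HVX code) for both P and m_diff.

  #include <math.h>

  static float softmax_term_f32_roundtrip(float qk, float qk_scale) {
      return expf(qk * qk_scale);                      // F32 round-trip path
  }

  static float softmax_term_exp2(float qk, float qk_scale2) {
      // qk_scale2 = qk_scale * 1.4426950408889634f (log2(e)), pre-baked once
      return exp2f(qk * qk_scale2);
  }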

* hmx flash-attn: refine cost model coefficients based on profiling data

* hmx flash-attn: replace asm clobber with targeted volatile reads on vtcm_d_tiles

* hmx flash-attn: fix prefill correctness (dst indexing, softmax reduce, V stride)

* hmx flash-attn: fix p_tiles dual-tile OOB race; enable MT + pipeline

* hmx flash-attn: preserve additive mask bias in no-ALiBi fast path

The no-ALiBi fast path (max_bias==0) was skipping mask add entirely on
the assumption that mask values are only {0, -inf}.  This is wrong when
the mask carries additive positional bias — those terms were silently
dropped.  Keep the slope-mul skip (slope≡1.0) but add mask back so the
bias survives; vmux still clamps below -16 to -inf.
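
A scalar sketch of the corrected fast path (hypothetical helper name; the real code is vectorized): the slope multiply is still skipped when max_bias == 0, since slope is identically 1.0, but the additive mask is applied so positional bias survives; the HVX code then clamps anything below -16 to -inf with vmux.

  static float masked_logit(float qk_scaled, float mask_val, float max_bias, float slope) {
      if (max_bias == 0.0f) {
          return qk_scaled + mask_val;        // no slope multiply, but keep the mask add
      }
      return qk_scaled + slope * mask_val;    // ALiBi path: slope-weighted mask
  }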

Also add HMX FA coverage to test-backend-ops: prefill shapes (nb=64,
nb=32) × {mask on/off} × {ALiBi on/off} × {softcap on/off}, F16 KV,
hs ∈ {64, 128}.

* hmx: fix softcap+EXP2_HF interaction, tighten matmul pipeline gate, add FA tests

- flash-attn: when EXP2_HF is on AND logit_softcap is active, fold
  log2(e) into the post-tanh multiplier (v_cap) instead of pre-baking
  it into qk_scale.  Pre-baking shifted the tanh knee from x≈c to
  x≈c/log2(e) and produced numerically wrong softcapped outputs
  whenever both knobs were enabled (see the scalar sketch after this list).
- flash-attn softmax (fa_softmax_thread): replace the union+memcpy
  scalar extract pattern with HVX vmux-based per-row accumulators on
  rowmax/rowsum.  Add hvx_vec_get_f16 helper in hvx-base.h.  Functional
  parity, less scalar code, clearer hf/qf16 lane-format contract.
- matmul (hmx_mat_mul_permuted_qk_0_d16a32): pick pipeline vs sequential
  layout based on whether the chunker actually yields >=2 n-chunks,
  instead of the static (m>=128 && n>=256) gate.  Avoids paying for
  output double-buffer + worker dispatch when there is no HMX/HVX
  overlap to gain (e.g. shapes that collapse to one n-chunk).
- tests: add HMX flash-attention coverage over the
  {mask, ALiBi (max_bias), logit_softcap} cross-product for the prefill
  path — head_dim 64/128, GQA 4×4, kv=512/nb=64 plus a kv=113/nb=32
  non-aligned case.
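
A scalar sketch of the first item above (hypothetical helper; the real code works on HVX vectors): with logit_softcap c, the logit is s = c * tanh(qk_scale * qk / c) and the base-2 softmax needs s * log2(e). Pre-baking log2(e) into qk_scale moves the tanh knee from x ≈ c to x ≈ c / log2(e); folding it into the post-tanh multiplier keeps the knee in place.

  #include <math.h>

  static float softcapped_logit_exp2(float qk, float qk_scale, float c) {
      const float log2e = 1.4426950408889634f;
      // wrong: c * tanhf((qk_scale * log2e) * qk / c)  -> knee shifts to c / log2(e)
      return (c * log2e) * tanhf(qk_scale * qk / c);     // log2(e) folded into v_cap
  }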

* [Help Wanted]: refactor D matrix computation into separate function for clarity and maintainability

* format code

* hexagon: looks like -O3 is causing issues with the large code base, switch to -O2 and -flto instead

* hexagon: use hex_ prefix for swap_ptr

* hexagon: move vtcm_seq_alloc into vtcm-utils.h

More vtcm allocator updates are coming, so it makes sense to start a separate header for it.
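
A usage sketch (the function name and sizes are illustrative): vtcm_seq_alloc, whose definition appears in the new vtcm-utils.h later in this diff, carves consecutive regions out of one VTCM window by advancing a cursor, so a caller can lay out its working set and then check that it fits.

  #include <stdint.h>
  #include <stddef.h>
  #include "vtcm-utils.h"

  static int fa_vtcm_fits(uint8_t * vtcm_base, size_t vtcm_size,
                          size_t q_bytes, size_t k_bytes, size_t v_bytes) {
      uint8_t * cursor  = vtcm_base;
      uint8_t * q_tiles = vtcm_seq_alloc(&cursor, q_bytes);
      uint8_t * k_tiles = vtcm_seq_alloc(&cursor, k_bytes);
      uint8_t * v_tiles = vtcm_seq_alloc(&cursor, v_bytes);
      (void) q_tiles; (void) k_tiles; (void) v_tiles;
      return (size_t) (cursor - vtcm_base) <= vtcm_size;  // else fall back (e.g. HVX path)
  }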

* hmx-utils: add hmx_prefix for layout converters

* hmx-mm: move main hmx_mm functions to the end, remove unused fwd decls, etc

* hmx-mm: remove unused qweight_fetch_task_state_t and minor alignment fixes

* hmx-fa: minor alignment fixes

* hmx-fa: move hmx_flash_attn_ext into hmx-ops.h

* hmx-fa: remove redundant workpool pointer in the hmx_fa_ctx, plus minor alignment updates

* hmx-fa: minor alignment and simplifications

* hexagon: move FA_EXP2_HF option to host-side CMake file

* hmx-fa: use hvx_vec_splat_f16 instead of fp16_to_bits

* hmx-fa: add hvx_splat_u16/u8 and use them in the fa instead of the custom hvx_fill

* hmx-fa: some more alignment updates in the core fa function

* hmx-fa: keep slopes in vtcm in fp16

Saves malloc/free and removes the need for float -> fp16 downcast on every use.
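
A sketch of the per-use path this enables (hypothetical helper; assumes hvx_vec_splat_f16 takes an fp16 scalar and returns an HVX_Vector, as its existing callers suggest): with the slopes already stored as __fp16 in VTCM, broadcasting a head's slope is a single fp16 splat.

  #include "hvx-base.h"   // assumed to provide HVX_Vector and hvx_vec_splat_f16

  static HVX_Vector fa_head_slope_vec(const __fp16 * vtcm_slopes, int head) {
      return hvx_vec_splat_f16(vtcm_slopes[head]);   // no float -> fp16 downcast per use
  }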

* hexagon: consistent noinline usage (after static)

* hex-hmx: consistent use of FARF_HIGH to enable debug output

* hmx-utils: no need for always_inline attr

* hex-hmx: consistent noinline usage (static noinline ...)

* hex-hmx: simplify init_col_scales

* hexagon: fix editorconfig errors

* hmx-mm: minor alignment fixes

---------

Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
2026-05-01 20:29:13 -07:00
ddh0
b97ebdc98f llama-quant : fix --tensor-type when default qtype is overridden (#22572)
fix #22544 (my fault!)

Credit to @Anai-Guo, ref #22559 - since that one was closed due to the
new contributor policy, I am taking the liberty of re-submitting that PR
here.
2026-05-01 19:55:55 +02:00
16 changed files with 2776 additions and 747 deletions

View File

@@ -22,7 +22,8 @@ message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
include(ExternalProject)
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF)
set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")

View File

@@ -2254,8 +2254,7 @@ static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_sess
return false;
}
if (dst->ne[2] != 1 || dst->ne[3] != 1) {
// FA during prompt still needs work
if (dst->ne[3] != 1) {
return false;
}

View File

@@ -44,6 +44,11 @@ target_compile_definitions(${HTP_LIB} PRIVATE
$<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,FARF_HIGH=1,>
FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
if (GGML_HEXAGON_FA_EXP2_HF)
message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)")
target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1)
endif()
# HMX acceleration: available on v73+ architectures
set(HTP_HMX_VERSIONS v73 v75 v79 v81)
list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
@@ -52,11 +57,13 @@ if (_hmx_idx GREATER_EQUAL 0)
target_sources(${HTP_LIB} PRIVATE
hmx-queue.c
hmx-matmul-ops.c
hmx-flash-attn-ops.c
)
# -mhmx enables HMX instruction set (needed by files that include hmx-utils.h)
set_source_files_properties(
hmx-matmul-ops.c
hmx-flash-attn-ops.c
PROPERTIES COMPILE_OPTIONS "-mhmx"
)

View File

@@ -138,15 +138,15 @@ set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,")
set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")
#Compiler Options
set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")
set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g")
set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O2")
set(CMAKE_C_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g")
set(CMAKE_C_FLAGS_RELEASE "${COMMON_FLAGS} -O3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g")
set(CMAKE_C_FLAGS_RELEASE "${COMMON_FLAGS} -O2")
set(CMAKE_ASM_FLAGS_DEBUG "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_ASM_FLAGS_RELEASE "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")

View File

@@ -17,13 +17,14 @@
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-ops.h"
#include "hmx-ops.h"
// Must be multiple of 32
#define FLASH_ATTN_BLOCK_SIZE (32 * 2)
// This is a bit of a hack because the compiler is struggling to properly inline
// the default hvx_vec_f32_to_f16 with output into the local array.
static void __attribute__((noinline)) hvx_vec_f32_to_f16_a(void *ptr, HVX_Vector v0, HVX_Vector v1)
static __attribute__((noinline)) void hvx_vec_f32_to_f16_a(void *ptr, HVX_Vector v0, HVX_Vector v1)
{
*(HVX_Vector *) ptr = hvx_vec_f32_to_f16(v0, v1);
}
@@ -621,6 +622,17 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
return HTP_STATUS_NO_SUPPORT;
}
#ifdef HTP_HAS_HMX
// HMX path: prefill (neq1 >= 32), head_dim multiple of 32, F16 KV
if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 32 == 0 && q->ne[1] >= 32) {
int ret = hmx_flash_attn_ext(octx);
if (ret == HTP_STATUS_OK) {
return ret;
}
// VTCM too small or other failure -> fall through to HVX path
}
#endif
struct htp_fa_context factx;
factx.octx = octx;

View File

@@ -74,6 +74,12 @@ static inline size_t hex_smax(size_t a, size_t b) {
return a > b ? a : b;
}
static inline void hex_swap_ptr(void ** p1, void ** p2) {
void * t = *p1;
*p1 = *p2;
*p2 = t;
}
static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) {
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
Q6_l2fetch_AP((void *) p, control);

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -61,6 +61,9 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx,
int m, int k, int n,
int weight_type);
// HMX flash attention
int hmx_flash_attn_ext(struct htp_ops_context * octx);
#ifdef __cplusplus
}
#endif

View File

@@ -4,6 +4,9 @@
#ifndef HMX_UTILS_H
#define HMX_UTILS_H
#include "hvx-base.h"
#include <assert.h>
#include <hexagon_types.h>
#include <stddef.h>
@@ -12,21 +15,188 @@
#define HMX_FP16_TILE_N_ELMS 1024
#define HMX_FP16_TILE_SIZE 2048
#define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))
// Initialise aligned 256-byte area with scale vector + zero padding.
static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
HVX_Vector *pv = (HVX_Vector *)out_scales;
*pv++ = v_scale;
*pv = Q6_V_vzero();
static inline void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
volatile HVX_Vector *pv = (HVX_Vector *) out_scales;
pv[0] = v_scale;
pv[1] = Q6_V_vzero();
}
// --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---
// --- Shared scatter offsets and interleave helper ---
static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
uint8_t *p = *vtcm_ptr;
*vtcm_ptr += size;
return p;
// vscatter offsets for fused dequant+transpose: write K-values directly to [K][N] tile.
// word[i] = i*128 maps K-row-pair i to byte offset i*128.
// Column offset (n*4) is added at runtime. Entries 0..15 cover one tile (region 2047);
// entries 16..31 cover the next adjacent tile (region 4095) — pick region size at the
// call site to scatter into one tile (masked) or two contiguous tiles (unmasked).
static const int32_t hmx_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = {
0 * 128, 1 * 128, 2 * 128, 3 * 128, 4 * 128, 5 * 128, 6 * 128, 7 * 128, 8 * 128, 9 * 128, 10 * 128,
11 * 128, 12 * 128, 13 * 128, 14 * 128, 15 * 128, 16 * 128, 17 * 128, 18 * 128, 19 * 128, 20 * 128, 21 * 128,
22 * 128, 23 * 128, 24 * 128, 25 * 128, 26 * 128, 27 * 128, 28 * 128, 29 * 128, 30 * 128, 31 * 128,
};
// Scatter row-major FP16 data (in VTCM scratch) into transposed [K][N] tiles.
// vtcm_src: [n_cols][src_stride] row-major fp16 (only first k elements per row are used)
// vtcm_dst: [n_col_tiles][n_k_tiles][HMX_FP16_TILE_N_ELMS] tile-major interleaved fp16
// Processes rows [start_row, end_row) for multi-thread slicing.
// Full range: start_row=0, end_row=n_cols.
static inline void hmx_interleave_rows_to_tiles(__fp16 * restrict vtcm_dst,
const __fp16 * restrict vtcm_src,
int n_cols,
int k,
int src_stride,
int start_row,
int end_row) {
assert(k % HMX_FP16_TILE_N_COLS == 0);
const int n_k_tiles = k / HMX_FP16_TILE_N_COLS;
const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets);
const HVX_Vector v_scat_step = Q6_V_vsplat_R(4);
const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);
// Each hvx_vmemu load brings 64 fp16 = 128 bytes covering 2 adjacent K-tiles.
// When n_k_tiles is even, scatter into 2 K-tiles per call (region 4095, no mask)
// using the upper half of hmx_transpose_scatter_offsets. Tail one K-tile (when
// n_k_tiles is odd) falls back to single-tile masked scatter.
const bool pair_scatter = (n_k_tiles & 1) == 0;
const size_t pair_region = (size_t) (2 * HMX_FP16_TILE_SIZE - 1);
const size_t single_region = (size_t) (HMX_FP16_TILE_SIZE - 1);
__builtin_assume(k > 0);
__builtin_assume(end_row > start_row);
if (pair_scatter) {
// Step c by 64 fp16 (two K-tiles per scatter), advance dst by 2 tiles per iter.
const int c_step = 2 * HMX_FP16_TILE_N_COLS;
const size_t c_byte_step = (size_t) c_step * sizeof(__fp16);
const size_t dst_step = 2 * (size_t) HMX_FP16_TILE_N_ELMS;
const int n_c_iters = k / c_step;
for (int r = start_row; r < end_row; r += 2) {
const int ct = r / HMX_FP16_TILE_N_ROWS;
const int local_r = r % HMX_FP16_TILE_N_ROWS;
const bool next_row_valid = (r + 1) < end_row && (r + 1) < n_cols;
const HVX_Vector v_off0 = Q6_Vw_vadd_VwVw(v_scat_base, Q6_V_vsplat_R(local_r * 4));
const HVX_Vector v_off1 = Q6_Vw_vadd_VwVw(v_off0, v_scat_step);
__fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
if (p1) {
for (int i = 0; i < n_c_iters; ++i) {
HVX_Vector v0 = hvx_vmemu(p0);
p0 += c_byte_step;
HVX_Vector v1 = hvx_vmemu(p1);
p1 += c_byte_step;
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off0, v0);
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off1, v1);
tile_base += dst_step;
}
} else {
const HVX_Vector vzero = Q6_V_vzero();
for (int i = 0; i < n_c_iters; ++i) {
HVX_Vector v0 = hvx_vmemu(p0);
p0 += c_byte_step;
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off0, v0);
Q6_vscatter_RMVwV((size_t) tile_base, pair_region, v_off1, vzero);
tile_base += dst_step;
}
}
}
} else {
// Fallback: scatter one K-tile per call (region 2047, masked).
const int c_step = HMX_FP16_TILE_N_COLS;
const size_t c_byte_step = (size_t) c_step * sizeof(__fp16);
const size_t dst_step = (size_t) HMX_FP16_TILE_N_ELMS;
const int n_c_iters = k / c_step;
for (int r = start_row; r < end_row; r += 2) {
const int ct = r / HMX_FP16_TILE_N_ROWS;
const int local_r = r % HMX_FP16_TILE_N_ROWS;
const bool next_row_valid = (r + 1) < end_row && (r + 1) < n_cols;
const HVX_Vector v_off0 = Q6_Vw_vadd_VwVw(v_scat_base, Q6_V_vsplat_R(local_r * 4));
const HVX_Vector v_off1 = Q6_Vw_vadd_VwVw(v_off0, v_scat_step);
__fp16 * tile_base = vtcm_dst + (size_t) ct * n_k_tiles * HMX_FP16_TILE_N_ELMS;
const uint8_t * p0 = (const uint8_t *) (vtcm_src + r * src_stride);
const uint8_t * p1 = next_row_valid ? (const uint8_t *) (vtcm_src + (r + 1) * src_stride) : NULL;
if (p1) {
for (int i = 0; i < n_c_iters; ++i) {
HVX_Vector v0 = hvx_vmemu(p0);
p0 += c_byte_step;
HVX_Vector v1 = hvx_vmemu(p1);
p1 += c_byte_step;
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off0, v0);
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off1, v1);
tile_base += dst_step;
}
} else {
const HVX_Vector vzero = Q6_V_vzero();
for (int i = 0; i < n_c_iters; ++i) {
HVX_Vector v0 = hvx_vmemu(p0);
p0 += c_byte_step;
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off0, v0);
Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, single_region, v_off1, vzero);
tile_base += dst_step;
}
}
}
}
}
// Interleave row-major FP16 data into column-major tile format.
// Input: [n_rows, head_dim] row-major. Output: tile[dim_tile][row_tile].
// Processes rows [start_row, end_row) for multi-thread slicing.
// Full range: start_row=0, end_row=n_rows.
static inline void hmx_interleave_cols_to_tiles(__fp16 * restrict tiles_out,
const __fp16 * restrict src,
int n_rows,
int head_dim,
int src_stride,
int n_row_tiles,
int start_row,
int end_row) {
__builtin_assume(head_dim > 0);
const size_t tile_stride_elms = (size_t) n_row_tiles * HMX_FP16_TILE_N_ELMS;
for (int r = start_row; r < end_row; r += 2) {
const bool next_row_valid = (r + 1) < end_row && (r + 1) < n_rows;
const HVX_Vector * pv_in0 = (const HVX_Vector *) (src + r * src_stride);
const HVX_Vector * pv_in1 = next_row_valid ? (const HVX_Vector *) (src + (r + 1) * src_stride) : NULL;
// Row-pair invariants hoisted out of the c loop.
const int r0 = r / HMX_FP16_TILE_N_ROWS;
const int r1_half = (r % HMX_FP16_TILE_N_ROWS) / 2;
// tb0 starts at tile (c0=0, r0); tb1 at the adjacent dim-tile (c0=1, r0).
// Each c step (+= 64) advances both by 2 dim-tiles worth of fp16.
__fp16 * tb0 = tiles_out + (size_t) r0 * HMX_FP16_TILE_N_ELMS;
__fp16 * tb1 = tb0 + tile_stride_elms;
const size_t tb_step = 2 * tile_stride_elms;
if (pv_in1) {
for (int c = 0; c < head_dim; c += 64) {
HVX_Vector v0 = *pv_in0++;
HVX_Vector v1 = *pv_in1++;
HVX_VectorPair vp = Q6_W_vshuff_VVR(v1, v0, -2);
((HVX_Vector *) tb0)[r1_half] = Q6_V_lo_W(vp);
((HVX_Vector *) tb1)[r1_half] = Q6_V_hi_W(vp);
tb0 += tb_step;
tb1 += tb_step;
}
} else {
const HVX_Vector vzero = Q6_V_vzero();
for (int c = 0; c < head_dim; c += 64) {
HVX_Vector v0 = *pv_in0++;
HVX_VectorPair vp = Q6_W_vshuff_VVR(vzero, v0, -2);
((HVX_Vector *) tb0)[r1_half] = Q6_V_lo_W(vp);
((HVX_Vector *) tb1)[r1_half] = Q6_V_hi_W(vp);
tb0 += tb_step;
tb1 += tb_step;
}
}
}
}
#endif // HMX_UTILS_H

View File

@@ -77,6 +77,12 @@ static inline int32_t hvx_vec_get_i32(HVX_Vector v) {
return x;
}
static inline _Float16 hvx_vec_get_f16(HVX_Vector v) {
_Float16 __attribute__((aligned(128))) x;
hvx_vec_store_a(&x, 2, v);
return x;
}
static inline HVX_Vector hvx_vec_abs_f16(HVX_Vector v) {
// abs by clearing the fp16 sign bit
HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff);

View File

@@ -7,7 +7,8 @@
#include "hvx-base.h"
#define hvx_splat_loop_body(dst_type, vec_store) \
#define hvx_splat_pragma(x) _Pragma(#x)
#define hvx_splat_loop_body(dst_type, vec_store, unroll_cnt) \
do { \
dst_type * restrict vdst = (dst_type *) dst; \
\
@@ -16,7 +17,7 @@
\
uint32_t i = 0; \
\
_Pragma("unroll(4)") \
hvx_splat_pragma(unroll(unroll_cnt)) \
for (; i < nvec; i++) { \
vdst[i] = src; \
} \
@@ -25,31 +26,47 @@
} \
} while(0)
static inline void hvx_splat_a(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
static inline void hvx_splat_a(void * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
assert((unsigned long) dst % 128 == 0);
hvx_splat_loop_body(HVX_Vector, hvx_vec_store_a);
hvx_splat_loop_body(HVX_Vector, hvx_vec_store_a, 4);
}
static inline void hvx_splat_u(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
hvx_splat_loop_body(HVX_UVector, hvx_vec_store_u);
static inline void hvx_splat_u(void * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
hvx_splat_loop_body(HVX_UVector, hvx_vec_store_u, 4);
}
static inline void hvx_splat_f32_a(uint8_t * restrict dst, float v, uint32_t n) {
static inline void hvx_splat_f32_a(void * restrict dst, float v, uint32_t n) {
hvx_splat_a(dst, hvx_vec_splat_f32(v), n, sizeof(float));
}
static inline void hvx_splat_f32_u(uint8_t * restrict dst, float v, uint32_t n) {
static inline void hvx_splat_f32_u(void * restrict dst, float v, uint32_t n) {
hvx_splat_u(dst, hvx_vec_splat_f32(v), n, sizeof(float));
}
static inline void hvx_splat_f16_a(uint8_t * restrict dst, _Float16 v, uint32_t n) {
static inline void hvx_splat_f16_a(void * restrict dst, _Float16 v, uint32_t n) {
hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16));
}
static inline void hvx_splat_f16_u(uint8_t * restrict dst, _Float16 v, uint32_t n) {
static inline void hvx_splat_f16_u(void * restrict dst, _Float16 v, uint32_t n) {
hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16));
}
static inline void hvx_splat_u16_a(void * restrict dst, uint16_t v, uint32_t n) {
hvx_splat_a(dst, Q6_Vh_vsplat_R(v), n, sizeof(uint16_t));
}
static inline void hvx_splat_u16_u(void * restrict dst, uint16_t v, uint32_t n) {
hvx_splat_u(dst, Q6_Vh_vsplat_R(v), n, sizeof(uint16_t));
}
static inline void hvx_splat_u8_a(void * restrict dst, uint8_t v, uint32_t n) {
hvx_splat_a(dst, Q6_Vb_vsplat_R(v), n, 1);
}
static inline void hvx_splat_u8_u(void * restrict dst, uint8_t v, uint32_t n) {
hvx_splat_u(dst, Q6_Vb_vsplat_R(v), n, 1);
}
#define hvx_copy_loop_body(dst_type, src_type, vec_store) \
do { \
dst_type * restrict vdst = (dst_type *) dst; \

View File

@@ -0,0 +1,16 @@
#ifndef VTCM_UTILS_H
#define VTCM_UTILS_H
#include "hex-utils.h"
#include <assert.h>
#include <stdint.h>
#include <hexagon_types.h>
static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
uint8_t *p = *vtcm_ptr;
*vtcm_ptr += size;
return p;
}
#endif // VTCM_UTILS_H

View File

@@ -55,8 +55,13 @@
uint64_t ggml_graph_next_uid(void) {
#ifdef _MSC_VER
#if defined(_WIN32)
static volatile LONG counter = 1;
return (uint64_t) InterlockedIncrement(&counter) - 1;
#else
static volatile long long counter = 1;
return (uint64_t) _InterlockedIncrement64(&counter) - 1;
#endif
#else
static uint64_t counter = 1;
return __atomic_fetch_add(&counter, 1, __ATOMIC_RELAXED);

View File

@@ -1 +1 @@
387fa29fbbf3149f06a631c7850b6c35c24b0232
94e1fd7f5bd358cc724d2a94afb520a417b46dc9

View File

@@ -683,9 +683,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_mod
LLAMA_LOG_WARN("%s: %-36s - applying manual override: %s -> %s\n",
__func__, tensor_name.c_str(), ggml_type_name(new_type), ggml_type_name(qtype));
new_type = qtype;
manual = true;
break;
}
manual = true;
break;
}
}
}