CUDA: consistent use of __restrict__ + PDL for FA (#25185)

ggml-cpu: add AVX2 optimization for nvfp4 dot product and use UE4M3 LUT (#23961)
ui: Prevent tool messages from incorrectly appending to other conversations (#25177)
2026-07-01 11:43:04 +02:00 · 2026-07-01 10:55:14 +02:00 · 2026-07-01 15:31:20 +08:00 · 2026-07-01 09:25:18 +02:00 · 2026-07-01 07:32:55 +02:00 · 2026-06-30 21:43:20 -07:00
36 changed files with 1768 additions and 65 deletions
@@ -11,6 +11,11 @@ struct common_http_url {
    std::string path;
 };

+// Format a host for a URL authority: a host containing ':' is treated as an IPv6 literal and wrapped in '[' ']' (RFC 3986); any other host is returned unchanged
+static std::string common_http_format_host(const std::string & host) {
+    return host.find(':') != std::string::npos ? "[" + host + "]" : host;
+}
+
 static common_http_url common_http_parse_url(const std::string & url) {
    common_http_url parts;
    auto scheme_end = url.find("://");
@@ -49,11 +54,28 @@ static common_http_url common_http_parse_url(const std::string & url) {
        parts.path = "/";
    }

-    auto colon_pos = parts.host.find(':');
+    // split the authority into host and optional port, a bracketed IPv6 literal keeps its inner colons (RFC 3986)
+    std::string port_str;
+    if (!parts.host.empty() && parts.host.front() == '[') {
+        auto close = parts.host.find(']');
+        if (close == std::string::npos) {
+            throw std::runtime_error("invalid IPv6 URL authority: " + parts.host);
+        }
+        auto after = parts.host.substr(close + 1);
+        if (!after.empty() && after.front() == ':') {
+            port_str = after.substr(1);
+        }
+        parts.host = parts.host.substr(1, close - 1);
+    } else {
+        auto colon_pos = parts.host.find(':');
+        if (colon_pos != std::string::npos) {
+            port_str = parts.host.substr(colon_pos + 1);
+            parts.host = parts.host.substr(0, colon_pos);
+        }
+    }

-    if (colon_pos != std::string::npos) {
-        parts.port = std::stoi(parts.host.substr(colon_pos + 1));
-        parts.host = parts.host.substr(0, colon_pos);
+    if (!port_str.empty()) {
+        parts.port = std::stoi(port_str);
    } else if (parts.scheme == "http") {
        parts.port = 80;
    } else if (parts.scheme == "https") {
@@ -83,7 +105,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
    }
 #endif

-    httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
+    httplib::Client cli(parts.scheme + "://" + common_http_format_host(parts.host) + ":" + std::to_string(parts.port));

    if (!parts.user.empty()) {
        cli.set_basic_auth(parts.user, parts.password);
@@ -95,5 +117,5 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
 }

 static std::string common_http_show_masked_url(const common_http_url & parts) {
-    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
+    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + common_http_format_host(parts.host) + parts.path; // user/password are replaced by a fixed mask; the port is not part of the output
 }
@@ -1111,11 +1111,12 @@ GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
 GGML_TABLE_END()

-// e2m1 values (doubled)
+// e2m1 values (doubled), shared by MXFP4 and NVFP4
 // ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
-GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
+GGML_TABLE_BEGIN(int8_t, kvalues_fp4, 16)
    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
 GGML_TABLE_END()
+#define kvalues_mxfp4 kvalues_fp4

 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
@@ -82,7 +82,6 @@
 #define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
 #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 // quants.c
-#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
@@ -934,7 +934,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo

 #if defined __AVX2__

-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4);
    const __m128i m4b  = _mm_set1_epi8(0x0f);
    const __m256i mone = _mm256_set1_epi16(1);

@@ -963,7 +963,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));

 #elif defined __AVX__
-    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4);
    const __m128i m4b  = _mm_set1_epi8(0x0f);

    __m256 accum = _mm256_setzero_ps();
@@ -993,14 +993,152 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
        int sumi1 = 0;
        int sumi2 = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
-            sumi1 += y[ib].qs[j +          0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
-            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >>  4];
+            sumi1 += y[ib].qs[j +          0] * kvalues_fp4[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_fp4[x[ib].qs[j] >>  4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
 }

+void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1); // only nrc == 1 is handled; bx, by and bs are unused
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_NVFP4 == 0);
+    // Computes the dot product of n NVFP4-quantized values (vx) with q8_0-quantized values (vy) and stores it in *s.
+    const block_nvfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+    // nb NVFP4 blocks; NVFP4 block ib is paired with q8_0 blocks y[2*ib] and y[2*ib + 1]. ib is shared by the SIMD loops and the scalar loop at the end.
+    const int nb = n / QK_NVFP4;
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__AVX2__)
+    // AVX2 path: one NVFP4 block (and its two q8_0 blocks) per iteration
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4); // 16-entry FP4 value table, used below as a byte-shuffle LUT
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+    const __m256i mone = _mm256_set1_epi16(1);
+
+    __m256 accum = _mm256_setzero_ps();
+    for(; ib < nb; ib++){
+        // load 32 bytes of packed 4-bit codes and the 2 x 32 int8 values of the matching q8_0 blocks
+        const __m128i q4bits_01 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 0));
+        const __m128i q4bits_23 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 16));
+
+        const __m256i q8_01 = _mm256_loadu_si256((const __m256i *)y[2*ib + 0].qs);
+        const __m256i q8_23 = _mm256_loadu_si256((const __m256i *)y[2*ib + 1].qs);
+        // translate the low and the high nibble of every byte through the FP4 table
+        const __m128i q4_01_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_01, m4b));
+        const __m128i q4_01_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_01, 4), m4b));
+        const __m128i q4_23_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_23, m4b));
+        const __m128i q4_23_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_23, 4), m4b));
+
+        // reordering: unpacklo/unpackhi pair the 64-bit halves so each 128-bit lane holds the low-nibble values followed by the high-nibble values of the same 8 bytes
+        const __m256i q4_01 = MM256_SET_M128I(_mm_unpackhi_epi64(q4_01_lo,q4_01_hi), _mm_unpacklo_epi64(q4_01_lo,q4_01_hi));
+        const __m256i q4_23 = MM256_SET_M128I(_mm_unpackhi_epi64(q4_23_lo,q4_23_hi),_mm_unpacklo_epi64(q4_23_lo,q4_23_hi));
+        // multiply FP4 values with q8 values (mul_add_epi8, defined elsewhere), then madd with 1 to widen to 8 x int32
+        const __m256i p01 = mul_add_epi8(q4_01,q8_01);
+        const __m256i p_1 = _mm256_madd_epi16(p01, mone);
+
+        const __m256i p23 = mul_add_epi8(q4_23,q8_23);
+        const __m256i p_2 = _mm256_madd_epi16(p23, mone);
+        // combined scale per sub-block: UE4M3 scale d[k] times the fp16 scale of the q8_0 block it overlaps
+        const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
+        const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
+
+        const float s0 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[0]) * dy0;
+        const float s1 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[1]) * dy0;
+        const float s2 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[2]) * dy1;
+        const float s3 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[3]) * dy1;
+        // _mm256_set_m128(hi, lo): s0/s2 scale the low 128-bit lane, s1/s3 the high lane
+        const __m256 scales01 = _mm256_set_m128(_mm_set1_ps(s1), _mm_set1_ps(s0));
+        const __m256 scales23 = _mm256_set_m128(_mm_set1_ps(s3), _mm_set1_ps(s2));
+
+        accum = _mm256_fmadd_ps(scales01, _mm256_cvtepi32_ps(p_1), accum);
+        accum = _mm256_fmadd_ps(scales23, _mm256_cvtepi32_ps(p_2), accum);
+    }
+    sumf = hsum_float_8(accum);
+
+#elif defined(__AVX__)
+    // AVX path (no AVX2): same scheme, with the integer work done on 128-bit vectors
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_fp4);
+    const __m128i m4b  = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for(; ib < nb; ib++){
+
+        const __m128i q4bits_01 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 0));
+        const __m128i q4bits_23 = _mm_loadu_si128((const __m128i *)(x[ib].qs + 16));
+        // each q8_0 block is loaded as two 16-byte halves, one per NVFP4 sub-block
+        const __m128i q8_0 = _mm_loadu_si128((const __m128i *)(y[2*ib + 0].qs + 0));
+        const __m128i q8_1 = _mm_loadu_si128((const __m128i *)(y[2*ib + 0].qs + 16));
+        const __m128i q8_2 = _mm_loadu_si128((const __m128i *)(y[2*ib + 1].qs + 0));
+        const __m128i q8_3 = _mm_loadu_si128((const __m128i *)(y[2*ib + 1].qs + 16));
+
+        const __m128i q4_01_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_01, m4b));
+        const __m128i q4_01_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_01, 4), m4b));
+        const __m128i q4_23_lo = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_23, m4b));
+        const __m128i q4_23_hi = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_23, 4), m4b));
+        // q4_k: low-nibble values followed by high-nibble values for sub-block k
+        const __m128i q4_0 = _mm_unpacklo_epi64(q4_01_lo, q4_01_hi);
+        const __m128i q4_1 = _mm_unpackhi_epi64(q4_01_lo, q4_01_hi);
+        const __m128i q4_2 = _mm_unpacklo_epi64(q4_23_lo, q4_23_hi);
+        const __m128i q4_3 = _mm_unpackhi_epi64(q4_23_lo, q4_23_hi);
+
+        const __m128i p0_i32 = mul_sum_i8_pairs(q4_0, q8_0);
+        const __m128i p1_i32 = mul_sum_i8_pairs(q4_1, q8_1);
+        const __m128i p2_i32 = mul_sum_i8_pairs(q4_2, q8_2);
+        const __m128i p3_i32 = mul_sum_i8_pairs(q4_3, q8_3);
+
+        const __m128 p0 = _mm_cvtepi32_ps(p0_i32);
+        const __m128 p1 = _mm_cvtepi32_ps(p1_i32);
+        const __m128 p2 = _mm_cvtepi32_ps(p2_i32);
+        const __m128 p3 = _mm_cvtepi32_ps(p3_i32);
+
+        const __m256 p01 = _mm256_set_m128(p1, p0);
+        const __m256 p23 = _mm256_set_m128(p3, p2);
+
+        const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
+        const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
+
+        const float s0 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[0]) * dy0;
+        const float s1 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[1]) * dy0;
+        const float s2 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[2]) * dy1;
+        const float s3 = GGML_CPU_UE4M3_TO_FP32(x[ib].d[3]) * dy1;
+
+        const __m256 scales01 = _mm256_set_m128(_mm_set1_ps(s1), _mm_set1_ps(s0));
+        const __m256 scales23 = _mm256_set_m128(_mm_set1_ps(s3), _mm_set1_ps(s2));
+
+        accum = _mm256_add_ps(accum, _mm256_mul_ps(p01, scales01));
+        accum = _mm256_add_ps(accum, _mm256_mul_ps(p23, scales23));
+    }
+    sumf = hsum_float_8(accum);
+
+#endif
+    // scalar loop: does all the work when no SIMD path is compiled in; after a SIMD loop ib == nb, so it runs zero iterations
+    for (;ib < nb; ++ib) {
+        for (int s_idx = 0; s_idx < 4; ++s_idx) {
+            const float d = GGML_CPU_UE4M3_TO_FP32(x[ib].d[s_idx]);
+            const int q8_block = s_idx / 2;
+            const int q8_off   = (s_idx % 2) * QK_NVFP4_SUB;
+            const float dy = GGML_CPU_FP16_TO_FP32(y[2*ib + q8_block].d);
+            // low nibbles pair with the first half of the sub-block's q8 values, high nibbles with the second half
+            int sumi_lo = 0, sumi_hi = 0;
+            for (int j = 0; j < QK_NVFP4_SUB/2; ++j) {
+                const uint8_t qv = x[ib].qs[s_idx*(QK_NVFP4_SUB/2) + j];
+                sumi_lo += y[2*ib + q8_block].qs[q8_off + j +               0] * kvalues_fp4[qv & 0xf];
+                sumi_hi += y[2*ib + q8_block].qs[q8_off + j + QK_NVFP4_SUB/2] * kvalues_fp4[qv >>  4];
+            }
+
+            sumf += dy * d * (sumi_lo + sumi_hi);
+        }
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -82,6 +82,9 @@ float ggml_table_f32_f16[1 << 16];
 // precomputed f32 table for e8m0 half (1 KB) (simd-mappings.h)
 float ggml_table_f32_e8m0_half[1 << 8];

+// precomputed f32 table for ue4m3 (1 KB) (simd-mappings.h)
+float ggml_table_f32_ue4m3[1 << 8];
+
 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
    int sve_cnt;
@@ -3798,6 +3801,11 @@ void ggml_cpu_init(void) {
                ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF(i);
            }

+            // initialize UE4M3 table (256 entries)
+            for (int i = 0; i < (1 << 8); ++i) {
+                ggml_table_f32_ue4m3[i] = ggml_ue4m3_to_fp32(i);
+            }
+
            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

            GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
@@ -120,6 +120,10 @@ extern float ggml_table_f32_f16[1 << 16];
 // defined in ggml-cpu.c, initialized in ggml_cpu_init()
 extern float ggml_table_f32_e8m0_half[1 << 8];

+// precomputed f32 table for ue4m3 (1 KB)
+// defined in ggml-cpu.c, initialized in ggml_cpu_init()
+extern float ggml_table_f32_ue4m3[1 << 8];
+
 // Use lookup table for E8M0 on x86 (faster than bit manipulation)
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 #define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
@@ -127,6 +131,13 @@ extern float ggml_table_f32_e8m0_half[1 << 8];
 #define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
 #endif

+// Use lookup table for UE4M3 on x86 (faster than bit manipulation)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+#define GGML_CPU_UE4M3_TO_FP32(x) ggml_table_f32_ue4m3[(uint8_t)(x)]
+#else
+#define GGML_CPU_UE4M3_TO_FP32(x) ggml_ue4m3_to_fp32(x)
+#endif
+
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
@@ -664,7 +664,10 @@ constexpr __device__ dequantize_V_t get_dequantize_V() {
 template <int ncols1>
 __launch_bounds__(FATTN_KQ_STRIDE/2, 1)
 static __global__ void flash_attn_mask_to_KV_max(
-        const half2 * __restrict__ mask, int * __restrict__ KV_max, const int ne30, const int s31, const int s33) {
+        const half2 * mask_ptr, int * KV_max_ptr, const int ne30, const int64_t s31, const int64_t s33) {
+    const half2 * GGML_CUDA_RESTRICT mask   = mask_ptr;
+    int         * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
+
    const int ne31     = gridDim.x;
    const int tid      = threadIdx.x;
    const int sequence = blockIdx.y;
@@ -1089,8 +1092,8 @@ void launch_fattn(
    // Only worth the overhead if there is at least one FATTN_KQ_STRIDE x FATTN_KQ_STRIDE square to be skipped or
    //     multiple sequences of possibly different lengths.
    if (mask && K->ne[1] % FATTN_KQ_STRIDE == 0 && (Q->ne[1] >= 1024 || Q->ne[3] > 1)) {
-        const int s31 = mask->nb[1] / sizeof(half2);
-        const int s33 = mask->nb[3] / sizeof(half2);
+        const int64_t s31 = mask->nb[1] / sizeof(half2);
+        const int64_t s33 = mask->nb[3] / sizeof(half2);

        const dim3 blocks_num_KV_max(ntiles_x, Q->ne[3], 1);
        const dim3 block_dim_KV_max(FATTN_KQ_STRIDE/2, 1, 1);
@@ -1099,8 +1102,9 @@ void launch_fattn(
        const int iter_k = K->ne[1] / FATTN_KQ_STRIDE;

        KV_max.alloc(ne_KV_max);
-        flash_attn_mask_to_KV_max<ncols1><<<blocks_num_KV_max, block_dim_KV_max, 0, main_stream>>>
-            ((const half2 *) mask->data, KV_max.ptr, iter_k, s31, s33);
+        ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num_KV_max, block_dim_KV_max, 0, main_stream);
+        ggml_cuda_kernel_launch(flash_attn_mask_to_KV_max<ncols1>, launch_params,
+            (const half2 *) mask->data, KV_max.ptr, iter_k, s31, s33);
        CUDA_CHECK(cudaGetLastError());
    }

@@ -2003,6 +2003,10 @@ DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 112,  64)
 DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128,  64)
 DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 256,  64)

+extern DECL_FATTN_MMA_F16_CASE(512, 512,  4,  2);
+extern DECL_FATTN_MMA_F16_CASE(512, 512,  8,  2);
+extern DECL_FATTN_MMA_F16_CASE(512, 512, 16,  2);
+extern DECL_FATTN_MMA_F16_CASE(512, 512, 32,  2);
 extern DECL_FATTN_MMA_F16_CASE(512, 512,  2,  4);
 extern DECL_FATTN_MMA_F16_CASE(512, 512,  4,  4);
 extern DECL_FATTN_MMA_F16_CASE(512, 512,  8,  4);
@@ -76,6 +76,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2,  64,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  64,  64)
@@ -144,6 +145,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2,  32,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  32,  64)
@@ -219,6 +221,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 512, 1, 128,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2,  64,  64)
@@ -296,6 +299,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 128,  64)

+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  2,  64, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 4,  64,  64)
@@ -1308,12 +1312,12 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
            return;
        }

-        if constexpr (DV <= 256) {
-            if (use_gqa_opt && gqa_ratio % 2 == 0) {
-                launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
-                return;
-            }
+        if (use_gqa_opt && gqa_ratio % 2 == 0) {
+            launch_fattn_tile_switch_ncols1<DKQ, DV, 2, use_logit_softcap>(ctx, dst);
+            return;
+        }

+        if constexpr (DV <= 256) {
            launch_fattn_tile_switch_ncols1<DKQ, DV, 1, use_logit_softcap>(ctx, dst);
            return;
        }
@@ -99,12 +99,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
        return;
    }

-    if constexpr (DKQ <= 256) {
-        if (use_gqa_opt && gqa_ratio > 1) {
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
-            return;
-        }
+    if (use_gqa_opt && gqa_ratio > 1) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
+        return;
+    }

+    if constexpr (DKQ <= 256) {
        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
    } else {
        GGML_ABORT("fatal error");
@@ -78,26 +78,29 @@ static __global__ void k_get_rows_float(

 template<typename grad_t, typename dst_t>
 static __global__ void k_get_rows_back_float(
-        const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
+        const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst,
+        const int64_t ncols, const int64_t nrows_grad, const int64_t nrows_dst) { // nrows_dst bounds the destination-row loop below
    const int col = blockIdx.x*blockDim.x + threadIdx.x;

    if (col >= ncols) {
        return;
    }

-    const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
-
-    float sum = 0.0f;
-
    ggml_cuda_pdl_sync();
-    for (int64_t i = 0; i < nrows_grad; ++i) {
-        if (rows[i] != dst_row) {
-            continue;
-        }
-        sum += grad[i*ncols + col];
-    }

-    dst[dst_row*ncols + col] = sum;
+    // grid.y is clamped to the CUDA grid limit, so stride over the destination rows: this block handles rows blockIdx.y, blockIdx.y + gridDim.y, ... below nrows_dst
+    for (int64_t dst_row = blockIdx.y; dst_row < nrows_dst; dst_row += gridDim.y) {
+        float sum = 0.0f;
+        // accumulate column col of every grad row i whose rows[i] equals dst_row
+        for (int64_t i = 0; i < nrows_grad; ++i) {
+            if (rows[i] != dst_row) {
+                continue;
+            }
+            sum += grad[i*ncols + col];
+        }
+        // every visited destination element is written, with 0.0f when no row index matched
+        dst[dst_row*ncols + col] = sum;
+    }
 }

 template<int qk, int qr, dequantize_kernel_t dq, typename dst_t>
@@ -302,7 +305,7 @@ void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * d

    const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
    const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne1, 1);
+    const dim3 block_nums(block_num_x, MIN(ne1, (int64_t)UINT16_MAX), 1);

-    k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
+    k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10, ne1);
 }
@@ -368,5 +368,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
        return true;
    }

+    // gfx900 (Vega 10) lacks native dp4a, so MMQ loses to dequant + hipBLAS
+    // for dense matrices; keep MMQ only for MoE, where the
+    // hipBLAS path is much slower.
+    if (cc == GGML_CUDA_CC_VEGA) {
+        return n_experts > 0;
+    }
+
    return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 16, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 32, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 4, 2);
@@ -8,3 +8,4 @@ DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
 DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
 DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
 DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
+DECL_FATTN_MMA_F16_CASE(512, 512, 8, 2);
@@ -92,7 +92,7 @@ for ncols in [8, 16, 32, 64]:
                    continue
                if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
                    continue
-                if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
+                if head_size_kq == 512 and ncols2 not in (2, 4, 8): # Gemma 4 (+ MTP)
                    continue
                if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
                    continue
@@ -78,6 +78,8 @@ set(GGML_OPENCL_KERNELS
    mul_mv_f16_f32_l4
    mul_mv_f16_f32
    mul_mv_f32_f32
+    mul_mv_q1_0_f32
+    mul_mv_q1_0_f32_flat
    mul_mv_q4_0_f32
    mul_mv_q4_0_f32_v
    mul_mv_q4_0_f32_8x_flat
@@ -128,6 +130,7 @@ set(GGML_OPENCL_KERNELS
    moe_sort_by_expert
    mul_mm_f32_f32_l4_lm
    mul_mm_f16_f32_l4_lm
+    mul_mm_q1_0_f32_l4_lm
    mul_mm_q4_0_f32_l4_lm
    mul_mm_q4_1_f32_l4_lm
    mul_mm_q5_0_f32_l4_lm
@@ -137,6 +140,8 @@ set(GGML_OPENCL_KERNELS
    mul_mm_q4_k_f32_l4_lm
    mul_mm_q5_k_f32_l4_lm
    mul_mm_q6_k_f32_l4_lm
+    gemv_noshuffle_q1_0_f32
+    gemm_noshuffle_q1_0_f32
    gemv_noshuffle_q4_0_f32
    gemv_noshuffle_q4_0_f32_spec
    gemm_noshuffle_q4_0_f32
@@ -631,6 +631,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mm_f16_f32_kqv;
    cl_kernel kernel_mul_mm_f16_f32_kq;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
+    cl_kernel kernel_convert_block_q1_0, kernel_restore_block_q1_0;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
    cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns;
    cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
@@ -670,6 +671,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_convert_block_iq4_nl, kernel_restore_block_iq4_nl;
    cl_kernel kernel_convert_block_iq4_nl_noshuffle;
    cl_kernel kernel_restore_block_iq4_nl_noshuffle;
+    cl_kernel kernel_mul_mv_q1_0_f32, kernel_mul_mv_q1_0_f32_flat;
    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
    cl_kernel kernel_mul_mv_q4_1_f32;
    cl_kernel kernel_mul_mv_q4_1_f32_flat;
@@ -733,6 +735,7 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
    cl_kernel kernel_mul_mm_f32_f32_l4_lm;
    cl_kernel kernel_mul_mm_f16_f32_l4_lm;
+    cl_kernel kernel_mul_mm_q1_0_f32_l4_lm;
    cl_kernel kernel_mul_mm_q4_0_f32_l4_lm;
    cl_kernel kernel_mul_mm_q4_1_f32_l4_lm;
    cl_kernel kernel_mul_mm_q5_0_f32_l4_lm;
@@ -890,6 +893,8 @@ struct ggml_backend_opencl_context {
    cl_kernel kernel_gemm_noshuffle_q4_1_f32;
    cl_kernel kernel_gemm_noshuffle_q8_0_f32;
    cl_kernel kernel_gemv_noshuffle_q8_0_f32;
+    cl_kernel kernel_gemm_noshuffle_q1_0_f32;
+    cl_kernel kernel_gemv_noshuffle_q1_0_f32;
    cl_kernel kernel_gemv_noshuffle_q4_k_f32;
    cl_kernel kernel_gemm_noshuffle_q4_k_f32;
    cl_kernel kernel_gemv_noshuffle_q6_K_f32;
@@ -1151,6 +1156,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        backend_ctx->program_cvt =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

+        CL_CHECK((backend_ctx->kernel_convert_block_q1_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q1_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q1_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q1_0", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
@@ -1685,6 +1692,40 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        GGML_LOG_CONT(".");
    }

+    // mul_mv_q1_0_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q1_0_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q1_0_f32.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q1_0_f32 = clCreateKernel(prog, "kernel_mul_mv_q1_0_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // mul_mv_q1_0_f32_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_q1_0_f32_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_q1_0_f32_flat.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_q1_0_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q1_0_f32_flat", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
    // mul_mv_iq4_nl_f32
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -1990,6 +2031,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        GGML_LOG_CONT(".");
    }

+    // mul_mm_q1_0_f32_l4_lm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mm_q1_0_f32_l4_lm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mm_q1_0_f32_l4_lm.cl");
+#endif
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mm_q1_0_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q1_0_f32_l4_lm", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
    // mul_mm_iq4_nl_f32_l4_lm
    {
 #ifdef GGML_OPENCL_EMBED_KERNELS
@@ -2939,6 +2997,44 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
        GGML_LOG_CONT(".");
    }

+    // gemm_noshuffle_q1_0_f32
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "gemm_noshuffle_q1_0_f32.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("gemm_noshuffle_q1_0_f32.cl");
+#endif
+        cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q1_0_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q1_0_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_q1_0_f32
+    {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable "
+                                       " -DSIMDGROUP_WIDTH=" +
+                                       std::to_string(backend_ctx->adreno_wave_size);
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src_CL_gemv_general {
+            #include "gemv_noshuffle_q1_0_f32.cl.h"
+        };
+#else
+        const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_q1_0_f32.cl");
+#endif
+
+        cl_program prog = build_program_from_source(
+            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q1_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q1_0_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
    // gemv_noshuffle_general
    {
        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
@@ -4829,6 +4925,39 @@ struct ggml_tensor_extra_cl {
    }
 };

+struct ggml_tensor_extra_cl_q1_0 { // OpenCL-side extra data kept for a Q1_0 tensor
+    cl_mem q = nullptr;
+    cl_mem q_img = nullptr;
+    // NOTE(review): q / d presumably hold the quantized bits and the block scales, as in the other quantized extras - confirm
+    cl_mem d = nullptr;
+    cl_mem d_img = nullptr;
+    // sizes that go with q and d; reset() zeroes them - NOTE(review): presumably byte sizes, confirm
+    size_t size_q = 0;
+    size_t size_d = 0;
+    // destruction releases the sub-buffers through reset()
+    ~ggml_tensor_extra_cl_q1_0() {
+        reset();
+    }
+    // Release q and d if set and return every field to its initial (null / zero) value.
+    void reset() {
+        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
+        // They must be properly released so that the original buffer can be
+        // properly released to avoid memory leak.
+        if (q != nullptr) {
+            CL_CHECK(clReleaseMemObject(q));
+            q = nullptr;
+        }
+        if (d != nullptr) {
+            CL_CHECK(clReleaseMemObject(d));
+            d = nullptr;
+        }
+        q_img = nullptr; // q_img / d_img are only cleared, not released here - NOTE(review): presumably owned elsewhere, confirm
+        d_img = nullptr;
+        size_q = 0;
+        size_d = 0;
+    }
+};
+
 // Additional tensor extra structs for quantized tensors.
 // These tensors are loaded from files and should not be allocated in scratch --
 // they should always be allocated from the pool. Hence, they do not have an
@@ -5732,6 +5861,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                return true;
            } else if (op->src[0]->type == GGML_TYPE_F32) {
                return op->src[1]->type == GGML_TYPE_F32;
+            } else if (op->src[0]->type == GGML_TYPE_Q1_0) {
+                return op->src[1]->type == GGML_TYPE_F32;
            } else if (op->src[0]->type == GGML_TYPE_Q4_0) {
                // Non-contig src0 routes through on-device dequant-to-f16.
                return op->src[1]->type == GGML_TYPE_F32;
@@ -5988,6 +6119,12 @@ struct ggml_backend_opencl_buffer_context {
        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
            delete e;
        }
+        for (ggml_tensor_extra_cl_q1_0 * e : temp_tensor_extras_q1_0) {
+            delete e;
+        }
+        for (ggml_tensor_extra_cl_q1_0 * e : temp_tensor_extras_q1_0_in_use) {
+            delete e;
+        }
        for (ggml_tensor_extra_cl_iq4_nl * e : temp_tensor_extras_iq4_nl) {
            delete e;
        }
@@ -6029,6 +6166,21 @@ struct ggml_backend_opencl_buffer_context {
        return extra;
    }

+    // Hand out a cleared Q1_0 extra. An entry from the free list is reused when
+    // one is available, otherwise a new one is allocated. The returned pointer is
+    // recorded in the in-use list before it is reset.
+    ggml_tensor_extra_cl_q1_0 * ggml_opencl_alloc_temp_tensor_extra_q1_0() {
+        const bool pool_is_empty = temp_tensor_extras_q1_0.empty();
+
+        ggml_tensor_extra_cl_q1_0 * const slot =
+            pool_is_empty ? new ggml_tensor_extra_cl_q1_0() : temp_tensor_extras_q1_0.back();
+        if (!pool_is_empty) {
+            // The reused entry leaves the free list.
+            temp_tensor_extras_q1_0.pop_back();
+        }
+
+        temp_tensor_extras_q1_0_in_use.push_back(slot);
+        slot->reset();
+
+        return slot;
+    }
+
    ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() {
        ggml_tensor_extra_cl_q4_0 * extra;
        if (temp_tensor_extras_q4_0.empty()) {
@@ -6185,6 +6337,11 @@ struct ggml_backend_opencl_buffer_context {
        }
        temp_tensor_extras_in_use.clear();

+        for (ggml_tensor_extra_cl_q1_0 * e : temp_tensor_extras_q1_0_in_use) {
+            temp_tensor_extras_q1_0.push_back(e);
+        }
+        temp_tensor_extras_q1_0_in_use.clear();
+
        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
            temp_tensor_extras_q4_0.push_back(e);
        }
@@ -6246,6 +6403,8 @@ struct ggml_backend_opencl_buffer_context {
    // for reuse.
    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras;
    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
+    std::vector<ggml_tensor_extra_cl_q1_0 *> temp_tensor_extras_q1_0;
+    std::vector<ggml_tensor_extra_cl_q1_0 *> temp_tensor_extras_q1_0_in_use;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
    std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1;
@@ -6353,6 +6512,82 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
    cl_command_queue queue = backend_ctx->queue;

 #ifdef GGML_OPENCL_SOA_Q
+    if (tensor->type == GGML_TYPE_Q1_0) {
+        // Q1_0 upload. The host data is an array of block_q1_0 (scale + quant bytes
+        // interleaved). It is staged in a temporary device buffer and then split by
+        // kernel_convert_block_q1_0 into two sub-buffers of the tensor's own storage:
+        // all scales first, then all quants.
+        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
+        GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
+
+        // Allocate the new extra and create aliases from the original.
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        ggml_tensor_extra_cl_q1_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q1_0();
+
+        // q1_0 block = ggml_half d + (QK1_0/8) quant bytes = 2 + 16 = 18 bytes
+        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)/8);
+        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
+
+        // Stage the raw (interleaved) blocks on the device; the write is blocking.
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
+            ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+        CL_CHECK(clEnqueueWriteBuffer(
+            queue, data_device, CL_TRUE, 0,
+            ggml_nbytes(tensor), data, 0, NULL, NULL));
+
+        // The original tensor memory is divided into scales and quants, i.e.,
+        // we first store scales, then quants.
+        cl_buffer_region region;
+
+        // Create subbuffer for scales.
+        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
+        region.size = size_d;
+        extra->d = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        auto previous_origin = region.origin;
+
+        // Create subbuffer for quants.
+        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
+        region.size = size_q;
+        extra->q = clCreateSubBuffer(
+            extra_orig->data_device, CL_MEM_READ_WRITE,
+            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_convert_block_q1_0;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
+
+        // One work-item per block.
+        // NOTE(review): the global size is not rounded up to a multiple of the
+        // local size (64); presumably the block count is always a multiple of 64
+        // for supported tensors -- confirm.
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        // The event returned by the enqueue belongs to the caller. Release it once
+        // the wait has completed so that an event object is not leaked per upload.
+        CL_CHECK(clReleaseEvent(evt));
+        CL_CHECK(clReleaseMemObject(data_device));
+
+        // From here on the tensor is described by the split (SoA) extra.
+        tensor->extra = extra;
+
+        // q is uint32 (32 sign bits each); d is one half per 128-block.
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+            int M = tensor->ne[1];   // ne01
+            int K = tensor->ne[0];   // ne00
+
+            GGML_ASSERT(K % 128 == 0);
+            GGML_ASSERT(M % 4 == 0);
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
+
+            // Transpose both arrays in place (source and destination are the same buffer).
+            transpose_2d_as_32b(backend_ctx, extra->q, extra->q, size_q, K/32,  M);
+            transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/128, M);
+        } // end transpose
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
+        return;
+    }
    // We separate the quantized bits and scale from block_q4_0 by using an
    // additional kernel, where each thread handles a block. We first read the
    // original weights into a temporary buffer, then create two separate
@@ -7743,6 +7978,63 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
    sync_with_other_backends(backend_ctx);

 #ifdef GGML_OPENCL_SOA_Q
+    if (tensor->type == GGML_TYPE_Q1_0) {
+        // Q1_0 download. The device keeps quants and scales in separate arrays, so
+        // kernel_restore_block_q1_0 first rebuilds the interleaved block_q1_0
+        // layout in a temporary buffer, and the requested byte range is read from it.
+        ggml_tensor_extra_cl_q1_0 * extra = (ggml_tensor_extra_cl_q1_0 *)tensor->extra;
+
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+            // The arrays were transposed at upload time; transpose them back into
+            // scratch buffers before restoring the blocks.
+            ggml_cl_buffer buf_trans_q;
+            ggml_cl_buffer buf_trans_d;
+            ggml_cl_buffer buf_unpacked;
+
+            int M = tensor->ne[1];
+            int K = tensor->ne[0];
+
+            size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
+            size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)/8);
+
+            buf_trans_q.allocate(backend_ctx->context, size_q);
+            buf_trans_d.allocate(backend_ctx->context, size_d);
+            buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor));
+
+            transpose_2d_as_32b(backend_ctx, extra->q, buf_trans_q.buffer, size_q, M, K/32);
+            transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/128);
+
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q1_0;
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d.buffer));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_unpacked.buffer));
+
+            // One work-item per block.
+            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+            size_t local_work_size[] = {1, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+            // Release the caller-owned event so it is not leaked on every read.
+            CL_CHECK(clReleaseEvent(evt));
+            CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL));
+            return;
+        }
+#endif
+
+        // Temporary buffer that receives the restored (interleaved) blocks.
+        cl_int err;
+        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, ggml_nbytes(tensor), NULL, &err);
+        CL_CHECK(err);
+
+        cl_kernel kernel = backend_ctx->kernel_restore_block_q1_0;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+
+        // One work-item per block.
+        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
+        size_t local_work_size[] = {1, 1, 1};
+
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        CL_CHECK(clWaitForEvents(1, &evt));
+        // Release the caller-owned event so it is not leaked on every read.
+        CL_CHECK(clReleaseEvent(evt));
+        CL_CHECK(clEnqueueReadBuffer(queue, data_device, CL_TRUE, offset, size, data, 0, NULL, NULL));
+        CL_CHECK(clReleaseMemObject(data_device));
+        return;
+    }
    // In end-to-end runs, get_tensor is usually used to get back the logits,
    // where we can simply do clEnqueueReadBuffer since they are f32.
    // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
@@ -13437,6 +13729,203 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten
    CL_CHECK(clReleaseMemObject(D_sub_buffer));
 }

+static void ggml_cl_mul_mat_q1_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // Q1_0 (src0) x F32 (src1) -> dst via the "noshuffle" gemv/gemm kernels; the body is empty unless GGML_OPENCL_USE_ADRENO_KERNELS is defined
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_Q1_0);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+    ggml_tensor_extra_cl_q1_0 * extra0_q1_0 = (ggml_tensor_extra_cl_q1_0 *)src0->extra; // split extra: q = quants, d = scales
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    GGML_ASSERT(src1->view_offs == 0); // views with a byte offset are rejected
+    GGML_ASSERT(dst->view_offs == 0);  // so offset1/offsetd equal the extras' own offsets
+
+    const int  ne00 = src0->ne[0];
+    const int  ne01 = src0->ne[1];
+    const int  ne02 = src0->ne[2];
+
+    const int  ne10 = src1->ne[0];
+    const int  ne12 = src1->ne[2];
+
+    const int  ne0 = dst->ne[0];
+    const int  ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 == ne10);         // both operands share the reduction dimension
+    GGML_ASSERT((ne00 % 128) == 0);    // a row must be a whole number of 128-weight blocks
+    GGML_ASSERT(ne0 == ne01);          // one output row per src0 row
+
+    cl_context context = backend_ctx->context;
+    cl_kernel kernel;
+
+    cl_int              err;
+    cl_image_format     img_fmt;
+    cl_image_desc       img_desc;
+    cl_buffer_region    region;
+
+    int M = ne01; // rows of src0 (and of dst)
+    int N = ne1;  // columns of dst
+    int K = ne00; // reduction dimension
+
+    if (ne1 == 1) { // single output column: use the gemv kernel
+        cl_mem q_img = nullptr;
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_img = nullptr;
+
+        // image for q (uint32: each texel packs 32 sign bits)
+        img_fmt = { CL_R, CL_UNSIGNED_INT32};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = M * K / 32; // one texel per 32 weights
+        img_desc.buffer = extra0_q1_0->q;
+        CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // create a sub_buffer for B
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4; // four floats per RGBA texel
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        kernel = backend_ctx->kernel_gemv_noshuffle_q1_0_f32;
+
+        int r2 = 1;
+        int r3 = 1;
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &q_img));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q1_0->d));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &b_img));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &extra1->offset)); // NOTE(review): b_img already starts at offset1 (sub-buffer origin) -- confirm how the kernel uses this offset
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &extrad->offset));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
+
+        size_t wavesize = backend_ctx->adreno_wave_size;
+        size_t local_work_size[]  = { wavesize, 4, 1 };
+        size_t global_work_size[] = { CEIL_DIV(M, wavesize)*wavesize, 4, 1 }; // dimension 0 is M rounded up to a whole number of waves
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(q_img));
+        CL_CHECK(clReleaseMemObject(b_img));
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+    } else { // otherwise: transpose/convert the activations first, then run the gemm kernel
+        cl_mem b_sub_buf = nullptr;
+        cl_mem b_sub_buf_trans = nullptr;
+        cl_mem b_img = nullptr;
+        cl_mem b_img_trans = nullptr;
+
+        // subbuffer for activations
+        region.origin = offset1;
+        region.size = K * N * sizeof(float);
+        CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for activations
+        img_fmt = {CL_RGBA, CL_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * N / 4; // four floats per RGBA texel
+        img_desc.buffer = b_sub_buf;
+        CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
+
+        // pad N to multiple of 8
+        int extra_elements = N % 8;
+        int padding = 0;
+        if (extra_elements > 0){
+            padding = 8 - extra_elements;
+        }
+
+        // subbuffer for transposed activations
+        region.origin = 0;
+        region.size = K * (N + padding) * sizeof(float)/2; // 2 bytes per value: the transposed copy is read as half floats below
+        backend_ctx->prealloc_act_trans.allocate(context, region.size);
+        CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &err), err));
+
+        // image for transposed activations
+        img_fmt = {CL_RGBA, CL_HALF_FLOAT};
+        memset(&img_desc, 0, sizeof(img_desc));
+        img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        img_desc.image_width = K * (N + padding) / 4; // four halves per RGBA texel
+        img_desc.buffer = b_sub_buf_trans;
+        CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err));
+
+        // transpose activations
+        int height_B = N/4;
+        if (height_B == 0) {
+            height_B = 1; // N < 4 still needs one row of texels
+        }
+        int width_B = K/4;
+        int padded_height_B = (N + padding)/4;
+
+        kernel = backend_ctx->kernel_transpose_32_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));
+
+        size_t local_work_size_t[2] = { 1, 16 };
+        size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B };
+        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst);
+
+        // gemm
+        kernel = backend_ctx->kernel_gemm_noshuffle_q1_0_f32;
+        int padded_N = N + padding;
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q1_0->q));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q1_0->d));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &b_img_trans));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &K));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &M));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &padded_N));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &N));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &offsetd));
+
+        size_t global_work_size[] = { (size_t)CEIL_DIV(N, 8), (size_t)CEIL_DIV(M, 4), 1 }; // one work-item per 8 (N) x 4 (M) output tile
+        size_t local_work_size[]  = { 2, 128, 1 };
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+        CL_CHECK(clReleaseMemObject(b_img_trans));
+        CL_CHECK(clReleaseMemObject(b_sub_buf_trans));
+        CL_CHECK(clReleaseMemObject(b_img));
+        CL_CHECK(clReleaseMemObject(b_sub_buf));
+    }
+#else
+    GGML_UNUSED(backend);
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+#endif
+}
+
 static void ggml_cl_mul_mat_q4_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    GGML_ASSERT(src0);
@@ -15311,6 +15800,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
    // view->extra stays pre-SoA; cast to the SoA struct would SIGSEGV.
    // Follow view_src to reach the real SoA extra (this applies to every SoA cast below, q1_0 included).
    const ggml_tensor * soa0_src = src0->view_src != nullptr ? src0->view_src : src0;
+    ggml_tensor_extra_cl_q1_0 * extra0_q1_0 = (ggml_tensor_extra_cl_q1_0 *)soa0_src->extra;
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)soa0_src->extra;
    ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)soa0_src->extra;
    ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)soa0_src->extra;
@@ -15374,6 +15864,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
        // a limit check, but q4_0 / q4_1 tensors are very unlikely to exceed that
        // limit, so the check is omitted.

+        // q1_0 x fp32: taken only when enable_adreno_trans_weight() accepts src0; otherwise execution continues below
+        if (src0t == GGML_TYPE_Q1_0 && src1t == GGML_TYPE_F32 &&
+            enable_adreno_trans_weight(backend_ctx, src0)) {
+                ggml_cl_mul_mat_q1_0_f32_adreno(backend, src0, src1, dst);
+                return;
+        }
+
        // q4_0 x fp32
        if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
            ggml_cl_mul_mat_q4_0_f32_adreno(backend, src0, src1, dst);
@@ -15577,6 +16074,48 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
                return;
            }
+            case GGML_TYPE_Q1_0: { // tiled mat-mat kernel for Q1_0 x F32
+                if (ne11 < 32) { // fewer than 32 src1 rows: do not use this kernel
+                    break;
+                }
+                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) { // both operands must be contiguous for this kernel
+                    break;
+                }
+
+                kernel = backend_ctx->kernel_mul_mm_q1_0_f32_l4_lm;
+                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
+
+                int batch_stride_a = ne00*ne01; // elements per src0 matrix
+                int batch_stride_b = ne10*ne11; // elements per src1 matrix
+                int batch_stride_d = ne0*ne1;   // elements per dst matrix
+
+                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q1_0->q));
+                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q1_0->d));
+                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a (passed as ne10; presumably equal to ne00 here -- confirm)
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
+                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+
+                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
+                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
+                size_t local_work_size[] = {(size_t)nth0, 1, 1};
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+                return;
+            }
            case GGML_TYPE_Q4_0: {
                if (ne11 < 32) {
                    break;
@@ -16165,6 +16704,81 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
            break;
+        case GGML_TYPE_Q1_0: { // mat-vec kernel setup for Q1_0 x F32; the kernel variant depends on GGML_OPENCL_SOA_Q
+#ifdef GGML_OPENCL_SOA_Q
+            kernel = backend_ctx->kernel_mul_mv_q1_0_f32_flat; // variant that takes the split q / d buffers
+
+            // nth0 - subgroup size
+            // nth1 - number of subgroups per workgroup
+            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
+            if (backend_ctx->gpu_family == INTEL) {
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU"); // only the INTEL and ADRENO families are configured
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q1_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q1_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+#else
+            kernel = backend_ctx->kernel_mul_mv_q1_0_f32; // variant that takes the original buffer plus a byte offset
+
+            if (backend_ctx->gpu_family == INTEL) { // same launch parameters as the SOA branch above
+                nth0 = 16;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                nth0 = 64;
+                nth1 = 2;
+                ndst = nth1*4;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU"); // only the INTEL and ADRENO families are configured
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+#endif // GGML_OPENCL_SOA_Q
+            break; // the launch itself happens after the switch (not shown in this hunk)
+        }
        case GGML_TYPE_Q4_0:
            // This should have been satisfied.
            GGML_ASSERT(ne11 == ne1);
@@ -16879,6 +17493,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
        src0t == GGML_TYPE_Q5_0 ||
        src0t == GGML_TYPE_Q5_1 ||
        src0t == GGML_TYPE_Q8_0 ||
+        src0t == GGML_TYPE_Q1_0 ||
        src0t == GGML_TYPE_IQ4_NL ||
        src0t == GGML_TYPE_Q2_K) {
        // Each SIMD group produces N_DST values in the result. Assuming each
@@ -27,6 +27,8 @@
 #define QR5_1                   2
 #define QK8_0                   32
 #define QR8_0                   1
+#define QK1_0                   128
+#define QR1_0                   1
 #define QK_K                    256
 #define K_SCALE_SIZE            (3 * QK_K / 64)
 #define K_QUANTS_PER_ITERATION  2
@@ -38,6 +40,14 @@ typedef ushort uint16_t;
 typedef int int32_t;
 typedef uint uint32_t;

+//------------------------------------------------------------------------------
+// block_q1_0
+//------------------------------------------------------------------------------
+typedef struct {
+    half d;             // delta: one fp16 scale shared by the QK1_0 weights of the block
+    uchar qs[QK1_0/8];  // one bit per weight, 8 weights per byte (QK1_0/8 = 16 bytes)
+} block_q1_0;
+
 //------------------------------------------------------------------------------
 // block_q4_0
 //------------------------------------------------------------------------------
@@ -159,6 +169,42 @@ kernel void kernel_convert_f16_to_bf16(
    }
 }

+//------------------------------------------------------------------------------
+// kernel_convert_block_q1_0
+// Convert block_q1_0 (AOS) to 2 separate arrays (SOA): quant bytes + scales.
+// q1_0 bits are stored in natural order (bit j of byte i -> weight 8*i + j)
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q1_0(
+    global block_q1_0 * src0,
+    global uchar * dst_q,
+    global half  * dst_d
+) {
+    // One work-item per block: the global id selects the source block and the
+    // matching slots in both output arrays.
+    const size_t blk = get_global_id(0);
+
+    global block_q1_0 * in    = src0  + blk;
+    global uchar      * out_q = dst_q + (QK1_0/8)*blk;
+
+    // Scale goes to the scale array ...
+    dst_d[blk] = in->d;
+
+    // ... and the quant bytes are copied unchanged, keeping their order.
+    for (int byte = 0; byte < QK1_0/8; ++byte) {
+        out_q[byte] = in->qs[byte];
+    }
+}
+
+// Inverse of kernel_convert_block_q1_0: rebuild block_q1_0 (AOS) from the
+// separate quant-byte and scale arrays (SOA). One work-item per block.
+kernel void kernel_restore_block_q1_0(
+    global uchar * src_q,
+    global half  * src_d,
+    global block_q1_0 * dst
+) {
+    const size_t blk = get_global_id(0);
+
+    global block_q1_0 * out  = dst   + blk;
+    global uchar      * in_q = src_q + (QK1_0/8)*blk;
+
+    // Scale first, then the quant bytes in their original order.
+    out->d = src_d[blk];
+    for (int byte = 0; byte < QK1_0/8; ++byte) {
+        out->qs[byte] = in_q[byte];
+    }
+}
+
 //------------------------------------------------------------------------------
 // kernel_convert_block_q4_0
 // Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
@@ -0,0 +1,94 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+// Q1_0 x f32 GEMM: each work-item computes a 4 (rows of A / m) x 8 (cols of B / n) output tile.
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_128
+#endif
+kernel void kernel_gemm_noshuffle_q1_0_f32(
+        global const uint * src0_q, // packed sign bits, 32 weights per uint
+        global const half  * src0_d, // scales, one half per row per 128 weights
+        read_only image1d_buffer_t src1, // activations, fetched as half4 texels
+        global float * dst, // f32 output, consecutive output columns are m floats apart
+        int k, // reduction length, walked in steps of 32
+        int m, // rows of A; element stride used for src0_q, src0_d and dst
+        int n, // cols of B; n/4 texels of src1 per K index
+        int n_no_padding, // stores are limited to element indices below m*n_no_padding
+        ulong offsetd // byte offset applied to dst
+) {
+    int n_4 = n >> 2; // src1 texels per K index
+
+    int gy   = get_global_id(0); // selects the group of 8 output columns
+    int gx   = get_global_id(1); // selects the group of 4 output rows
+    int gx_2 = gx << 2; // index of the first of those 4 rows
+    dst  = (global float *)((global char*)dst  + offsetd);
+
+    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // one accumulator per row, 8 columns each, kept in half precision
+    half8 B;
+
+    global const uint* wptr = src0_q + gx_2;
+    global const half* sptr = src0_d + gx_2;
+
+    // 32 weights per uint32, 128 weights (one block / one scale) per 4 uint32.
+    for (int i = 0; i < k; i += 32) {
+        uint4 pack4 = vload4(0, wptr + (i / 32)  * m); // 4 rows, 32 K-values each
+        half4 scale = vload4(0, sptr + (i / 128) * m); // 4 rows, one scale per 128
+
+        for (int j = 0; j < 32; ++j) {
+            B.s0123 = read_imageh(src1, gy * 2 + (i + j) * n_4); // tile columns 0..3 at K index i+j
+            B.s4567 = read_imageh(src1, gy * 2 + (i + j) * n_4 + 1); // tile columns 4..7 at K index i+j
+
+            // sign bit -> +-1 (half arithmetic avoids unsigned underflow)
+            half4 wj = (half4)(
+                2.0h * (half)((pack4.s0 >> j) & 1u) - 1.0h,
+                2.0h * (half)((pack4.s1 >> j) & 1u) - 1.0h,
+                2.0h * (half)((pack4.s2 >> j) & 1u) - 1.0h,
+                2.0h * (half)((pack4.s3 >> j) & 1u) - 1.0h) * scale;
+
+            c0 += B * wj.s0;
+            c1 += B * wj.s1;
+            c2 += B * wj.s2;
+            c3 += B * wj.s3;
+        }
+    }
+
+    int idx = (gy << 3) * m + (gx << 2); // element index of tile column 0, row 0
+
+    if(idx+3 < m*n_no_padding){ // tile column 0: 4 consecutive rows; each later store moves one column (m floats) on
+        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
+    }
+}
@@ -0,0 +1,121 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif
+
+#define QK1_0 128
+#define N_SIMDGROUP 4
+// Adds 32 terms (2*bit - 1) * scale * activation to total; bit 8*r + c of `bits` pairs with regB.s<c> broadcast from sub-group lane lb + r.
+#define dequantizeBlockAccum_q1(total, bits, scale, regB, lb)                                       \
+    total += (2.0f*(float)((bits >>  0) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s0, lb+0); \
+    total += (2.0f*(float)((bits >>  1) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s1, lb+0); \
+    total += (2.0f*(float)((bits >>  2) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s2, lb+0); \
+    total += (2.0f*(float)((bits >>  3) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s3, lb+0); \
+    total += (2.0f*(float)((bits >>  4) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s4, lb+0); \
+    total += (2.0f*(float)((bits >>  5) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s5, lb+0); \
+    total += (2.0f*(float)((bits >>  6) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s6, lb+0); \
+    total += (2.0f*(float)((bits >>  7) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s7, lb+0); \
+    total += (2.0f*(float)((bits >>  8) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s0, lb+1); \
+    total += (2.0f*(float)((bits >>  9) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s1, lb+1); \
+    total += (2.0f*(float)((bits >> 10) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s2, lb+1); \
+    total += (2.0f*(float)((bits >> 11) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s3, lb+1); \
+    total += (2.0f*(float)((bits >> 12) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s4, lb+1); \
+    total += (2.0f*(float)((bits >> 13) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s5, lb+1); \
+    total += (2.0f*(float)((bits >> 14) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s6, lb+1); \
+    total += (2.0f*(float)((bits >> 15) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s7, lb+1); \
+    total += (2.0f*(float)((bits >> 16) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s0, lb+2); \
+    total += (2.0f*(float)((bits >> 17) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s1, lb+2); \
+    total += (2.0f*(float)((bits >> 18) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s2, lb+2); \
+    total += (2.0f*(float)((bits >> 19) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s3, lb+2); \
+    total += (2.0f*(float)((bits >> 20) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s4, lb+2); \
+    total += (2.0f*(float)((bits >> 21) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s5, lb+2); \
+    total += (2.0f*(float)((bits >> 22) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s6, lb+2); \
+    total += (2.0f*(float)((bits >> 23) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s7, lb+2); \
+    total += (2.0f*(float)((bits >> 24) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s0, lb+3); \
+    total += (2.0f*(float)((bits >> 25) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s1, lb+3); \
+    total += (2.0f*(float)((bits >> 26) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s2, lb+3); \
+    total += (2.0f*(float)((bits >> 27) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s3, lb+3); \
+    total += (2.0f*(float)((bits >> 28) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s4, lb+3); \
+    total += (2.0f*(float)((bits >> 29) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s5, lb+3); \
+    total += (2.0f*(float)((bits >> 30) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s6, lb+3); \
+    total += (2.0f*(float)((bits >> 31) & 1u) - 1.0f) * scale * sub_group_broadcast(regB.s7, lb+3);
+
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+__kernel void kernel_gemv_noshuffle_q1_0_f32(
+        read_only  image1d_buffer_t src0_q, // sign bits; the .x of each texel is used as one uint (32 weights)
+        global half  * src0_d, // scales, indexed by gid + kb * M
+        read_only  image1d_buffer_t src1, // activations, fetched as float4 texels
+        ulong offset1, // not referenced in this kernel body
+        global float * dst, // f32 output, one element per gid
+        ulong offsetd, // byte offset applied to dst
+        int ne00, // K: reduction length
+        int ne01, // M: element stride between successive K slices of src0_q / src0_d
+        int ne02, // not referenced in this kernel body
+        int ne10, // not referenced in this kernel body
+        int ne12, // not referenced in this kernel body
+        int ne0, // not referenced in this kernel body
+        int ne1, // not referenced in this kernel body
+        int r2, // not referenced in this kernel body
+        int r3) // not referenced in this kernel body
+{
+    uint groupId = get_local_id(1); // picks which of the N_SIMDGROUP interleaved K-block slices is handled here
+    uint gid     = get_global_id(0); // selects the weight row and the dst element
+    ushort slid  = get_sub_group_local_id();
+
+    uint K = ne00;
+    uint M = ne01;
+
+    uint LINE_STRIDE_A  = M; // texels between successive 32-bit words of the same row
+    uint BLOCK_STRIDE_A = 4 * M; // texels per 128-weight block (4 words per row)
+
+    uint4  regA;
+    half   regS;
+    float8 regB;
+
+    float totalSum = 0.0f;
+
+    #pragma unroll 1
+    for (uint kb = groupId; kb < (K / QK1_0); kb += N_SIMDGROUP) {
+        regS = src0_d[gid + kb * LINE_STRIDE_A]; // each fiber loads its row's scale
+
+        // first 16 fibers load 8 B values each -> 128 activations for this block
+        if (slid < 16) {
+            regB.s0123 = read_imagef(src1, (slid * 2 + kb * 32));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + kb * 32));
+        }
+
+        // load this row's 4 uint32 (128 sign bits)
+        regA.s0 = read_imageui(src0_q, (gid + kb * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+        regA.s1 = read_imageui(src0_q, (gid + kb * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_q, (gid + kb * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_q, (gid + kb * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+
+        float scale = (float)regS;
+        dequantizeBlockAccum_q1(totalSum, regA.s0, scale, regB, 0); // weights 0..31 use lanes 0..3
+        dequantizeBlockAccum_q1(totalSum, regA.s1, scale, regB, 4); // weights 32..63 use lanes 4..7
+        dequantizeBlockAccum_q1(totalSum, regA.s2, scale, regB, 8); // weights 64..95 use lanes 8..11
+        dequantizeBlockAccum_q1(totalSum, regA.s3, scale, regB, 12); // weights 96..127 use lanes 12..15
+    }
+
+    // reduction in local memory, assumes #wave = N_SIMDGROUP = 4
+    local float reduceLM[SIMDGROUP_WIDTH * 3]; // NOTE(review): SIMDGROUP_WIDTH is not defined in the visible source; presumably a build option - confirm
+    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+    barrier(CLK_LOCAL_MEM_FENCE); // partial sums of groupId 1..3 must be stored before groupId 0 reads them
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    if (groupId == 0) { // only groupId 0 holds the complete sum and writes it out
+        dst = (global float*)((global char*)dst + offsetd);
+        dst[gid] = totalSum;
+    }
+}
@@ -0,0 +1,156 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// LOAD_VEC_A is 8 because one q1_0 quant byte expands to 8 weights along K.
+#define LOAD_VEC_A 8
+#define LOAD_VEC_B 4
+
+#define BM 64
+#define BN 64
+#define BK 32
+#define TM 4
+#define TN 8
+
+kernel void kernel_mul_mm_q1_0_f32_l4_lm( // tiled q1_0 x f32 matmul: BM x BN output tile per work-group, staged through local memory
+    global uchar  * src0_q, // sign bits as bytes, 8 weights per byte
+    global half   * src0_d, // scales, indexed by (quant byte index) / 16
+    global float4 * src1, // f32 activations viewed as float4
+    ulong offset1, // byte offset applied to src1
+    global float  * dst, // f32 output
+    ulong offsetd, // byte offset applied to dst
+
+    int ne00, // K extent, walked in steps of BK
+    int ne01, // row bound checked for src0 loads and dst stores
+    int ne02, // multiplied by i03 to form the src0 batch index
+    int ne11, // column bound checked for src1 loads and dst stores
+    int ne12, // splits the batch index into i12 / i13
+
+    int stride_a, // src0 row stride in weights (divided by LOAD_VEC_A before indexing)
+    int stride_b, // src1 column stride in floats (divided by LOAD_VEC_B before indexing)
+    int stride_d, // dst column stride in floats
+
+    int batch_stride_a,
+    int batch_stride_b,
+    int batch_stride_d,
+
+    int r2, // divisor mapping i12 to the src0 batch index i02
+    int r3 // divisor mapping i13 to the src0 batch index i03
+) {
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float *)((global char*)dst  + offsetd);
+
+    local float buf_a[BM * BK]; // dequantized src0 tile, indexed [k * BM + row]
+    local float buf_b[BN * BK]; // src1 tile, indexed [k * BN + col]
+
+    const int batch_idx = get_global_id(2);
+
+    const int i13 = batch_idx / ne12;
+    const int i12 = batch_idx % ne12;
+
+    const int i03 = i13 / r3;
+    const int i02 = i12 / r2;
+
+    const int batch_idx_a = i03 * ne02 + i02;
+
+    const int ir = get_group_id(0); // tile index along rows (BM per tile)
+    const int ic = get_group_id(1); // tile index along columns (BN per tile)
+
+    const int tid = get_local_id(0);
+    const int th_r  = tid % (BM / TM); // position of this work-item's TM x TN sub-tile along rows
+    const int th_c  = tid / (BM / TM); // position of this work-item's TM x TN sub-tile along columns
+
+    const int loadr_a = get_local_id(0) % (BK / LOAD_VEC_A); // which quant byte along K this work-item loads
+    const int loadc_a = get_local_id(0) / (BK / LOAD_VEC_A); // which tile row this work-item loads first
+    const int loadr_b = get_local_id(0) % (BK / LOAD_VEC_B); // which float4 along K this work-item loads
+    const int loadc_b = get_local_id(0) / (BK / LOAD_VEC_B); // which tile column this work-item loads first
+
+    const int loadstride_a = get_local_size(0) * LOAD_VEC_A / BK; // tile rows covered per pass of the load loop
+    const int loadstride_b = get_local_size(0) * LOAD_VEC_B / BK; // tile columns covered per pass of the load loop
+
+    int pos_a = (batch_idx_a * batch_stride_a + ir * BM * stride_a) / LOAD_VEC_A; // in quant bytes
+    int pos_b = (batch_idx   * batch_stride_b + ic * BN * stride_b) / LOAD_VEC_B; // in float4 units
+
+    float sums[TM * TN]; // accumulators, indexed [col * TM + row]
+    float cache_a[TM];
+    float cache_b[TN];
+
+    for (int i = 0; i < TM * TN; i++) {
+        sums[i] = 0.0f;
+    }
+
+    for (int block = 0; block < ne00; block += BK) {
+        for (int l = 0; l < BM; l += loadstride_a) {
+            if (ir*BM + loadc_a + l < ne01) {
+                int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
+                int ib  = idx / 16; // 16 quant bytes per q1_0 block
+
+                float d = (float)src0_d[ib];
+                uint bits = src0_q[idx];
+
+                // use float to avoid unsigned underflow of (2*0 - 1).
+                buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 0) & 1) - 1.0f);
+                buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 1) & 1) - 1.0f);
+                buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 2) & 1) - 1.0f);
+                buf_a[(loadr_a * LOAD_VEC_A + 3) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 3) & 1) - 1.0f);
+                buf_a[(loadr_a * LOAD_VEC_A + 4) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 4) & 1) - 1.0f);
+                buf_a[(loadr_a * LOAD_VEC_A + 5) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 5) & 1) - 1.0f);
+                buf_a[(loadr_a * LOAD_VEC_A + 6) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 6) & 1) - 1.0f);
+                buf_a[(loadr_a * LOAD_VEC_A + 7) * BM + loadc_a + l] = d * (2.0f*(float)((bits >> 7) & 1) - 1.0f);
+            } else { // rows at or past ne01 contribute zeros
+                for (int b = 0; b < LOAD_VEC_A; ++b) {
+                    buf_a[(loadr_a * LOAD_VEC_A + b) * BM + loadc_a + l] = 0.0f;
+                }
+            }
+        }
+
+        for (int l = 0; l < BN; l += loadstride_b) {
+            if (ic*BN + loadc_b + l < ne11) {
+                int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = src1[idx].s2;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = src1[idx].s3;
+            } else { // columns at or past ne11 contribute zeros
+                buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 2) * BN + loadc_b + l] = 0.0f;
+                buf_b[(loadr_b * LOAD_VEC_B + 3) * BN + loadc_b + l] = 0.0f;
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE); // both tiles must be fully written before any work-item reads them
+
+        pos_a += BK / LOAD_VEC_A; // advance to the next BK slice along K
+        pos_b += BK / LOAD_VEC_B;
+
+        for (int i = 0; i < BK; i++) {
+            for (int j = 0; j < TM; j++) {
+                cache_a[j] = buf_a[(i) * BM + th_r * TM + j];
+            }
+
+            for (int j = 0; j < TN; j++) {
+                cache_b[j] = buf_b[(i) * BN + th_c * TN + j];
+            }
+
+            for (int cc = 0; cc < TN; cc++) {
+                for (int cr = 0; cr < TM; cr++) {
+                    const int sums_idx = cc*TM + cr;
+                    sums[sums_idx] = mad(cache_a[cr], cache_b[cc], sums[sums_idx]);
+                }
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE); // all reads of the tiles finish before the next iteration overwrites them
+    }
+
+    const int dr = ir * BM + th_r * TM; // first output row of this work-item
+    const int dc = ic * BN + th_c * TN; // first output column of this work-item
+
+    const int offsets = batch_idx * batch_stride_d;
+
+    for (int cc = 0; cc < TN; cc++) {
+        for (int cr = 0; cr < TM; cr++) {
+            if (dr + cr < ne01 && dc + cc < ne11) { // skip the parts of the tile that lie outside the output
+                dst[offsets + (dc + cc) * stride_d + dr + cr] = sums[cc * TM + cr];
+            }
+        }
+    }
+}
@@ -0,0 +1,141 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK1_0 128
+typedef struct { // one q1_0 block: QK1_0 weights sharing a single scale
+    half d; // per-block scale
+    uchar qs[QK1_0/8]; // packed 1-bit values, 8 weights per byte
+} block_q1_0;
+
+#define NB_Q1_0 16
+
+#ifdef INTEL_GPU
+#define N_R0_Q1_0 4 // number of rows each subgroup works on
+#define N_SG_Q1_0 2 // number of subgroups in a work group
+#define N_SIMDWIDTH 16 // subgroup size
+#elif defined (ADRENO_GPU)
+#define N_R0_Q1_0 4
+#define N_SG_Q1_0 2
+#define N_SIMDWIDTH 64
+#endif
+
+inline float block_q_1_0_dot_y(global block_q1_0 * qb, float sumy, float yl[NB_Q1_0], short il) { // dot of 16 weights of block qb with yl[]
+    global uchar * qs = qb->qs + il*2; // il selects the pair of quant bytes (16 weights) to use
+    uint b0 = qs[0];
+    uint b1 = qs[1];
+
+    float acc = 0.f; // sum of yl[i] over the positions whose bit is set
+    acc += yl[ 0]*(float)((b0 >> 0) & 1) + yl[ 1]*(float)((b0 >> 1) & 1);
+    acc += yl[ 2]*(float)((b0 >> 2) & 1) + yl[ 3]*(float)((b0 >> 3) & 1);
+    acc += yl[ 4]*(float)((b0 >> 4) & 1) + yl[ 5]*(float)((b0 >> 5) & 1);
+    acc += yl[ 6]*(float)((b0 >> 6) & 1) + yl[ 7]*(float)((b0 >> 7) & 1);
+
+    acc += yl[ 8]*(float)((b1 >> 0) & 1) + yl[ 9]*(float)((b1 >> 1) & 1);
+    acc += yl[10]*(float)((b1 >> 2) & 1) + yl[11]*(float)((b1 >> 3) & 1);
+    acc += yl[12]*(float)((b1 >> 4) & 1) + yl[13]*(float)((b1 >> 5) & 1);
+    acc += yl[14]*(float)((b1 >> 6) & 1) + yl[15]*(float)((b1 >> 7) & 1);
+
+    return qb->d * (2.0f*acc - sumy); // when sumy is the sum of yl[], 2*acc - sumy equals sum of yl[i]*(2*bit_i - 1)
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q1_0_f32( // q1_0 (AOS blocks) x f32 matrix-vector product, N_R0_Q1_0 rows per subgroup
+    global char * src0, // q1_0 weights, reinterpreted as block_q1_0
+    ulong         offset0, // byte offset applied to src0
+    global char * src1, // f32 activations
+    ulong         offset1, // byte offset applied to src1
+    global char * dst, // f32 output
+    ulong         offsetd, // byte offset applied to dst
+    int           ne00, // row length in weights; ne00/QK1_0 blocks per row
+    int           ne01, // row bound checked before each store
+    ulong         nb01, // src0 byte strides (row, then the two batch dims)
+    ulong         nb02,
+    ulong         nb03,
+    int           ne12, // splits the group id in dim 2 into i12 / i13
+    ulong         nb11, // src1 byte strides
+    ulong         nb12,
+    ulong         nb13,
+    int           ne0, // dst extents used to locate the output row
+    int           ne1,
+    int           r2, // divisors mapping i12 / i13 to src0 batch indices
+    int           r3
+) {
+    src0 = (global char*)((global char*)src0 + offset0);
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst  = (global char*)((global char*)dst  + offsetd);
+
+    int nb = ne00/QK1_0; // blocks per row
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0*N_SG_Q1_0 + get_sub_group_id()) * N_R0_Q1_0; // first of the N_R0_Q1_0 rows handled by this subgroup
+
+    uint i12 = im%ne12;
+    uint i13 = im/ne12;
+
+    ulong offset_src1 = r1*nb11 + i12*nb12 + i13*nb13;
+    global float * y  = (global float *) (src1 + offset_src1);
+
+    // pointers to src0 rows; NOTE(review): rows >= ne01 are not filtered here, only the final store is guarded
+    global block_q1_0 * ax[N_R0_Q1_0];
+    for (int row = 0; row < N_R0_Q1_0; ++row) {
+        ulong offset_src0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+        ax[row] = (global block_q1_0 *) ((global char *) src0 + offset_src0);
+    }
+
+    float yl[NB_Q1_0];
+    float sumf[N_R0_Q1_0] = { 0.f };
+
+    const short ix = get_sub_group_local_id()/8; // which block this lane starts on
+    const short il = get_sub_group_local_id()%8; // which 16-weight slice of the block this lane covers
+
+    global float * yb = y + ix*QK1_0 + il*NB_Q1_0;
+
+    // each thread handles NB_Q1_0 quants at a time
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/8) {
+        float sumy = 0.f;
+        for (short i = 0; i < NB_Q1_0; ++i) {
+            yl[i] = yb[i];
+            sumy += yb[i];
+        }
+
+        for (short row = 0; row < N_R0_Q1_0; row++) {
+            sumf[row] += block_q_1_0_dot_y(ax[row] + ib, sumy, yl, il);
+        }
+
+        yb += N_SIMDWIDTH*NB_Q1_0; // N_SIMDWIDTH/8 blocks of QK1_0 floats further on
+    }
+
+    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
+
+    for (int row = 0; row < N_R0_Q1_0; ++row) {
+        float tot = sub_group_reduce_add(sumf[row]);
+
+        if (get_sub_group_local_id() == 0 && first_row + row < ne01) { // one lane writes, and only for rows inside the matrix
+            dst_f32[first_row + row] = tot;
+        }
+    }
+}
@@ -0,0 +1,190 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK1_0 128
+#define QK1_0_BYTES (QK1_0/8)              // 16 quant bytes per block
+#define QK1_0_BLK_BYTES (QK1_0_BYTES + 2)  // d + qs in original tensor = 18
+
+#define NB_Q1_0 16 // quants handled per thread (two qs bytes)
+
+#ifdef INTEL_GPU
+#define N_R0_Q1_0 4 // number of rows each subgroup works on
+#define N_SG_Q1_0 2 // number of subgroups in a work group
+#define N_SIMDWIDTH 16 // subgroup size
+#elif defined (ADRENO_GPU)
+#define N_R0_Q1_0 4
+#define N_SG_Q1_0 2
+#define N_SIMDWIDTH 64
+#endif
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_q1_0_f32_flat( // q1_0 (flat/SOA: quant bytes + scales) x f32 matrix-vector product, 4 rows per subgroup
+    global char * src0_q, // quant bytes, QK1_0_BYTES per block
+    global half * src0_d, // scales, one half per block
+    global char * src1, // f32 activations
+    ulong         offset1, // byte offset applied to src1
+    global char * dst, // f32 output
+    ulong         offsetd, // byte offset applied to dst
+    int           ne00, // row length in weights; ne00/QK1_0 blocks per row
+    int           ne01, // row bound checked before each store
+    ulong         nb01, // src0 byte strides, expressed in the original QK1_0_BLK_BYTES-per-block layout
+    ulong         nb02,
+    ulong         nb03,
+    int           ne12, // splits the group id in dim 2 into i12 / i13
+    ulong         nb11, // src1 byte strides
+    ulong         nb12,
+    ulong         nb13,
+    int           ne0, // dst extents used to locate the output row
+    int           ne1,
+    int           r2, // divisors mapping i12 / i13 to src0 batch indices
+    int           r3
+) {
+    src1 = (global char*)((global char*)src1 + offset1);
+    dst  = (global char*)((global char*)dst  + offsetd);
+
+    int nb = ne00/QK1_0; // blocks per row
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = get_group_id(2);
+
+    int first_row = (r0*N_SG_Q1_0 + get_sub_group_id()) * N_R0_Q1_0; // first of the 4 rows handled by this subgroup
+
+    uint i12 = im%ne12;
+    uint i13 = im/ne12;
+
+    ulong offset_src1 = r1*nb11 + i12*nb12 + i13*nb13;
+    global float * y  = (global float *) (src1 + offset_src1);
+
+    // pointers to src0 rows (flat: q bytes + scales); offsets stay in ulong so the ulong strides are not truncated to 32 bits
+    ulong offset_src0_base = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
+
+    global uchar * ax0, * ax1, * ax2, * ax3;
+    global half  * ad0, * ad1, * ad2, * ad3;
+    ulong offset_src0; // block index of the row start: original byte offset / QK1_0_BLK_BYTES
+
+    offset_src0 = (offset_src0_base + 0*nb01) / QK1_0_BLK_BYTES;
+    ax0 = (global uchar *) ((global char *) src0_q + offset_src0*QK1_0_BYTES);
+    ad0 = (global half  *) ((global char *) src0_d + offset_src0*sizeof(half));
+
+    offset_src0 = (offset_src0_base + 1*nb01) / QK1_0_BLK_BYTES;
+    ax1 = (global uchar *) ((global char *) src0_q + offset_src0*QK1_0_BYTES);
+    ad1 = (global half  *) ((global char *) src0_d + offset_src0*sizeof(half));
+
+    offset_src0 = (offset_src0_base + 2*nb01) / QK1_0_BLK_BYTES;
+    ax2 = (global uchar *) ((global char *) src0_q + offset_src0*QK1_0_BYTES);
+    ad2 = (global half  *) ((global char *) src0_d + offset_src0*sizeof(half));
+
+    offset_src0 = (offset_src0_base + 3*nb01) / QK1_0_BLK_BYTES;
+    ax3 = (global uchar *) ((global char *) src0_q + offset_src0*QK1_0_BYTES);
+    ad3 = (global half  *) ((global char *) src0_d + offset_src0*sizeof(half));
+
+    const short ix = get_sub_group_local_id()/8; // which block this lane starts on
+    const short il = get_sub_group_local_id()%8; // which 16-weight slice of the block this lane covers
+
+    global float * yb = y + ix*QK1_0 + il*NB_Q1_0;
+
+    float8 yl_lo;
+    float8 yl_hi;
+    float4 sumf = 0.f; // one partial sum per row
+
+    // each thread handles NB_Q1_0 = 16 quants (two qs bytes) at a time
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/8) {
+        yl_lo = vload8(0, yb);
+        yl_hi = vload8(0, yb + 8);
+        float sumy = yl_lo.s0 + yl_lo.s1 + yl_lo.s2 + yl_lo.s3
+                   + yl_lo.s4 + yl_lo.s5 + yl_lo.s6 + yl_lo.s7
+                   + yl_hi.s0 + yl_hi.s1 + yl_hi.s2 + yl_hi.s3
+                   + yl_hi.s4 + yl_hi.s5 + yl_hi.s6 + yl_hi.s7;
+
+        uint b0, b1;
+        float acc; // sum of the activations whose bit is set; 2*acc - sumy is the +-1 weighted sum
+
+        b0 = ax0[ib*QK1_0_BYTES + il*2 + 0];
+        b1 = ax0[ib*QK1_0_BYTES + il*2 + 1];
+        acc  = yl_lo.s0*(float)((b0 >> 0) & 1) + yl_lo.s1*(float)((b0 >> 1) & 1)
+             + yl_lo.s2*(float)((b0 >> 2) & 1) + yl_lo.s3*(float)((b0 >> 3) & 1)
+             + yl_lo.s4*(float)((b0 >> 4) & 1) + yl_lo.s5*(float)((b0 >> 5) & 1)
+             + yl_lo.s6*(float)((b0 >> 6) & 1) + yl_lo.s7*(float)((b0 >> 7) & 1)
+             + yl_hi.s0*(float)((b1 >> 0) & 1) + yl_hi.s1*(float)((b1 >> 1) & 1)
+             + yl_hi.s2*(float)((b1 >> 2) & 1) + yl_hi.s3*(float)((b1 >> 3) & 1)
+             + yl_hi.s4*(float)((b1 >> 4) & 1) + yl_hi.s5*(float)((b1 >> 5) & 1)
+             + yl_hi.s6*(float)((b1 >> 6) & 1) + yl_hi.s7*(float)((b1 >> 7) & 1);
+        sumf.s0 += (float)ad0[ib] * (2.0f*acc - sumy);
+
+        b0 = ax1[ib*QK1_0_BYTES + il*2 + 0];
+        b1 = ax1[ib*QK1_0_BYTES + il*2 + 1];
+        acc  = yl_lo.s0*(float)((b0 >> 0) & 1) + yl_lo.s1*(float)((b0 >> 1) & 1)
+             + yl_lo.s2*(float)((b0 >> 2) & 1) + yl_lo.s3*(float)((b0 >> 3) & 1)
+             + yl_lo.s4*(float)((b0 >> 4) & 1) + yl_lo.s5*(float)((b0 >> 5) & 1)
+             + yl_lo.s6*(float)((b0 >> 6) & 1) + yl_lo.s7*(float)((b0 >> 7) & 1)
+             + yl_hi.s0*(float)((b1 >> 0) & 1) + yl_hi.s1*(float)((b1 >> 1) & 1)
+             + yl_hi.s2*(float)((b1 >> 2) & 1) + yl_hi.s3*(float)((b1 >> 3) & 1)
+             + yl_hi.s4*(float)((b1 >> 4) & 1) + yl_hi.s5*(float)((b1 >> 5) & 1)
+             + yl_hi.s6*(float)((b1 >> 6) & 1) + yl_hi.s7*(float)((b1 >> 7) & 1);
+        sumf.s1 += (float)ad1[ib] * (2.0f*acc - sumy);
+
+        b0 = ax2[ib*QK1_0_BYTES + il*2 + 0];
+        b1 = ax2[ib*QK1_0_BYTES + il*2 + 1];
+        acc  = yl_lo.s0*(float)((b0 >> 0) & 1) + yl_lo.s1*(float)((b0 >> 1) & 1)
+             + yl_lo.s2*(float)((b0 >> 2) & 1) + yl_lo.s3*(float)((b0 >> 3) & 1)
+             + yl_lo.s4*(float)((b0 >> 4) & 1) + yl_lo.s5*(float)((b0 >> 5) & 1)
+             + yl_lo.s6*(float)((b0 >> 6) & 1) + yl_lo.s7*(float)((b0 >> 7) & 1)
+             + yl_hi.s0*(float)((b1 >> 0) & 1) + yl_hi.s1*(float)((b1 >> 1) & 1)
+             + yl_hi.s2*(float)((b1 >> 2) & 1) + yl_hi.s3*(float)((b1 >> 3) & 1)
+             + yl_hi.s4*(float)((b1 >> 4) & 1) + yl_hi.s5*(float)((b1 >> 5) & 1)
+             + yl_hi.s6*(float)((b1 >> 6) & 1) + yl_hi.s7*(float)((b1 >> 7) & 1);
+        sumf.s2 += (float)ad2[ib] * (2.0f*acc - sumy);
+
+        b0 = ax3[ib*QK1_0_BYTES + il*2 + 0];
+        b1 = ax3[ib*QK1_0_BYTES + il*2 + 1];
+        acc  = yl_lo.s0*(float)((b0 >> 0) & 1) + yl_lo.s1*(float)((b0 >> 1) & 1)
+             + yl_lo.s2*(float)((b0 >> 2) & 1) + yl_lo.s3*(float)((b0 >> 3) & 1)
+             + yl_lo.s4*(float)((b0 >> 4) & 1) + yl_lo.s5*(float)((b0 >> 5) & 1)
+             + yl_lo.s6*(float)((b0 >> 6) & 1) + yl_lo.s7*(float)((b0 >> 7) & 1)
+             + yl_hi.s0*(float)((b1 >> 0) & 1) + yl_hi.s1*(float)((b1 >> 1) & 1)
+             + yl_hi.s2*(float)((b1 >> 2) & 1) + yl_hi.s3*(float)((b1 >> 3) & 1)
+             + yl_hi.s4*(float)((b1 >> 4) & 1) + yl_hi.s5*(float)((b1 >> 5) & 1)
+             + yl_hi.s6*(float)((b1 >> 6) & 1) + yl_hi.s7*(float)((b1 >> 7) & 1);
+        sumf.s3 += (float)ad3[ib] * (2.0f*acc - sumy);
+
+        yb += N_SIMDWIDTH*NB_Q1_0; // N_SIMDWIDTH/8 blocks of QK1_0 floats further on
+    }
+
+    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
+
+    float4 tot = (float4)(
+        sub_group_reduce_add(sumf.s0),
+        sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2),
+        sub_group_reduce_add(sumf.s3)
+    );
+
+    if (get_sub_group_local_id() == 0) { // one lane writes, and only for rows inside the matrix
+        if (first_row + 0 < ne01) dst_f32[first_row + 0] = tot.s0;
+        if (first_row + 1 < ne01) dst_f32[first_row + 1] = tot.s1;
+        if (first_row + 2 < ne01) dst_f32[first_row + 2] = tot.s2;
+        if (first_row + 3 < ne01) dst_f32[first_row + 3] = tot.s3;
+    }
+}
@@ -2475,6 +2475,85 @@ static bool ggml_vk_strip_decode_vector(const uint32_t * code, size_t word_count
    return true;
 }

+// Rewrites the loop control of the matmul shader's BK loop from Unroll to
+// DontUnroll, which performs better on hardware like Apple M1/M2.
+// Relies on three assumptions: 1. the code was built from mul_mm.comp,
+// 2. the K-tile loop has no loop control hint, and 3. the BK loop is the
+// last loop nested directly inside the K-tile loop.
+// When a BK loop is found, the patched copy of the module is stored in
+// `out` and true is returned; otherwise false is returned and `out` is
+// left untouched.
+static bool ggml_vk_roll_bk_loop(const uint32_t * code, size_t word_count, std::vector<uint32_t> & out) {
+    const size_t first_insn = 5; // instructions start right after the 5-word module header
+    if (word_count < first_insn) {
+        return false;
+    }
+
+    struct vk_spv_loop {
+        size_t   header;  // word offset of the OpLoopMerge
+        size_t   end;     // word offset of the merge block's OpLabel, 0 while unknown
+        uint32_t control; // loop control operand
+    };
+
+    std::vector<vk_spv_loop> loops;
+
+    // One pass over the instructions: record each OpLoopMerge, close it at its merge block's OpLabel.
+    size_t cursor = first_insn;
+    while (cursor < word_count) {
+        const uint32_t insn   = code[cursor];
+        const uint32_t length = insn >> spv::WordCountShift;
+        const uint32_t opcode = insn & spv::OpCodeMask;
+        if (length == 0 || cursor + length > word_count) {
+            return false; // malformed instruction stream
+        }
+
+        if (opcode == spv::OpLoopMerge && length >= 4) {
+            loops.push_back({ cursor, 0, code[cursor + 3] });
+        } else if (opcode == spv::OpLabel && length >= 2) {
+            const uint32_t label_id = code[cursor + 1];
+            for (auto & open : loops) {
+                if (open.end == 0 && code[open.header + 1] == label_id) {
+                    open.end = cursor;
+                }
+            }
+        }
+
+        cursor += length;
+    }
+
+    const auto contains = [](const vk_spv_loop & outer, const vk_spv_loop & inner) {
+        return outer.header < inner.header && inner.header < outer.end;
+    };
+
+    // BK loop: the last Unroll-hinted loop with a nested loop whose innermost enclosing loop has no hint.
+    const vk_spv_loop * bk = nullptr;
+    for (const auto & cand : loops) {
+        if (cand.control != spv::LoopControlUnrollMask) {
+            continue;
+        }
+        const vk_spv_loop * parent    = nullptr;
+        bool                has_child = false;
+        for (const auto & other : loops) {
+            if (contains(other, cand) && (!parent || other.header > parent->header)) {
+                parent = &other;
+            }
+            has_child = has_child || contains(cand, other);
+        }
+        const bool parent_unhinted = parent && parent->control == spv::LoopControlMaskNone;
+        const bool is_later        = !bk || cand.header > bk->header;
+        if (parent_unhinted && has_child && is_later) {
+            bk = &cand;
+        }
+    }
+    if (!bk) {
+        return false;
+    }
+
+    out.assign(code, code + word_count);
+    out[bk->header + 3] = spv::LoopControlDontUnrollMask;
+    return true;
+}
+
 static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, size_t spv_size, const void* spv_data, const std::string entrypoint,
                                         uint32_t parameter_count, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
                                         bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
@@ -2558,6 +2637,22 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
    }
 #endif

+#if VK_HEADER_VERSION >= 287
+    // Roll the mul_mm BK loop on Asahi Linux. Skip bf16 and the mul_mmq pipelines.
+    if (device->driver_id == vk::DriverId::eMesaHoneykrisp &&
+        pipeline->name.rfind("matmul", 0) == 0 &&
+        pipeline->name.find("bf16") == std::string::npos &&
+        pipeline->name.find("q8_1") == std::string::npos) {
+        const uint32_t * src   = spirv.empty() ? reinterpret_cast<const uint32_t *>(spv_data) : spirv.data();
+        size_t           src_n = spirv.empty() ? spv_size / sizeof(uint32_t) : spirv.size();
+        std::vector<uint32_t> rolled;
+        if (ggml_vk_roll_bk_loop(src, src_n, rolled)) {
+            spirv = std::move(rolled);
+            shader_module_create_info = vk::ShaderModuleCreateInfo({}, spirv.size() * sizeof(uint32_t), spirv.data());
+        }
+    }
+#endif
+
    pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);

    vk::PushConstantRange pcr(
@@ -121,6 +121,8 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
        ggml_tensor * inpSA = inpL;

        cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
@@ -7759,6 +7759,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    }

    test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 8, 2, 1, false));
+    test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_F32, 1, 70000, 4, 1, false)); // row count > CUDA grid-y limit (65535)
    for (ggml_type type : all_types) {
        for (bool v : {false, true}) {
            test_cases.emplace_back(new test_get_rows_back(type, 256, 5, 4, 1, v));
@@ -39,7 +39,7 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin
        throw std::runtime_error("unsupported URL scheme in target URL: " + parsed_url.scheme);
    }

-    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str());
+    SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), common_http_format_host(parsed_url.host).c_str(), parsed_url.port, parsed_url.path.c_str());

    std::map<std::string, std::string> headers;
    const std::string proxy_header_prefix = "x-llama-server-proxy-header-";
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "http.h"
 #include "server-http.h"
 #include "server-stream.h"
 #include "server-common.h"
@@ -441,7 +442,7 @@ bool server_http_context::start() {
    srv->wait_until_ready();

    listening_address = is_sock ? string_format("unix://%s", hostname.c_str())
-                                : string_format("%s://%s:%d", is_ssl ? "https" : "http", hostname.c_str(), port);
+                                : string_format("%s://%s:%d", is_ssl ? "https" : "http", common_http_format_host(hostname).c_str(), port);
    return true;
 }

@@ -1,4 +1,5 @@
 #include "server-common.h"
+#include "http.h"
 #include "server-models.h"
 #include "server-context.h"
 #include "server-stream.h"
@@ -2263,7 +2264,8 @@ server_http_proxy::server_http_proxy(
            }
            if (lowered == "host") {
                bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
-                req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
+                const std::string url_host = common_http_format_host(host);
+                req.set_header(key, is_default_port ? url_host : url_host + ":" + std::to_string(port));
            } else {
                req.set_header(key, value);
            }
@@ -43,7 +43,7 @@
 		assistantMessages: number;
 		messageTypes: string[];
 	} | null>(null);
-	let editedContent = $derived(message.content);
+	let editedContent = $state(message.content);

 	let rawEditContent = $derived.by(() => {
 		if (message.role !== MessageRole.ASSISTANT) return undefined;
@@ -288,9 +288,7 @@ export const API_CACHING_PATTERNS = {
 } as const;

 // SvelteKit PWA plugin options
-export const PWA_KIT_OPTIONS = {
-	NAVIGATE_FALLBACK: './'
-} as const;
+export const PWA_KIT_OPTIONS = {} as const;

 export const APPLE_META_TAGS = {
 	MOBILE_WEB_APP_CAPABLE: { name: 'apple-mobile-web-app-capable', content: 'yes' },
@@ -322,6 +320,14 @@ export const SVELTEKIT_PWA_OPTIONS: SvelteKitPWAOptions = {
 		globIgnores: GLOB_IGNORES,
 		maximumFileSizeToCacheInBytes: CACHE_SETTINGS.MAX_FILE_SIZE_BYTES,

+		// Prevent @vite-pwa/sveltekit from auto-adding a NavigationRoute by
+		// setting navigateFallback to empty string. This keeps the service
+		// worker from intercepting direct browser navigation to server API
+		// endpoints (e.g. /slots, /models, /v1/models) which should return
+		// JSON, not the SPA HTML shell. The server's own static-file fallback
+		// handles non-API navigation to index.html for the SPA router.
+		navigateFallback: '',
+
 		// Runtime caching for API calls - use NetworkFirst so APIs are always fresh
 		runtimeCaching: [
 			{
@@ -351,10 +357,7 @@ export const SVELTEKIT_PWA_OPTIONS: SvelteKitPWAOptions = {

 	devOptions: {
 		enabled: true,
-		suppressWarnings: true,
-		// Use PWA_KIT_OPTIONS.NAVIGATE_FALLBACK to match production SW behaviour
-		// (navigateFallback defaults to the configured base path, which is '/' for this SPA).
-		navigateFallback: PWA_KIT_OPTIONS.NAVIGATE_FALLBACK
+		suppressWarnings: true
 	},

 	// SvelteKit-specific options
@@ -1083,6 +1083,11 @@ class ChatStore {
 		let resolvedModel: string | null = null;
 		let modelPersisted = false;
 		const convId = assistantMessage.convId;
+		// Tracks the last message created in this flow. Used as the parent for the next
+		// turn's assistant message so createAssistantMessage does not have to read
+		// conversationsStore.activeMessages, which may belong to a different conversation
+		// after the user navigates while the loop is still running.
+		let lastCreatedInFlow = currentMessageId;
 		// freeze the POST identity from t0 so a stop cancels with the exact session key,
 		// never a stale or empty model resolved later
 		this.setChatStreaming(convId, streamedContent, currentMessageId, effectiveModel);
@@ -1208,8 +1213,15 @@ class ChatStore {
 				};
 				if (timings) uiUpdate.timings = timings;
 				if (resolvedModel) uiUpdate.model = resolvedModel;
-				conversationsStore.updateMessageAtIndex(idx, uiUpdate);
-				await conversationsStore.updateCurrentNode(currentMessageId);
+				// touch the active ui array and node pointer only when this conversation
+				// is displayed; otherwise persist the node move straight to the db so a
+				// foreign conv's currNode stays untouched
+				if (conversationsStore.activeConversation?.id === convId) {
+					conversationsStore.updateMessageAtIndex(idx, uiUpdate);
+					await conversationsStore.updateCurrentNode(currentMessageId);
+				} else {
+					await DatabaseService.updateCurrentNode(convId, currentMessageId);
+				}
 			},
 			createToolResultMessage: async (
 				toolCallId: string,
@@ -1230,8 +1242,16 @@ class ChatStore {
 					},
 					currentMessageId
 				);
-				conversationsStore.addMessageToActive(msg);
-				await conversationsStore.updateCurrentNode(msg.id);
+				// mirror into the active store and move the node pointer only when this
+				// conversation is displayed; otherwise persist the node move straight to
+				// the db for the owning conv so a foreign conv's currNode stays untouched
+				if (conversationsStore.activeConversation?.id === convId) {
+					conversationsStore.addMessageToActive(msg);
+					await conversationsStore.updateCurrentNode(msg.id);
+				} else {
+					await DatabaseService.updateCurrentNode(convId, msg.id);
+				}
+				lastCreatedInFlow = msg.id;
 				return msg;
 			},
 			createAssistantMessage: async () => {
@@ -1239,8 +1259,6 @@ class ChatStore {
 				streamedContent = '';
 				streamedReasoningContent = '';

-				const lastMsg =
-					conversationsStore.activeMessages[conversationsStore.activeMessages.length - 1];
 				const msg = await DatabaseService.createMessageBranch(
 					{
 						convId,
@@ -1252,10 +1270,13 @@ class ChatStore {
 						children: [],
 						model: resolvedModel
 					},
-					lastMsg.id
+					lastCreatedInFlow
 				);
-				conversationsStore.addMessageToActive(msg);
+				if (conversationsStore.activeConversation?.id === convId) {
+					conversationsStore.addMessageToActive(msg);
+				}
 				currentMessageId = msg.id;
+				lastCreatedInFlow = msg.id;
 				return msg;
 			},
 			onFlowComplete: (finalTimings?: ChatMessageTimings) => {
@@ -43,7 +43,10 @@ test.describe('PWA Service Worker', () => {
 		expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/);
 		expect(swContent).toMatch(/"manifest\.webmanifest"/);
 		expect(swContent).toMatch(/"_app\/version\.json"/);
-		expect(swContent).toMatch(/NavigationRoute/);
+		// NavigationRoute is intentionally absent — server API endpoints
+		// (e.g. /slots, /models) must not be intercepted by the PWA and
+		// should return JSON directly from the server.
+		expect(swContent).not.toMatch(/NavigationRoute/);
 		expect(swContent).toMatch(/api-cache/);
 	});

@@ -108,9 +108,11 @@ describe('PWA Build Output', () => {
 			expect(swContent).toMatch(/"manifest\.webmanifest"/);
 		});

-		it('has navigation route registered', () => {
+		it('no navigation route — API endpoints bypass PWA', () => {
 			expect(swContent).toBeTruthy();
-			expect(swContent).toMatch(/NavigationRoute/);
+			// NavigationRoute is intentionally absent so direct browser
+			// navigation to server API endpoints returns JSON, not HTML.
+			expect(swContent).not.toMatch(/NavigationRoute/);
 		});

 		it('has runtime caching for API routes', () => {
Author	SHA1	Message	Date
Johannes Gäßler	b820cc8e6f	CUDA: consistent use of __restrict__ + PDL for FA (#25185 )	2026-07-01 10:55:14 +02:00
ragz4125	6dbc1174b8	ggml-cpu: add AVX2 optimization for nvfp4 dot product and use UE4M3 LUT (#23961 )	2026-07-01 15:31:20 +08:00
Aleksander Grygier	9d88e7cedd	ui Prevent tool messages from incorrectly appending to other conversations (#25177 ) * fix: Prevent tool messages from incorrectly appending to other conversations * ui: prevent agentic loop from poisoning another conv's currNode * ui: make editedContent a $state so background recompute does not wipe in-progress edits --------- Co-authored-by: Pascal <admin@serveurperso.com>	2026-07-01 09:25:18 +02:00
Aleksander Grygier	7af4279f45	ui: Remove PWA navigate fallback to prevent caching API endpoint requests (#25174 )	2026-07-01 07:32:55 +02:00
lhez	fd1a05791d	opencl: initial q1_0 support (#25160 ) * opencl: general q1_0 support * opencl: add Adreno GEMM/GEMV for q1_0	2026-06-30 21:43:20 -07:00
fairydreaming	0eca4d490e	cuda : prevent integer truncation and overflow errors when using KQ mask strides in flash_attn_mask_to_KV_max kernel (#24945 ) Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>	2026-06-30 20:47:05 +02:00
Jürgen Schmied	4f31eedb0c	model : register t_layer_inp for qwen3next (#25141 ) * Fix input assignment in layer processing loop Fix DFLASH for qwen-coder-next * add line break Added tensor for attention normalization in Qwen3 model.	2026-06-30 17:57:14 +02:00
Pascal	799fcc04a5	common,server: handle bracketed IPv6 literals in URL authority (#25140 ) * common,server: handle bracketed IPv6 literals in URL authority Parse the [host]:port form (RFC 3986) and bracket IPv6 hosts when formatting a URL authority: listening log, proxy Host header, proxy log, client rebuild. The per-request remote_addr stays bare. * common: restore unsupported scheme throw in url parser Address @ngxson review: keep the explicit reject in port resolution so the block stays self-contained. Non-http(s) schemes still throw (also gated at the top of common_http_parse_url).	2026-06-30 16:16:44 +02:00
Matt Jallo	931eb37f8c	CUDA: fix get_rows_back for tables with more than 65535 rows (grid-y clamp + stride) (#25103 )	2026-06-30 14:16:24 +02:00
Johannes Gäßler	e495d1e748	CUDA: fix Gemma E4B MTP FlashAttention (#25148 ) * CUDA: fix Gemma E4B MTP FlashAttention * remove unused template declaration	2026-06-30 14:06:54 +02:00
Kevin Liu	f708a5b2ca	vulkan: roll bk loop in matmul for asahi linux (#24663 ) * vulkan: roll bk loop in matmul for asahi linux * vulkan: fix inline comment * vulkan: revert BK-loop unroll change * vulkan: edit spirv directly for asahi roll bk loop * vulkan: remove trailing whitespace at the end of comments	2026-06-30 12:27:38 +02:00
zduford	d9df11006f	HIP: use hipBLAS for dense prefill on gfx900, keep MMQ for MoE (#24588 ) * HIP: keep MMQ for gfx900 MoE and Q8_0, use hipBLAS for dense K-quants Assisted-by: GitHub Copilot CLI * HIP: tighten conditional block to be explicitly for gfx900 * HIP: Further simplified gfx900 conditional block * removed unnecessary comment	2026-06-30 11:51:38 +02:00