Compare commits

...

1 Commits
b9535 ... b9536

Author SHA1 Message Date
lhez
308f61c31f opencl: improve get_rows, cpy, concat and q6_k flat gemv (#24160)
* opencl: allow multiple workgroups for large rows

* opencl: improve small cpy

* opencl: packed concat for small input

* opencl: tweak flat q6_K gemv, increase N_DST and remap threads
2026-06-05 13:45:25 -07:00
5 changed files with 244 additions and 83 deletions

View File

@@ -558,7 +558,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_i32_i32;
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32, kernel_cpy_f32_f32_pack, kernel_cpy_i32_i32;
cl_kernel kernel_mul_mat_f32_f32;
cl_kernel kernel_mul_mat_f16_f16;
cl_kernel kernel_mul_mat_f16_f32_1row;
@@ -639,7 +639,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_softplus_f16, kernel_softplus_f16_4, kernel_softplus_f16_nc;
cl_kernel kernel_upscale;
cl_kernel kernel_upscale_bilinear;
cl_kernel kernel_concat_f32;
cl_kernel kernel_concat_f32, kernel_concat_f32_pack;
cl_kernel kernel_conv_2d_f16;
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
@@ -1121,6 +1121,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(prog, "kernel_cpy_f16_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(prog, "kernel_cpy_f32_f16", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(prog, "kernel_cpy_f32_f32", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_f32_f32_pack = clCreateKernel(prog, "kernel_cpy_f32_f32_pack", &err), err));
CL_CHECK((backend_ctx->kernel_cpy_i32_i32 = clCreateKernel(prog, "kernel_cpy_i32_i32", &err), err));
GGML_LOG_CONT(".");
}
@@ -2615,6 +2616,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
CL_CHECK((backend_ctx->kernel_concat_f32_pack = clCreateKernel(prog, "kernel_concat_f32_pack", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
@@ -8552,7 +8554,14 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
nth *= 2;
}
size_t global_work_size[] = {(size_t)ne10*nth, (size_t)ne11, (size_t)ne12};
int nchunks = 1;
if (src0->type == GGML_TYPE_F32) {
const int chunk_target = nth * 4;
nchunks = (ne00 + chunk_target - 1) / chunk_target;
nchunks = MAX(1, MIN(nchunks, 64));
}
size_t global_work_size[] = {(size_t)ne10*nth*nchunks, (size_t)ne11, (size_t)ne12};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
@@ -11128,7 +11137,9 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
int nth = MIN(64, ne0);
cl_kernel kernel = backend_ctx->kernel_concat_f32;
const bool concat_pack = (dim == 0 && ne0 < 32);
cl_kernel kernel = concat_pack ? backend_ctx->kernel_concat_f32_pack
: backend_ctx->kernel_concat_f32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -11155,10 +11166,28 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};
if (concat_pack) {
// packed kernel needs the dst dims to unflatten its 1-D row index.
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne2));
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &ne3));
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
const int base = MIN(64, maxwg);
const int tpr = MIN(ne0, base); // threads per row
const int rpw = MAX(1, base / tpr); // rows per workgroup
const int lsz = tpr * rpw;
const int nrows = ne1*ne2*ne3;
const int nwg = (nrows + rpw - 1) / rpw;
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
size_t local_work_size[] = {(size_t)lsz, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
} else {
size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -14536,7 +14565,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
} else if (backend_ctx->gpu_family == ADRENO) {
nth0 = 64;
nth1 = 2;
ndst = 4;
ndst = 16;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
@@ -16633,7 +16662,8 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
kernel = backend_ctx->kernel_cpy_f32_f16;
break;
case GGML_TYPE_F32:
kernel = backend_ctx->kernel_cpy_f32_f32;
kernel = ne00 < 32 ? backend_ctx->kernel_cpy_f32_f32_pack
: backend_ctx->kernel_cpy_f32_f32;
break;
default:
GGML_ASSERT(false && "not implemented");
@@ -16685,12 +16715,27 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
const int nth = MIN(64, ne00);
if (kernel == backend_ctx->kernel_cpy_f32_f32_pack) {
const int maxwg = (int)backend_ctx->get_kernel_workgroup_size(kernel);
const int base = MIN(64, maxwg);
const int tpr = MIN(ne00, base); // threads per row
const int rpw = MAX(1, base / tpr); // rows per workgroup
const int lsz = tpr * rpw; // <= base <= maxwg
const int nrows = ne01*ne02*ne03;
const int nwg = (nrows + rpw - 1) / rpw;
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};
size_t global_work_size[] = {(size_t)nwg*lsz, 1, 1};
size_t local_work_size[] = {(size_t)lsz, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, src1);
} else {
const int nth = MIN(64, ne00);
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
}
}
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

View File

@@ -49,3 +49,70 @@ kernel void kernel_concat_f32(
*y = *x;
}
}
kernel void kernel_concat_f32_pack(
global const char * src0,
ulong offset0,
global const char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int dim,
int ne1,
int ne2,
int ne3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int lsz = get_local_size(0);
int tpr = min(ne0, lsz); // threads per row
int rpw = lsz / tpr; // rows per workgroup
int lid = get_local_id(0);
int row = get_group_id(0)*rpw + lid / tpr;
int lane = lid - (lid / tpr) * tpr;
int nrows = ne1*ne2*ne3;
if (row >= nrows) {
return;
}
int i1 = row % ne1;
int t = row / ne1;
int i2 = t % ne2;
int i3 = t / ne2;
int o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
for (int i0 = lane; i0 < ne0; i0 += tpr) {
global const float * x;
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
} else {
x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
}
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*y = *x;
}
}

View File

@@ -183,6 +183,65 @@ kernel void kernel_cpy_f32_f32(
}
}
kernel void kernel_cpy_f32_f32_pack(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
int lsz = get_local_size(0);
int tpr = min(ne00, lsz); // threads per row
int rpw = lsz / tpr; // rows per workgroup
int lid = get_local_id(0);
int row = get_group_id(0)*rpw + lid / tpr;
int lane = lid - (lid / tpr) * tpr;
int nrows = ne01*ne02*ne03;
if (row >= nrows) {
return;
}
int i01 = row % ne01;
int t = row / ne01;
int i02 = t % ne02;
int i03 = t / ne02;
// linear index of the first element of this row, unflattened over dst dims
long n = (long)row * ne00;
int i3 = (int)(n / ((long)ne2*ne1*ne0));
long rm = n - (long)i3*ne2*ne1*ne0;
int i2 = (int)(rm / ((long)ne1*ne0));
rm -= (long)i2*ne1*ne0;
int i1 = (int)(rm / ne0);
int i0 = (int)(rm - (long)i1*ne0);
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int i00 = lane; i00 < ne00; i00 += tpr) {
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0];
}
}
kernel void kernel_cpy_i32_i32(
global int * src0,
ulong offset0,

View File

@@ -82,21 +82,27 @@ kernel void kernel_get_rows_f32(
src1 = (global int*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int nchunks = get_num_groups(0) / ne10;
int g = get_group_id(0);
int i10 = g / nchunks;
int chunk = g - i10 * nchunks;
int i11 = get_group_id(1);
int i12 = get_group_id(2);
int r = ((global int *) ((global char *) src1 + i12*nb12 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
int i03 = i12;
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
if (ind >= ne00) {
return;
}
((global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1))[ind] =
((global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03))[ind];
global float * dst_row = (global float *) ((global char *) dst + i12*nb3 + i11*nb2 + i10*nb1);
global float * src_row = (global float *) ((global char *) src0 + r*nb01 + i02*nb02 + i03*nb03);
int span = (ne00 + nchunks - 1) / nchunks;
int start = chunk * span;
int end = min(start + span, ne00);
for (int ind = start + get_local_id(0); ind < end; ind += get_local_size(0)) {
dst_row[ind] = src_row[ind];
}
}

View File

@@ -33,13 +33,15 @@ inline float block_q_6_K_dot_y_flat(
global uchar * blk_qh,
global char * blk_scales,
global half * blk_d,
global float * yy,
int ib,
int ip,
int is,
int l0
int l0,
float4 y0,
float4 y1,
float4 y2,
float4 y3
) {
int y_offset = 128*ip + l0;
int q_offset_l = 64*ip + l0;
int q_offset_h = 32*ip + l0;
@@ -48,36 +50,28 @@ inline float block_q_6_K_dot_y_flat(
global uchar * qh = blk_qh + ib*64 + q_offset_h;
global char * sc = blk_scales + ib*16 + is;
global float * y = yy + ib * QK_K + y_offset;
float dall = blk_d[ib];
float sumf = 0;
float4 sums = {0.f, 0.f, 0.f, 0.f};
// Vectorized loads: 3 uchar4 weight loads instead of 12 scalar byte reads.
// q_offset_l/h are 4-aligned, so these are aligned vector loads.
uchar4 q1v = vload4(0, q1);
uchar4 q2v = vload4(0, q2);
uchar4 qhv = vload4(0, qh);
sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & Q6_K_MASK4) >> 2)) - 32.f);
int4 q1i = convert_int4(q1v);
int4 q2i = convert_int4(q2v);
int4 qhi = convert_int4(qhv);
sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & Q6_K_MASK4) >> 2)) - 32.f);
// Reconstruct the four 6-bit weight groups (low/high nibble of ql OR'd with the
// matching 2-bit plane of qh), same arithmetic as the scalar version, then dot()
// against the cached activation lanes.
float4 w0 = convert_float4((q1i & 0xF) | ((qhi & Q6_K_MASK1) << 4)) - 32.f;
float4 w1 = convert_float4((q2i & 0xF) | ((qhi & Q6_K_MASK2) << 2)) - 32.f;
float4 w2 = convert_float4((q1i >> 4) | ((qhi & Q6_K_MASK3) )) - 32.f;
float4 w3 = convert_float4((q2i >> 4) | ((qhi & Q6_K_MASK4) >> 2)) - 32.f;
sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & Q6_K_MASK4) >> 2)) - 32.f);
sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & Q6_K_MASK1) << 4)) - 32.f);
sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & Q6_K_MASK2) << 2)) - 32.f);
sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & Q6_K_MASK3) << 0)) - 32.f);
sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & Q6_K_MASK4) >> 2)) - 32.f);
sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
return sumf;
return dall * (dot(y0, w0) * sc[0] + dot(y1, w1) * sc[2] +
dot(y2, w2) * sc[4] + dot(y3, w3) * sc[6]);
}
#undef N_DST
@@ -89,7 +83,7 @@ inline float block_q_6_K_dot_y_flat(
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_DST 16
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 64
#endif
@@ -146,49 +140,39 @@ kernel void kernel_mul_mv_q6_K_f32_flat(
global half * blk_d = (global half *) src0_d + offset_src0_d;
global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
int tid = get_sub_group_local_id()%(N_SIMDWIDTH/BLOCK_STRIDE); // within-super-block part, 0..15
int ix = get_sub_group_local_id()/(N_SIMDWIDTH/BLOCK_STRIDE); // super-block selector, 0..BLOCK_STRIDE-1
int ip = tid/8; // first or second half of (super) block (0 or 1)
int il = tid%8; // each half has 8 parts, one per scale
int n = 4; // 4 scales at a time (and 4 sums)
int l0 = n*il; // offset into half-block, 0..28
int is = 8*ip + l0/16; // 0, 1, 8, 9
float4 sumf = 0;
float sumf[N_DST];
for (int row = 0; row < N_DST; row++) {
sumf[row] = 0.f;
}
for (int ib = ix; ib < nb; ib += BLOCK_STRIDE) {
if (first_row + 0 < ne01) {
sumf.s0 += block_q_6_K_dot_y_flat(blk_ql + 0*nb*128, blk_qh + 0*nb*64, blk_scales + 0*nb*16, blk_d + 0*nb, yy, ib, ip, is, l0);
}
if (first_row + 1 < ne01) {
sumf.s1 += block_q_6_K_dot_y_flat(blk_ql + 1*nb*128, blk_qh + 1*nb*64, blk_scales + 1*nb*16, blk_d + 1*nb, yy, ib, ip, is, l0);
}
if (first_row + 2 < ne01) {
sumf.s2 += block_q_6_K_dot_y_flat(blk_ql + 2*nb*128, blk_qh + 2*nb*64, blk_scales + 2*nb*16, blk_d + 2*nb, yy, ib, ip, is, l0);
}
if (first_row + 3 < ne01) {
sumf.s3 += block_q_6_K_dot_y_flat(blk_ql + 3*nb*128, blk_qh + 3*nb*64, blk_scales + 3*nb*16, blk_d + 3*nb, yy, ib, ip, is, l0);
global float * y = yy + ib * QK_K + 128*ip + l0;
float4 y0 = vload4(0, y + 0);
float4 y1 = vload4(0, y + 32);
float4 y2 = vload4(0, y + 64);
float4 y3 = vload4(0, y + 96);
for (int row = 0; row < N_DST; row++) {
if (first_row + row < ne01) {
sumf[row] += block_q_6_K_dot_y_flat(
blk_ql + row*nb*128, blk_qh + row*nb*64, blk_scales + row*nb*16, blk_d + row*nb,
ib, ip, is, l0, y0, y1, y2, y3);
}
}
}
float4 tot = (float4)(
sub_group_reduce_add(sumf.s0),
sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2),
sub_group_reduce_add(sumf.s3)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
for (int row = 0; row < N_DST; row++) {
float tot = sub_group_reduce_add(sumf[row]);
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
}
}
}