Compare commits

...

2 Commits

Author SHA1 Message Date
Charles Xu
eff06702b2 kleidiai : update to v1.24.0 and use release archive (#22549) 2026-05-04 22:13:31 +03:00
leonardHONG
e77056f9b2 CUDA: use fastdiv for batch index split in get_rows (#22650) 2026-05-04 16:24:05 +02:00
2 changed files with 24 additions and 14 deletions

View File

@@ -578,13 +578,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.22.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
set(KLEIDIAI_COMMIT_TAG "v1.24.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/releases/download/${KLEIDIAI_COMMIT_TAG}/kleidiai-${KLEIDIAI_COMMIT_TAG}-src.tar.gz")
set(KLEIDIAI_RELEASE_ARCHIVE_MD5 "2f02ebe29573d45813e671eb304f2a00")
set(KLEIDIAI_FETCH_ARGS
URL ${KLEIDIAI_DOWNLOAD_URL}
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
URL_HASH MD5=${KLEIDIAI_RELEASE_ARCHIVE_MD5}
)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)

View File

@@ -6,17 +6,18 @@ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
@@ -42,17 +43,18 @@ template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
const int i11 = dm.x;
const int i12 = dm.y;
if (i00 >= ne00) {
return;
@@ -115,10 +117,14 @@ static void get_rows_cuda_q(
GGML_ASSERT(ne00 % 2 == 0);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
@@ -146,10 +152,14 @@ static void get_rows_cuda_float(
const size_t s12 = nb12 / sizeof(int32_t);
// const size_t s13 = nb13 / sizeof(int32_t);
GGML_ASSERT(ne12 > 0);
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
const uint3 ne12_fdv = init_fastdiv_values(ne12);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);