Compare commits

...

2 Commits

Author SHA1 Message Date
Masashi Yoshimura 6c5de1cc83 ggml-webgpu: add support for NVFP4 (#25143) 2026-06-30 17:20:04 +09:00
Oliver Simons 86b94708f2 Revert "sched : reintroduce less synchronizations during split compute (#20793)" (#25138) 2026-06-30 08:41:45 +08:00
8 changed files with 142 additions and 30 deletions
+3 -7
View File
@@ -1551,8 +1551,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
int split_backend_id = split->backend_id;
ggml_backend_t split_backend = sched->backends[split_backend_id];
ggml_backend_synchronize(split_backend);
// copy the input tensors to the split backend
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
@@ -1563,15 +1561,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else if (!split_backend->iface.cpy_tensor_async) {
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else if (!split_backend->iface.cpy_tensor_async) {
} else {
ggml_backend_synchronize(split_backend);
}
@@ -1676,8 +1674,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
}
ggml_backend_synchronize(split_backend);
if (!sched->callback_eval) {
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
if (ec != GGML_STATUS_SUCCESS) {
+4 -20
View File
@@ -3192,24 +3192,11 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
// Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
// Excluding this path for HIP and MUSA as a precaution.
// According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways.
// Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior.
// e.g. https://github.com/ROCm/rocm-systems/issues/5109
// It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA.
// This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
const bool copy_from_host = false;
#else
const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
#endif
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
return false;
}
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
return false;
}
@@ -3220,17 +3207,14 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif // NDEBUG
return false;
}
if (copy_from_host) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
} else if (backend_src != backend_dst) {
if (backend_src != backend_dst) {
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
@@ -1563,6 +1563,7 @@ class ggml_webgpu_shader_lib {
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
{
// Quantized types using u32 buffers for portability.
defines.push_back("SRC_TYPE=u32");
@@ -1593,6 +1594,8 @@ class ggml_webgpu_shader_lib {
} else if ((key.src_type >= GGML_TYPE_Q4_0 && key.src_type <= GGML_TYPE_Q8_1) ||
key.src_type == GGML_TYPE_IQ4_NL || key.src_type == GGML_TYPE_MXFP4) {
defines.push_back("BLOCK_SIZE=32u");
} else if (key.src_type == GGML_TYPE_NVFP4) {
defines.push_back("BLOCK_SIZE=64u");
} else if (key.src_type >= GGML_TYPE_Q2_K) {
defines.push_back("BLOCK_SIZE=256u");
} else {
@@ -1960,6 +1963,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
@@ -2103,6 +2107,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
@@ -2274,6 +2279,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
@@ -2394,6 +2400,7 @@ class ggml_webgpu_shader_lib {
defines.push_back(type_upper + "_TABLES");
break;
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
defines.push_back(type_upper + "_LUT");
break;
default:
+3
View File
@@ -4056,6 +4056,7 @@ static bool ggml_webgpu_supported_qtype(ggml_type type) {
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
return true;
default:
return false;
@@ -4156,6 +4157,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
supports_op = true;
break;
default:
@@ -4196,6 +4198,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_MXFP4:
case GGML_TYPE_NVFP4:
supports_op = true;
break;
default:
@@ -896,9 +896,23 @@ const kvalues_iq4nl = array<i32, 16>(
#endif
#ifdef MXFP4_LUT
#if defined(MXFP4_LUT) || defined(NVFP4_LUT)
const kvalues_mxfp4 = array<i32, 16>(
0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12
);
#endif
#endif // MXFP4_LUT || NVFP4_LUT
#ifdef NVFP4_LUT
fn ue4m3_to_fp32(u: u32) -> f32 {
if (u == 0u || u == 127u) {
return 0.0;
}
let exp = (u >> 3u) & 15u;
let man = u & 7u;
if (exp == 0u) {
return f32(man) * (1.0 / 512.0);
}
let bits = ((exp + 120u) << 23u) | (man << 20u);
return bitcast<f32>(bits);
}
#endif // NVFP4_LUT
@@ -672,6 +672,27 @@ fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
}
#endif
#ifdef NVFP4
fn copy_elements(src_base: u32, dst_base: u32, offset: u32) {
let block_byte_base = (src_base + offset) * 36;
let d_word = load_u32_at_src(block_byte_base);
for (var sub: u32 = 0u; sub < 4; sub++) {
let d = ue4m3_to_fp32(get_byte(d_word, sub)) * 0.5;
for (var j: u32 = 0u; j < 2; j++) {
let q_packed = load_u32_at_src(block_byte_base + 4 + sub * 8 + j * 4);
for (var k: u32 = 0; k < 4; k++) {
let q_byte = get_byte(q_packed, k);
let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d;
let dst_offset = dst_base + offset * 64 + sub * 16 + j * 4 + k;
dst[dst_offset] = q_lo;
dst[dst_offset + 8u] = q_hi;
}
}
}
}
#endif
@group(0) @binding(0)
var<storage, read_write> src: array<SRC_TYPE>;
@@ -241,7 +241,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
#endif // INIT_SRC0_SHMEM_Q8_1
#if defined(INIT_SRC0_SHMEM_MXFP4)
let block_byte_base = src0_idx * 17u;
let block_byte_base = src0_idx * 17u; // BLOCK_SIZE_BYTES = 17u;
let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u);
let e = ldexp(1.0, i32(eu8) - 128);
@@ -263,6 +263,47 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
}
#endif // legacy-quants
#if defined(INIT_SRC0_SHMEM_NVFP4)
const BLOCK_SIZE = 64u;
const BLOCK_SIZE_BYTES = 36u;
const SUB_BLOCK_SIZE = 16u; // elements sharing one UE4M3 scale
const NQ = 16u;
const BYTES_PER_THREAD = 8u;
const BYTES_PER_INNER_LOOP = 4u;
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
let tile_m = i / TILE_K;
let tile_k_start = i % TILE_K;
let global_m = offset_m + tile_m;
let global_k_start = k_outer + tile_k_start;
if (global_m >= params.m) {
break;
}
let block_k = global_k_start / BLOCK_SIZE;
let sub_block = (global_k_start % BLOCK_SIZE) / SUB_BLOCK_SIZE;
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
let d_byte_base = block_byte_base;
let qs_byte_base = block_byte_base + 4u;
let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(d_byte_base), sub_block)) * 0.5;
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j++) {
let q_packed = load_u32_at_src0_aligned(qs_byte_base + sub_block * 8u + j * 4u);
for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
let q_byte = get_byte(q_packed, k);
shmem[i + j * BYTES_PER_INNER_LOOP + k] = f16(f32(kvalues_mxfp4[q_byte & 0xF]) * d);
shmem[i + j * BYTES_PER_INNER_LOOP + k + 8u] = f16(f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * d);
}
}
}
}
#endif // INIT_SRC0_SHMEM_NVFP4
// k-quants
#if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K)
const BLOCK_SIZE = 256u;
@@ -1505,3 +1505,49 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src
return acc;
}
#endif
#ifdef MUL_ACC_NVFP4
#define BLOCK_SIZE 64
#define BLOCK_SIZE_BYTES 36
#define THREADS_PER_BLOCK 4
#define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK)
fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array<array<f32, OUTPUTS_PER_WG>, NUM_COLS> {
var acc: array<array<f32, OUTPUTS_PER_WG>, NUM_COLS>;
let num_blocks = params.k / BLOCK_SIZE;
let sub = thread_id % THREADS_PER_BLOCK;
for (var block = thread_id/THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE/THREADS_PER_BLOCK) {
let x_base = src1_idx_base + block * BLOCK_SIZE + sub * ELEMS_PER_THREAD;
var x_block: array<array<f32, ELEMS_PER_THREAD>, NUM_COLS>;
for (var col = 0u; col < NUM_COLS;col += 1) {
for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) {
x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]);
x_block[col][i + 8] = f32(src1[x_base + col * params.stride_11 + i + 8]);
}
}
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let output_row = row_base + row;
if (output_row < params.m) {
let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES;
let d = ue4m3_to_fp32(get_byte(load_u32_at_src0_aligned(block_byte_base), sub)) * 0.5;
let q_w0 = load_u32_at_src0_aligned(block_byte_base + 4u + 8u * sub);
let q_w1 = load_u32_at_src0_aligned(block_byte_base + 8u + 8u * sub);
for (var col = 0u;col < NUM_COLS;col += 1) {
var row_sum = 0.0;
for (var l = 0u; l < 8u; l++) {
let q_word = select(q_w0, q_w1, l >= 4u);
let q_byte = get_byte(q_word, l % 4u);
let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * d;
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4u) & 0xFu]) * d;
row_sum += q_lo * x_block[col][l];
row_sum += q_hi * x_block[col][l + 8u];
}
acc[col][row] += row_sum;
}
}
}
}
return acc;
}
#endif