mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-09 04:22:58 +02:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1e1aca09da | ||
|
|
7d2b45b4f7 | ||
|
|
42a0afd594 | ||
|
|
a66d50588b | ||
|
|
1705d434f6 | ||
|
|
3b3da01dc2 |
@@ -789,6 +789,16 @@ class Gemma4UnifiedModel(Gemma4Model):
|
||||
class Gemma4AssistantModel(Gemma4Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
|
||||
if "masked_embedding" in name:
|
||||
logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
|
||||
return None
|
||||
|
||||
return super().filter_tensors(item)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
|
||||
|
||||
@@ -448,15 +448,19 @@ struct ggml_webgpu_upscale_pipeline_key_hash {
|
||||
/** Concat **/
|
||||
|
||||
struct ggml_webgpu_concat_pipeline_key {
|
||||
int type;
|
||||
int type;
|
||||
bool src_overlap;
|
||||
|
||||
bool operator==(const ggml_webgpu_concat_pipeline_key & other) const { return type == other.type; }
|
||||
bool operator==(const ggml_webgpu_concat_pipeline_key & other) const {
|
||||
return type == other.type && src_overlap == other.src_overlap;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_concat_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_concat_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.type);
|
||||
ggml_webgpu_hash_combine(seed, key.src_overlap);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
@@ -2634,6 +2638,7 @@ class ggml_webgpu_shader_lib {
|
||||
webgpu_pipeline get_concat_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_concat_pipeline_key key = {};
|
||||
key.type = context.dst->type;
|
||||
key.src_overlap = ggml_webgpu_tensor_overlap(context.src0, context.src1);
|
||||
|
||||
auto it = concat_pipelines.find(key);
|
||||
if (it != concat_pipelines.end()) {
|
||||
@@ -2656,11 +2661,17 @@ class ggml_webgpu_shader_lib {
|
||||
GGML_ABORT("Unsupported type for concat shader");
|
||||
}
|
||||
|
||||
if (key.src_overlap) {
|
||||
defines.push_back("SRC_OVERLAP");
|
||||
variant += "_src_overlap";
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_concat, defines);
|
||||
auto decisions = std::make_shared<ggml_webgpu_generic_shader_decisions>();
|
||||
auto decisions = std::make_shared<ggml_webgpu_binary_shader_decisions>();
|
||||
decisions->wg_size = context.max_wg_size;
|
||||
decisions->src_overlap = key.src_overlap;
|
||||
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
pipeline.context = decisions;
|
||||
concat_pipelines[key] = pipeline;
|
||||
|
||||
@@ -621,10 +621,11 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_global_context & ctx,
|
||||
uint32_t value,
|
||||
size_t offset,
|
||||
size_t size) {
|
||||
std::vector<uint32_t> params = { (uint32_t) offset, (uint32_t) size, value };
|
||||
std::vector<wgpu::BindGroupEntry> entries = { ggml_webgpu_make_bind_group_entry(0, buf, 0, buf.GetSize()) };
|
||||
size_t bytes_per_wg = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.memset_bytes_per_thread;
|
||||
uint32_t wg_x = CEIL_DIV(size + 3, bytes_per_wg);
|
||||
std::vector<uint32_t> params = { (uint32_t) offset, (uint32_t) size, value };
|
||||
std::vector<wgpu::BindGroupEntry> entries = { ggml_webgpu_make_bind_group_entry(0, buf, 0, buf.GetSize()) };
|
||||
size_t bytes_per_wg =
|
||||
ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.memset_bytes_per_thread;
|
||||
uint32_t wg_x = CEIL_DIV(size + 3, bytes_per_wg);
|
||||
|
||||
ctx->queue.WriteBuffer(ctx->memset_params_buf, 0, params.data(), params.size() * sizeof(uint32_t));
|
||||
|
||||
@@ -1362,7 +1363,7 @@ static webgpu_encoded_op ggml_webgpu_get_rows(webgpu_context & ctx,
|
||||
shader_lib_ctx.src0 = src;
|
||||
shader_lib_ctx.src1 = nullptr;
|
||||
shader_lib_ctx.dst = dst;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_get_rows_pipeline(shader_lib_ctx);
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
@@ -2169,8 +2170,10 @@ static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));
|
||||
}
|
||||
|
||||
uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
uint32_t wg_x, wg_y;
|
||||
uint32_t total_wg = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx,
|
||||
@@ -2244,8 +2247,10 @@ static webgpu_encoded_op ggml_webgpu_binary_op(webgpu_context & ctx,
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
uint32_t wg_x, wg_y;
|
||||
uint32_t total_wg = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_add_id(webgpu_context & ctx,
|
||||
@@ -2305,33 +2310,6 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx,
|
||||
uint32_t ne = (uint32_t) ggml_nelements(dst);
|
||||
uint32_t dim = (uint32_t) dst->op_params[0];
|
||||
|
||||
std::vector<uint32_t> params = {
|
||||
ne,
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
(uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
|
||||
(uint32_t) dst->ne[0],
|
||||
(uint32_t) dst->ne[1],
|
||||
(uint32_t) dst->ne[2],
|
||||
(uint32_t) dst->ne[3],
|
||||
dim,
|
||||
(uint32_t) src0->ne[dim]
|
||||
};
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1),
|
||||
ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst),
|
||||
};
|
||||
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
shader_lib_ctx.src0 = src0;
|
||||
shader_lib_ctx.src1 = src1;
|
||||
@@ -2339,8 +2317,52 @@ static webgpu_encoded_op ggml_webgpu_concat(webgpu_context & ctx,
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_concat_pipeline(shader_lib_ctx);
|
||||
auto * decisions = static_cast<ggml_webgpu_generic_shader_decisions *>(pipeline.context.get());
|
||||
uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
|
||||
auto * decisions = static_cast<ggml_webgpu_binary_shader_decisions *>(pipeline.context.get());
|
||||
|
||||
uint32_t offset_src0 = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type));
|
||||
uint32_t offset_src1 = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type));
|
||||
size_t merged_offset = 0;
|
||||
size_t merged_size = 0;
|
||||
if (decisions->src_overlap) {
|
||||
const ggml_webgpu_merged_binding_range merged_range =
|
||||
ggml_webgpu_tensor_merged_binding_range(ctx, { src0, src1 });
|
||||
merged_offset = merged_range.offset;
|
||||
merged_size = merged_range.size;
|
||||
offset_src0 = ggml_webgpu_tensor_merged_element_offset(src0, merged_range);
|
||||
offset_src1 = ggml_webgpu_tensor_merged_element_offset(src1, merged_range);
|
||||
}
|
||||
|
||||
std::vector<uint32_t> params = { ne,
|
||||
offset_src0,
|
||||
offset_src1,
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
||||
(uint32_t) (src0->nb[0] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src0->nb[3] / ggml_type_size(src0->type)),
|
||||
(uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
|
||||
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
|
||||
(uint32_t) dst->ne[0],
|
||||
(uint32_t) dst->ne[1],
|
||||
(uint32_t) dst->ne[2],
|
||||
(uint32_t) dst->ne[3],
|
||||
dim,
|
||||
(uint32_t) src0->ne[dim] };
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries = {};
|
||||
if (decisions->src_overlap) {
|
||||
entries.push_back(
|
||||
ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(src0), merged_offset, merged_size));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));
|
||||
} else {
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src0));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, src1));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst));
|
||||
}
|
||||
|
||||
uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
}
|
||||
|
||||
@@ -2673,8 +2695,10 @@ static webgpu_encoded_op ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * s
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst));
|
||||
}
|
||||
|
||||
uint32_t wg_x = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
||||
uint32_t wg_x, wg_y;
|
||||
uint32_t total_wg = CEIL_DIV(ggml_nelements(dst), decisions->wg_size);
|
||||
compute_2d_workgroups(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, wg_x, wg_y);
|
||||
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
||||
}
|
||||
|
||||
static webgpu_encoded_op ggml_webgpu_soft_max(webgpu_context & ctx,
|
||||
@@ -3751,7 +3775,8 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) {
|
||||
|
||||
static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
|
||||
// we use the maximum workgroup size for the memset pipeline
|
||||
size_t max_threads = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup * ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
size_t max_threads = ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup *
|
||||
ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
|
||||
// Size the bytes_per_thread so that the largest buffer size can be handled
|
||||
ctx->capabilities.memset_bytes_per_thread =
|
||||
CEIL_DIV(ctx->capabilities.limits.maxStorageBufferBindingSize, max_threads);
|
||||
|
||||
@@ -130,10 +130,13 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32) {
|
||||
}
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x < params.ne) {
|
||||
let src0_i = params.offset_src0 + src0_index(gid.x);
|
||||
let src1_i = params.offset_src1 + src1_index(gid.x);
|
||||
update(params.offset_dst + gid.x, src0_i, src1_i);
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>,
|
||||
@builtin(num_workgroups) num_wg: vec3<u32>) {
|
||||
let threads_per_group = u32(WG_SIZE);
|
||||
let i = gid.x + (num_wg.x * threads_per_group) * gid.y;
|
||||
if (i < params.ne) {
|
||||
let src0_i = params.offset_src0 + src0_index(i);
|
||||
let src1_i = params.offset_src1 + src1_index(i);
|
||||
update(params.offset_dst + i, src0_i, src1_i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,6 +31,16 @@ struct Params {
|
||||
#define DataType i32
|
||||
#endif
|
||||
|
||||
#ifdef SRC_OVERLAP
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> merged_src: array<DataType>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> dst: array<DataType>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
#else
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src0: array<DataType>;
|
||||
|
||||
@@ -42,7 +52,7 @@ var<storage, read_write> dst: array<DataType>;
|
||||
|
||||
@group(0) @binding(3)
|
||||
var<uniform> params: Params;
|
||||
|
||||
#endif
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
|
||||
@@ -62,14 +72,22 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
ni[1] * params.stride_src0_1 +
|
||||
ni[2] * params.stride_src0_2 +
|
||||
ni[3] * params.stride_src0_3;
|
||||
#ifdef SRC_OVERLAP
|
||||
dst[params.offset_dst + gid.x] = merged_src[params.offset_src0 + src_i];
|
||||
#else
|
||||
dst[params.offset_dst + gid.x] = src0[params.offset_src0 + src_i];
|
||||
#endif
|
||||
} else {
|
||||
ni[params.dim] -= params.src0_nedim;
|
||||
let src_i = ni[0] * params.stride_src1_0 +
|
||||
ni[1] * params.stride_src1_1 +
|
||||
ni[2] * params.stride_src1_2 +
|
||||
ni[3] * params.stride_src1_3;
|
||||
#ifdef SRC_OVERLAP
|
||||
dst[params.offset_dst + gid.x] = merged_src[params.offset_src1 + src_i];
|
||||
#else
|
||||
dst[params.offset_dst + gid.x] = src1[params.offset_src1 + src_i];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,72 +98,50 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q1_0
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q4_0
|
||||
#if defined(INIT_SRC0_SHMEM_Q4_0) || defined(INIT_SRC0_SHMEM_Q4_1) || defined(INIT_SRC0_SHMEM_Q5_0) || defined(INIT_SRC0_SHMEM_Q5_1) || defined(INIT_SRC0_SHMEM_Q8_0) || defined(INIT_SRC0_SHMEM_Q8_1) || defined(INIT_SRC0_SHMEM_MXFP4)
|
||||
const BLOCK_SIZE = 32u;
|
||||
const BLOCK_SIZE_BYTES = 18u;
|
||||
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
|
||||
override BLOCKS_K = TILE_K/BLOCK_SIZE;
|
||||
const NQ = 16u;
|
||||
#if defined(INIT_SRC0_SHMEM_Q8_0) || defined(INIT_SRC0_SHMEM_Q8_1)
|
||||
const BYTES_PER_THREAD = 16u; // NQ(16) weights use 16 bytes of q
|
||||
#else
|
||||
const BYTES_PER_THREAD = 8u; // NQ(16) weights use 8 bytes of q
|
||||
#endif
|
||||
const BYTES_PER_INNER_LOOP = 4u; // == sizeof(q_packed)
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let blck_idx = i / BLOCK_SIZE;
|
||||
let block_idx = i / BLOCK_SIZE;
|
||||
let block_offset = (i % BLOCK_SIZE) / NQ;
|
||||
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
let shmem_idx = block_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
|
||||
let tile_m = blck_idx / BLOCKS_K;
|
||||
let tile_m = block_idx / BLOCKS_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let block_k = block_idx % BLOCKS_K;
|
||||
let global_block_k = k_outer / BLOCK_SIZE + block_k;
|
||||
|
||||
if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q4_0
|
||||
let block_byte_base = src0_idx * 18u; // BLOCK_SIZE_BYTES = 18u;
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
|
||||
// store NQ(16) weights
|
||||
// load NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
|
||||
let q_byte_offset = block_byte_base + 2u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
dequant_q4_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q4_0
|
||||
#elif INIT_SRC0_SHMEM_Q4_1
|
||||
let block_byte_base = src0_idx * 20u; // BLOCK_SIZE_BYTES = 20u;
|
||||
let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
|
||||
let d = f16(dm[0]);
|
||||
let m = f16(dm[1]);
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q4_1
|
||||
const BLOCK_SIZE = 32u;
|
||||
const BLOCK_SIZE_BYTES = 20u;
|
||||
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
|
||||
override BLOCKS_K = TILE_K/BLOCK_SIZE;
|
||||
const NQ = 16u;
|
||||
const BYTES_PER_THREAD = 8u; // NQ(16) weights use 8 bytes of q
|
||||
const BYTES_PER_INNER_LOOP = 4u; // == sizeof(q_packed)
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let blck_idx = i / BLOCK_SIZE;
|
||||
let block_offset = (i % BLOCK_SIZE) / NQ;
|
||||
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
|
||||
let tile_m = blck_idx / BLOCKS_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let global_block_k = k_outer / BLOCK_SIZE + block_k;
|
||||
|
||||
if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
let m = load_f16_at_src0(block_byte_base + 2u);
|
||||
|
||||
// store NQ(16) weights
|
||||
// load NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
|
||||
let q_byte_offset = block_byte_base + 4u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
|
||||
@@ -175,41 +153,13 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q4_1
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q5_0
|
||||
const BLOCK_SIZE = 32u;
|
||||
const BLOCK_SIZE_BYTES = 22u;
|
||||
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
|
||||
// tile_k is defined as 32u, so blocks_k ends up being 1 always
|
||||
override BLOCKS_K = TILE_K / BLOCK_SIZE;
|
||||
const NQ = 16u;
|
||||
const BYTES_PER_THREAD = 8u; // NQ(16) weights use 8 bytes of q
|
||||
const BYTES_PER_INNER_LOOP = 4u; // == sizeof(q_packed)
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let blck_idx = i / BLOCK_SIZE;
|
||||
let block_offset = (i % BLOCK_SIZE) / NQ;
|
||||
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
|
||||
let tile_m = blck_idx / BLOCKS_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let global_block_k = k_outer / BLOCK_SIZE + block_k;
|
||||
|
||||
if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
#elif INIT_SRC0_SHMEM_Q5_0
|
||||
let block_byte_base = src0_idx * 22u; // BLOCK_SIZE_BYTES = 22u;
|
||||
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
let qh_packed = load_u32_at_src0(block_byte_base + 2u);
|
||||
|
||||
// store NQ(16) weights
|
||||
// load NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
let q_byte_offset = block_byte_base + 6u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
@@ -226,44 +176,18 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q5_0
|
||||
#elif INIT_SRC0_SHMEM_Q5_1
|
||||
let block_byte_base = src0_idx * 24u; // BLOCK_SIZE_BYTES = 24u;
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q5_1
|
||||
const BLOCK_SIZE = 32u;
|
||||
const BLOCK_SIZE_BYTES = 24u;
|
||||
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
|
||||
override BLOCKS_K = TILE_K / BLOCK_SIZE;
|
||||
const NQ = 16u;
|
||||
const BYTES_PER_THREAD = 8u; // NQ(16) weights use 8 bytes of q
|
||||
const BYTES_PER_INNER_LOOP = 4u; // == sizeof(q_packed)
|
||||
let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
|
||||
let d = f16(dm[0]);
|
||||
let m = f16(dm[1]);
|
||||
let qh_packed = load_u32_at_src0_aligned(block_byte_base + 4u);
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let blck_idx = i / BLOCK_SIZE;
|
||||
let block_offset = (i % BLOCK_SIZE) / NQ;
|
||||
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
|
||||
let tile_m = blck_idx / BLOCKS_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let global_block_k = k_outer / BLOCK_SIZE + block_k;
|
||||
|
||||
if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
let m = load_f16_at_src0(block_byte_base + 2u);
|
||||
let qh_packed = load_u32_at_src0(block_byte_base + 4u);
|
||||
|
||||
// store NQ(16) weights
|
||||
// load NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
let q_byte_offset = block_byte_base + 8u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
let q_packed = load_u32_at_src0_aligned(q_byte_offset);
|
||||
|
||||
for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
@@ -277,461 +201,306 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q5_1
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q8_0
|
||||
const BLOCK_SIZE = 32u;
|
||||
const BLOCK_SIZE_BYTES = 34u;
|
||||
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
|
||||
override BLOCKS_K = TILE_K/BLOCK_SIZE;
|
||||
const NQ = 16u;
|
||||
const BYTES_PER_THREAD = 16u; // NQ(16) weights use 16 bytes of q
|
||||
const BYTES_PER_INNER_LOOP = 4u; // == sizeof(q_packed)
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let blck_idx = i / BLOCK_SIZE;
|
||||
let block_offset = (i % BLOCK_SIZE) / NQ;
|
||||
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
|
||||
let tile_m = blck_idx / BLOCKS_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let global_block_k = k_outer / BLOCK_SIZE + block_k;
|
||||
|
||||
if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
#elif INIT_SRC0_SHMEM_Q8_0
|
||||
let block_byte_base = src0_idx * 34u; // BLOCK_SIZE_BYTES = 34u;
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
|
||||
// store NQ(16) weights
|
||||
// load NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
let q_byte_offset = block_byte_base + 2u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
dequant_q8_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q8_0
|
||||
#elif INIT_SRC0_SHMEM_Q8_1
|
||||
let block_byte_base = src0_idx * 36u; // BLOCK_SIZE_BYTES = 36u;
|
||||
let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base));
|
||||
let d = f16(dm[0]);
|
||||
let m = f16(dm[1]);
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q8_1
|
||||
const BLOCK_SIZE = 32u;
|
||||
const BLOCK_SIZE_BYTES = 36u;
|
||||
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
|
||||
override BLOCKS_K = TILE_K/BLOCK_SIZE;
|
||||
const NQ = 16u;
|
||||
const BYTES_PER_THREAD = 16u; // NQ(16) weights use 16 bytes of q
|
||||
const BYTES_PER_INNER_LOOP = 4u; // == sizeof(q_packed)
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let blck_idx = i / BLOCK_SIZE;
|
||||
let block_offset = (i % BLOCK_SIZE) / NQ;
|
||||
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
|
||||
let tile_m = blck_idx / BLOCKS_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let global_block_k = k_outer / BLOCK_SIZE + block_k;
|
||||
|
||||
if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
let m = load_f16_at_src0(block_byte_base + 2u);
|
||||
|
||||
// store NQ(16) weights
|
||||
// load NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
let q_byte_offset = block_byte_base + 4u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
|
||||
let q_byte = get_byte_i32(q_packed, k);
|
||||
|
||||
let q_val = f16(q_byte) * d + m;
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k] = q_val;
|
||||
}
|
||||
}
|
||||
#elif INIT_SRC0_SHMEM_MXFP4
|
||||
let block_byte_base = src0_idx * 17u;
|
||||
let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u);
|
||||
let e = ldexp(1.0, i32(eu8) - 128);
|
||||
|
||||
// load NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
let q_byte_offset = block_byte_base + 1u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * e;
|
||||
let q_lo = f32(kvalues_mxfp4[q_byte & 0xF]) * e;
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k] = f16(q_lo);
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = f16(q_hi);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q8_1
|
||||
#endif
|
||||
|
||||
// k-quants
|
||||
#if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K)
|
||||
const BLOCK_SIZE = 256u;
|
||||
const NQ = 4u;
|
||||
|
||||
fn store_shmem_kquants(val: vec4<f16>, idx: u32) {
|
||||
shmem[idx] = val.x;
|
||||
shmem[idx + 1] = val.y;
|
||||
shmem[idx + 2] = val.z;
|
||||
shmem[idx + 3] = val.w;
|
||||
}
|
||||
|
||||
fn load_byte_at_src0_aligned(byte_offset: u32) -> u32 {
|
||||
return get_byte(load_u32_at_src0_aligned(byte_offset), byte_offset % 4u);
|
||||
}
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var elem_idx = thread_id * NQ; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let tile_m = elem_idx / TILE_K;
|
||||
let tile_k = elem_idx % TILE_K;
|
||||
|
||||
let global_m = offset_m + tile_m;
|
||||
let global_k = k_outer + tile_k;
|
||||
|
||||
if (global_m >= params.m || global_k >= params.k) {
|
||||
store_shmem_kquants(vec4<f16>(f16(0.0), f16(0.0), f16(0.0), f16(0.0)), elem_idx);
|
||||
continue;
|
||||
}
|
||||
|
||||
let block_k = global_k / BLOCK_SIZE;
|
||||
let k_in_block = global_k % BLOCK_SIZE; // k_in_block % 4 == 0;
|
||||
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q2_K
|
||||
const BLOCK_SIZE = 256u;
|
||||
const BLOCK_SIZE_BYTES = 84u;
|
||||
let block_byte_base = src0_idx * 84u; // BLOCK_SIZE_BYTES = 84u;
|
||||
let scales_byte_base = block_byte_base;
|
||||
let qs_byte_base = block_byte_base + 16u;
|
||||
let dm_byte_base = block_byte_base + 80u;
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
// Use standard thread layout instead of lane/row_group
|
||||
for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
|
||||
let tile_m = elem_idx / TILE_K;
|
||||
let tile_k = elem_idx % TILE_K;
|
||||
let d_packed = unpack2x16float(load_u32_at_src0_aligned(dm_byte_base));
|
||||
let d = f16(d_packed[0]);
|
||||
let dmin = f16(d_packed[1]);
|
||||
|
||||
let global_m = offset_m + tile_m;
|
||||
let global_k = k_outer + tile_k;
|
||||
let chunk = k_in_block / 128u;
|
||||
let pos_in_chunk = k_in_block % 32u;
|
||||
let sub_block = k_in_block / 16u;
|
||||
let shift_phase = (k_in_block % 128u) / 32u;
|
||||
|
||||
if (global_m >= params.m || global_k >= params.k) {
|
||||
shmem[elem_idx] = f16(0.0);
|
||||
continue;
|
||||
}
|
||||
// whole 2 bits (4 elems)
|
||||
let qs_word = load_u32_at_src0_aligned(qs_byte_base + 32u * chunk + 1u * pos_in_chunk);
|
||||
let qs_vec4 = vec4<f16>(
|
||||
f16((qs_word >> (2u * shift_phase + 0u)) & 0x3u),
|
||||
f16((qs_word >> (2u * shift_phase + 8u)) & 0x3u),
|
||||
f16((qs_word >> (2u * shift_phase + 16u)) & 0x3u),
|
||||
f16((qs_word >> (2u * shift_phase + 24u)) & 0x3u),
|
||||
);
|
||||
|
||||
let block_k = global_k / BLOCK_SIZE;
|
||||
let k_in_block = global_k % BLOCK_SIZE;
|
||||
let scale = load_byte_at_src0_aligned(scales_byte_base + sub_block);
|
||||
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
let dl = d * f16(scale & 0xFu);
|
||||
let ml = dmin * f16(scale >> 4u);
|
||||
|
||||
let d = load_f16_at_src0(block_byte_base + 80u);
|
||||
let dmin = load_f16_at_src0(block_byte_base + 82u);
|
||||
store_shmem_kquants(qs_vec4 * dl - ml, elem_idx);
|
||||
#elif INIT_SRC0_SHMEM_Q3_K
|
||||
let block_byte_base = src0_idx * 110u; // BLOCK_SIZE_BYTES = 110u;
|
||||
let hmask_byte_base = block_byte_base + 0u;
|
||||
let qs_byte_base = block_byte_base + 32u;
|
||||
let scales_byte_base = block_byte_base + 96u;
|
||||
|
||||
// Decode the element at position k_in_block
|
||||
let block_of_32 = k_in_block / 32u;
|
||||
let pos_in_32 = k_in_block % 32u;
|
||||
let d_all = load_f16_at_src0(block_byte_base + 108u);
|
||||
|
||||
let q_b_idx = (block_of_32 / 4u) * 32u;
|
||||
let shift = (block_of_32 % 4u) * 2u;
|
||||
let k = (pos_in_32 / 16u) * 16u;
|
||||
let l = pos_in_32 % 16u;
|
||||
let chunk = k_in_block / 128u;
|
||||
let pos_in_chunk = k_in_block % 32u;
|
||||
let sub_block = k_in_block / 16u;
|
||||
let shift_phase = (k_in_block % 128u) / 32u;
|
||||
|
||||
let is = k_in_block / 16u;
|
||||
let hmask_block = pos_in_chunk;
|
||||
let hmask_shift_phase = k_in_block / 32u;
|
||||
|
||||
let sc_packed = load_u32_at_src0(block_byte_base + 4u * (is / 4u));
|
||||
let sc = get_byte(sc_packed, is % 4u);
|
||||
// low 2 bits (4 elems)
|
||||
let q_lo2_word = load_u32_at_src0(qs_byte_base + 32u * chunk + 1u * hmask_block);
|
||||
let q_lo2_vec4 = vec4<f16>(
|
||||
f16((q_lo2_word >> (2u * shift_phase + 0u)) & 3u),
|
||||
f16((q_lo2_word >> (2u * shift_phase + 8u)) & 3u),
|
||||
f16((q_lo2_word >> (2u * shift_phase + 16u)) & 3u),
|
||||
f16((q_lo2_word >> (2u * shift_phase + 24u)) & 3u)
|
||||
);
|
||||
|
||||
let dl = d * f16(sc & 0xFu);
|
||||
let ml = dmin * f16(sc >> 4u);
|
||||
// high 1 bit (4 elems)
|
||||
let q_hi1_word = load_u32_at_src0(hmask_byte_base + pos_in_chunk);
|
||||
let q_hi1_vec4 = vec4<f16>(
|
||||
f16(select(4.0, 0.0, ((q_hi1_word >> (1u * hmask_shift_phase + 0u)) & 1u) == 1u)),
|
||||
f16(select(4.0, 0.0, ((q_hi1_word >> (1u * hmask_shift_phase + 8u)) & 1u) == 1u)),
|
||||
f16(select(4.0, 0.0, ((q_hi1_word >> (1u * hmask_shift_phase + 16u)) & 1u) == 1u)),
|
||||
f16(select(4.0, 0.0, ((q_hi1_word >> (1u * hmask_shift_phase + 24u)) & 1u) == 1u))
|
||||
);
|
||||
|
||||
let q_idx = q_b_idx + k + l;
|
||||
let q_packed = load_u32_at_src0(block_byte_base + 16u + 4u * (q_idx / 4u));
|
||||
let q_byte = get_byte(q_packed, q_idx % 4u);
|
||||
let qs_val = (q_byte >> shift) & 3u;
|
||||
let q_vec4 = q_lo2_vec4 - q_hi1_vec4;
|
||||
|
||||
let q_val = f16(qs_val) * dl - ml;
|
||||
shmem[elem_idx] = q_val;
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q2_K
|
||||
let scale_low4 = (load_byte_at_src0_aligned(scales_byte_base + (sub_block % 8u)) >> (4u * (sub_block / 8u))) & 0xFu;
|
||||
let scale_hi2 = (load_byte_at_src0_aligned(scales_byte_base + 8u + (sub_block % 4u)) >> (2u * (sub_block / 4u))) & 3u;
|
||||
let dl = d_all * (f16((scale_hi2 << 4u) | scale_low4) - 32.0);
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q3_K
|
||||
const BLOCK_SIZE = 256u;
|
||||
const BLOCK_SIZE_BYTES = 110u;
|
||||
store_shmem_kquants(dl * q_vec4, elem_idx);
|
||||
#elif INIT_SRC0_SHMEM_Q4_K
|
||||
let block_byte_base = src0_idx * 144u; // BLOCK_SIZE_BYTES = 144u;
|
||||
let dm_byte_base = block_byte_base + 0u;
|
||||
let scale_byte_base = block_byte_base + 4u;
|
||||
let qs_byte_base = block_byte_base + 16u;
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
|
||||
let tile_m = elem_idx / TILE_K;
|
||||
let tile_k = elem_idx % TILE_K;
|
||||
let dm = unpack2x16float(load_u32_at_src0_aligned(dm_byte_base));
|
||||
let d = f16(dm[0]);
|
||||
let dmin = f16(dm[1]);
|
||||
|
||||
let global_m = offset_m + tile_m;
|
||||
let global_k = k_outer + tile_k;
|
||||
let chunk = k_in_block / 64u;
|
||||
let pos_in_chunk = (k_in_block % 64u) % 32u;
|
||||
let sub_block = k_in_block / 32u;
|
||||
let shift_phase = sub_block & 1u;
|
||||
|
||||
if (global_m >= params.m || global_k >= params.k) {
|
||||
shmem[elem_idx] = f16(0.0);
|
||||
continue;
|
||||
}
|
||||
|
||||
let block_k = global_k / BLOCK_SIZE;
|
||||
let k_in_block = global_k % BLOCK_SIZE;
|
||||
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
|
||||
let d = load_f16_at_src0(block_byte_base + 108u);
|
||||
|
||||
// Load and unpack scales
|
||||
let kmask1: u32 = 0x03030303u;
|
||||
let kmask2: u32 = 0x0f0f0f0fu;
|
||||
|
||||
var scale_vals: array<u32, 4>;
|
||||
for (var i: u32 = 0u; i < 4u; i++) {
|
||||
scale_vals[i] = load_u32_at_src0(block_byte_base + 96u + 4u * i);
|
||||
}
|
||||
|
||||
var tmp: u32 = scale_vals[2];
|
||||
scale_vals[2] = ((scale_vals[0] >> 4u) & kmask2) | (((tmp >> 4u) & kmask1) << 4u);
|
||||
scale_vals[3] = ((scale_vals[1] >> 4u) & kmask2) | (((tmp >> 6u) & kmask1) << 4u);
|
||||
scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4u);
|
||||
scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2u) & kmask1) << 4u);
|
||||
|
||||
// Load hmask and qs arrays
|
||||
var hmask_vals: array<u32, 8>;
|
||||
for (var i: u32 = 0u; i < 8u; i++) {
|
||||
hmask_vals[i] = load_u32_at_src0(block_byte_base + 4u * i);
|
||||
}
|
||||
|
||||
var qs_vals: array<u32, 16>;
|
||||
for (var i: u32 = 0u; i < 16u; i++) {
|
||||
qs_vals[i] = load_u32_at_src0(block_byte_base + 32u + 4u * i);
|
||||
}
|
||||
|
||||
let half = k_in_block / 128u; // 0 or 1
|
||||
let pos_in_half = k_in_block % 128u; // 0-127
|
||||
let shift_group = pos_in_half / 32u; // 0-3
|
||||
let pos_in_32 = pos_in_half % 32u; // 0-31
|
||||
let k_group = pos_in_32 / 16u; // 0 or 1
|
||||
let l = pos_in_32 % 16u; // 0-15
|
||||
|
||||
let q_b_idx = half * 32u; // 0 or 32
|
||||
let shift = shift_group * 2u; // 0, 2, 4, 6
|
||||
let k = k_group * 16u; // 0 or 16
|
||||
let is = k_in_block / 16u; // 0-15
|
||||
|
||||
// m increments every 32 elements across entire 256 element block
|
||||
let m_shift = k_in_block / 32u; // 0-7
|
||||
let m: u32 = 1u << m_shift; // 1,2,4,8,16,32,64,128
|
||||
|
||||
let sc = get_byte(scale_vals[is / 4u], is % 4u);
|
||||
let dl = d * (f16(sc) - 32.0);
|
||||
|
||||
let q_idx = q_b_idx + k + l;
|
||||
let hm_idx = k + l;
|
||||
|
||||
let q_byte = get_byte(qs_vals[q_idx / 4u], q_idx % 4u);
|
||||
let hmask_byte = get_byte(hmask_vals[hm_idx / 4u], hm_idx % 4u);
|
||||
|
||||
let hm = select(4.0, 0.0, (hmask_byte & m) != 0);
|
||||
let qs_val = (q_byte >> shift) & 3u;
|
||||
|
||||
let q_val = (f16(qs_val) - f16(hm)) * dl;
|
||||
shmem[elem_idx] = q_val;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // INIT_SRC0_SHMEM_Q3_K
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q4_K
|
||||
const BLOCK_SIZE = 256u;
|
||||
const BLOCK_SIZE_BYTES = 144u;
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
|
||||
let tile_m = elem_idx / TILE_K;
|
||||
let tile_k = elem_idx % TILE_K;
|
||||
|
||||
let global_m = offset_m + tile_m;
|
||||
let global_k = k_outer + tile_k;
|
||||
|
||||
if (global_m >= params.m || global_k >= params.k) {
|
||||
shmem[elem_idx] = f16(0.0);
|
||||
continue;
|
||||
}
|
||||
|
||||
let block_k = global_k / BLOCK_SIZE;
|
||||
let k_in_block = global_k % BLOCK_SIZE;
|
||||
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
let dmin = load_f16_at_src0(block_byte_base + 2u);
|
||||
|
||||
// Map k_in_block to loop structure:
|
||||
// Outer loop over 64-element groups (alternating q_b_idx)
|
||||
// Inner loop over 2 shifts per group
|
||||
let group_of_64 = k_in_block / 64u; // 0-3 (maps to q_b_idx)
|
||||
let pos_in_64 = k_in_block % 64u; // 0-63
|
||||
let shift_group = pos_in_64 / 32u; // 0 or 1
|
||||
let l = pos_in_64 % 32u; // 0-31
|
||||
|
||||
let q_b_idx = group_of_64 * 32u; // 0, 32, 64, 96
|
||||
let shift = shift_group * 4u; // 0 or 4
|
||||
let is = k_in_block / 32u; // 0-7
|
||||
// whole 4 bits (4 elems)
|
||||
let qs_word = load_u32_at_src0_aligned(qs_byte_base + 32u * chunk + 1u * pos_in_chunk);
|
||||
let qs_vec4 = vec4<f16>(
|
||||
f16((qs_word >> (4u * shift_phase + 0u)) & 0xFu),
|
||||
f16((qs_word >> (4u * shift_phase + 8u)) & 0xFu),
|
||||
f16((qs_word >> (4u * shift_phase + 16u)) & 0xFu),
|
||||
f16((qs_word >> (4u * shift_phase + 24u)) & 0xFu)
|
||||
);
|
||||
|
||||
var sc: u32;
|
||||
var mn: u32;
|
||||
|
||||
let scale_base = block_byte_base + 4u;
|
||||
|
||||
if (is < 4u) {
|
||||
let sc_byte = get_byte(load_u32_at_src0(scale_base), is % 4u);
|
||||
let min_byte = get_byte(load_u32_at_src0(scale_base + 4), is % 4u);
|
||||
sc = sc_byte & 63u;
|
||||
mn = min_byte & 63u;
|
||||
if (sub_block < 4u) {
|
||||
let sc_byte = get_byte(load_u32_at_src0_aligned(scale_byte_base), sub_block % 4u);
|
||||
let min_byte = get_byte(load_u32_at_src0_aligned(scale_byte_base + 4), sub_block % 4u);
|
||||
sc = sc_byte & 63u;
|
||||
mn = min_byte & 63u;
|
||||
} else {
|
||||
let sc_min_lo = get_byte(load_u32_at_src0(scale_base + 8), (is + 4u) % 4u);
|
||||
let sc_hi = get_byte(load_u32_at_src0(scale_base), (is - 4u) % 4u);
|
||||
let min_hi = get_byte(load_u32_at_src0(scale_base + 4), is % 4u);
|
||||
|
||||
sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
|
||||
mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);
|
||||
let sc_min_lo = get_byte(load_u32_at_src0_aligned(scale_byte_base + 8), (sub_block + 4u) % 4u);
|
||||
let sc_hi = get_byte(load_u32_at_src0_aligned(scale_byte_base), (sub_block - 4u) % 4u);
|
||||
let min_hi = get_byte(load_u32_at_src0_aligned(scale_byte_base + 4), sub_block % 4u);
|
||||
sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
|
||||
mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);
|
||||
}
|
||||
|
||||
let dl = d * f16(sc);
|
||||
let ml = dmin * f16(mn);
|
||||
|
||||
let q_idx = q_b_idx + l;
|
||||
let q_packed = load_u32_at_src0(block_byte_base + 16u + 4u * (q_idx / 4u));
|
||||
store_shmem_kquants(dl * qs_vec4 - vec4(ml, ml, ml, ml), elem_idx);
|
||||
#elif INIT_SRC0_SHMEM_Q5_K
|
||||
let block_byte_base = src0_idx * 176u; // BLOCK_SIZE_BYTES = 176u;
|
||||
let dm_byte_base = block_byte_base + 0u;
|
||||
let scale_byte_base = block_byte_base + 4u;
|
||||
let qh_byte_base = block_byte_base + 16u;
|
||||
let qs_byte_base = block_byte_base + 48u;
|
||||
|
||||
let q_byte = get_byte(q_packed, q_idx % 4u);
|
||||
let qs_val = (q_byte >> shift) & 0xFu;
|
||||
let dm = unpack2x16float(load_u32_at_src0_aligned(dm_byte_base));
|
||||
let d = f16(dm[0]);
|
||||
let dmin = f16(dm[1]);
|
||||
|
||||
let q_val = f16(qs_val) * dl - ml;
|
||||
shmem[elem_idx] = q_val;
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q4_K
|
||||
let chunk = k_in_block / 64u;
|
||||
let pos_in_chunk = (k_in_block % 64u) % 32u;
|
||||
let sub_block = k_in_block / 32u;
|
||||
let shift_phase = sub_block & 1u;
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q5_K
|
||||
const BLOCK_SIZE = 256u;
|
||||
const BLOCK_SIZE_BYTES = 176u;
|
||||
let qh_block = k_in_block % 32u;
|
||||
let qh_shift_phase = sub_block;
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
|
||||
let tile_m = elem_idx / TILE_K;
|
||||
let tile_k = elem_idx % TILE_K;
|
||||
// low 4 bits (4 elems)
|
||||
let qs_word = load_u32_at_src0_aligned(qs_byte_base + 32u * chunk + 1u * pos_in_chunk);
|
||||
let qs_lo4_vec4 = vec4<f16>(
|
||||
f16((qs_word >> (4u * shift_phase + 0u)) & 0xFu),
|
||||
f16((qs_word >> (4u * shift_phase + 8u)) & 0xFu),
|
||||
f16((qs_word >> (4u * shift_phase + 16u)) & 0xFu),
|
||||
f16((qs_word >> (4u * shift_phase + 24u)) & 0xFu)
|
||||
);
|
||||
|
||||
let global_m = offset_m + tile_m;
|
||||
let global_k = k_outer + tile_k;
|
||||
|
||||
if (global_m >= params.m || global_k >= params.k) {
|
||||
shmem[elem_idx] = f16(0.0);
|
||||
continue;
|
||||
}
|
||||
|
||||
let block_k = global_k / BLOCK_SIZE;
|
||||
let k_in_block = global_k % BLOCK_SIZE;
|
||||
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
|
||||
let d = load_f16_at_src0(block_byte_base);
|
||||
let dmin = load_f16_at_src0(block_byte_base + 2u);
|
||||
|
||||
|
||||
// The original loop processes elements in groups of 64
|
||||
// Each group of 64: q_b_idx cycles through [0,32,64,96], shift cycles [0,4]
|
||||
// But u increments EVERY 32 elements (after each l loop)
|
||||
let group_of_64 = k_in_block / 64u; // 0-3
|
||||
let pos_in_64 = k_in_block % 64u; // 0-63
|
||||
let shift_group = pos_in_64 / 32u; // 0 or 1
|
||||
let l = pos_in_64 % 32u; // 0-31
|
||||
|
||||
let q_b_idx = group_of_64 * 32u; // 0, 32, 64, 96
|
||||
let shift = shift_group * 4u; // 0 or 4
|
||||
let is = k_in_block / 32u; // 0-7
|
||||
|
||||
// u increments every 32 elements (0->1, 1->2, 2->4, 3->8, 4->16, 5->32, 6->64, 7->128)
|
||||
let u_shift = k_in_block / 32u; // 0-7
|
||||
let u: u32 = 1u << u_shift;
|
||||
// high 1 bit (4 elems)
|
||||
let qh_word = load_u32_at_src0_aligned(qh_byte_base + qh_block);
|
||||
let qh_vec4 = vec4<f16>(
|
||||
f16(select(0.0, 16.0, ((qh_word >> (1u * qh_shift_phase + 0u)) & 1u) == 1u)),
|
||||
f16(select(0.0, 16.0, ((qh_word >> (1u * qh_shift_phase + 8u)) & 1u) == 1u)),
|
||||
f16(select(0.0, 16.0, ((qh_word >> (1u * qh_shift_phase + 16u)) & 1u) == 1u)),
|
||||
f16(select(0.0, 16.0, ((qh_word >> (1u * qh_shift_phase + 24u)) & 1u) == 1u))
|
||||
);
|
||||
|
||||
var sc: u32;
|
||||
var mn: u32;
|
||||
|
||||
let scale_base = block_byte_base + 4u;
|
||||
|
||||
if (is < 4u) {
|
||||
let sc_byte = get_byte(load_u32_at_src0(scale_base), is % 4u);
|
||||
let min_byte = get_byte(load_u32_at_src0(scale_base + 4), is % 4u);
|
||||
sc = sc_byte & 63u;
|
||||
mn = min_byte & 63u;
|
||||
if (sub_block < 4u) {
|
||||
let sc_byte = get_byte(load_u32_at_src0_aligned(scale_byte_base), sub_block % 4u);
|
||||
let min_byte = get_byte(load_u32_at_src0_aligned(scale_byte_base + 4), sub_block % 4u);
|
||||
sc = sc_byte & 63u;
|
||||
mn = min_byte & 63u;
|
||||
} else {
|
||||
let sc_min_lo = get_byte(load_u32_at_src0(scale_base + 8), (is + 4u) % 4u);
|
||||
let sc_hi = get_byte(load_u32_at_src0(scale_base), (is - 4u) % 4u);
|
||||
let min_hi = get_byte(load_u32_at_src0(scale_base + 4), is % 4u);
|
||||
|
||||
sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
|
||||
mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);
|
||||
let sc_min_lo = get_byte(load_u32_at_src0_aligned(scale_byte_base + 8), (sub_block + 4u) % 4u);
|
||||
let sc_hi = get_byte(load_u32_at_src0_aligned(scale_byte_base), (sub_block - 4u) % 4u);
|
||||
let min_hi = get_byte(load_u32_at_src0_aligned(scale_byte_base + 4), sub_block % 4u);
|
||||
sc = (sc_min_lo & 0xFu) | ((sc_hi >> 6u) << 4u);
|
||||
mn = (sc_min_lo >> 4u) | ((min_hi >> 6u) << 4u);
|
||||
}
|
||||
|
||||
let dl = d * f16(sc);
|
||||
let ml = dmin * f16(mn);
|
||||
|
||||
let q_idx = q_b_idx + l;
|
||||
let q_packed = load_u32_at_src0(block_byte_base + 48u + 4u * (q_idx / 4u));
|
||||
store_shmem_kquants((qh_vec4 + qs_lo4_vec4) * dl - vec4<f16>(ml, ml, ml, ml), elem_idx);
|
||||
#elif INIT_SRC0_SHMEM_Q6_K
|
||||
let block_byte_base = src0_idx * 210u; // BLOCK_SIZE_BYTES = 210u;
|
||||
let ql_byte_base = block_byte_base;
|
||||
let qh_byte_base = block_byte_base + 128u;
|
||||
let scales_byte_base = block_byte_base + 192u;
|
||||
let d_byte_base = block_byte_base + 208u;
|
||||
|
||||
let q_byte = get_byte(q_packed, q_idx % 4u);
|
||||
let d = load_f16_at_src0(d_byte_base);
|
||||
|
||||
let qh_packed = load_u32_at_src0(block_byte_base + 16u + 4u * (l / 4u));
|
||||
let chunk = k_in_block / 128u;
|
||||
let ql_pos_in_chunk = (k_in_block % 128u) % 64u;
|
||||
let qh_pos_in_chunk = (k_in_block % 128u) % 32u;
|
||||
let sub_block = k_in_block / 16u;
|
||||
let ql_shift_phase = (k_in_block % 128u) / 64u;
|
||||
let qh_shift_phase = (k_in_block % 128u) / 32u;
|
||||
|
||||
let qh_byte = get_byte(qh_packed, l % 4u);
|
||||
// low 4 bits (4 elems)
|
||||
let ql_word = load_u32_at_src0(ql_byte_base + 64u * chunk + 1u * ql_pos_in_chunk);
|
||||
let ql_lo4_vec4 = vec4<u32>(
|
||||
(ql_word >> (4u * ql_shift_phase + 0u)) & 0xFu,
|
||||
(ql_word >> (4u * ql_shift_phase + 8u)) & 0xFu,
|
||||
(ql_word >> (4u * ql_shift_phase + 16u)) & 0xFu,
|
||||
(ql_word >> (4u * ql_shift_phase + 24u)) & 0xFu
|
||||
);
|
||||
|
||||
let qs_val = (q_byte >> shift) & 0xFu;
|
||||
let qh_val = select(0.0, 16.0, (qh_byte & u) != 0);
|
||||
// hi 2 bits (4 elems)
|
||||
let qh_word = load_u32_at_src0(qh_byte_base + 32u * chunk + 1u * qh_pos_in_chunk);
|
||||
let qh_hi2_vec4 = vec4<u32>(
|
||||
((qh_word >> (2u * qh_shift_phase + 0u)) & 0x3u) << 4u,
|
||||
((qh_word >> (2u * qh_shift_phase + 8u)) & 0x3u) << 4u,
|
||||
((qh_word >> (2u * qh_shift_phase + 16u)) & 0x3u) << 4u,
|
||||
((qh_word >> (2u * qh_shift_phase + 24u)) & 0x3u) << 4u,
|
||||
);
|
||||
|
||||
let q_val = (f16(qs_val) + f16(qh_val)) * dl - ml;
|
||||
shmem[elem_idx] = q_val;
|
||||
let q_vec4 = vec4<f16>(qh_hi2_vec4 | ql_lo4_vec4) - vec4<f16>(32.0, 32.0, 32.0, 32.0);
|
||||
|
||||
let scale_byte = scales_byte_base + 1u * sub_block;
|
||||
let scale_word = load_u32_at_src0_aligned(scale_byte);
|
||||
let scale = get_byte_i32(scale_word, scale_byte & 3u);
|
||||
|
||||
store_shmem_kquants(d * q_vec4 * f16(scale), elem_idx);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
#endif // INIT_SRC0_SHMEM_Q5_K
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_Q6_K
|
||||
const BLOCK_SIZE = 256u;
|
||||
const BLOCK_SIZE_BYTES = 210u;
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) {
|
||||
let tile_m = elem_idx / TILE_K;
|
||||
let tile_k = elem_idx % TILE_K;
|
||||
|
||||
let global_m = offset_m + tile_m;
|
||||
let global_k = k_outer + tile_k;
|
||||
|
||||
if (global_m >= params.m || global_k >= params.k) {
|
||||
shmem[elem_idx] = f16(0.0);
|
||||
continue;
|
||||
}
|
||||
|
||||
let block_k = global_k / BLOCK_SIZE;
|
||||
let k_in_block = global_k % BLOCK_SIZE;
|
||||
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
|
||||
let half = k_in_block / 128u;
|
||||
let pos_in_half = k_in_block % 128u;
|
||||
let quarter = pos_in_half / 32u;
|
||||
let l = pos_in_half % 32u;
|
||||
|
||||
let ql_b_idx = half * 64u;
|
||||
let qh_b_idx = half * 32u;
|
||||
let sc_b_idx = half * 8u;
|
||||
|
||||
// Load only ql13 word needed
|
||||
let ql13_flat = ql_b_idx + l;
|
||||
let ql13 = load_u32_at_src0(block_byte_base + ql13_flat);
|
||||
let ql13_b = get_byte(ql13, 0u);
|
||||
|
||||
// Load only ql24 word needed
|
||||
let ql24_flat = ql_b_idx + l + 32u;
|
||||
let ql24 = load_u32_at_src0(block_byte_base + ql24_flat);
|
||||
let ql24_b = get_byte(ql24, 0u);
|
||||
|
||||
// Load only qh word needed
|
||||
let qh_flat = qh_b_idx + l;
|
||||
let qh = load_u32_at_src0(block_byte_base + 128u + qh_flat);
|
||||
let qh_b = get_byte(qh, 0u);
|
||||
|
||||
let q1 = f16((ql13_b & 0xFu) | ((qh_b & 3u) << 4u)) - f16(32.0);
|
||||
let q2 = f16((ql24_b & 0xFu) | (((qh_b >> 2u) & 3u) << 4u)) - f16(32.0);
|
||||
let q3 = f16((ql13_b >> 4u) | (((qh_b >> 4u) & 3u) << 4u)) - f16(32.0);
|
||||
let q4 = f16((ql24_b >> 4u) | (((qh_b >> 6u) & 3u) << 4u)) - f16(32.0);
|
||||
|
||||
// Load only the scale word needed
|
||||
let is = l / 16u;
|
||||
let sc_idx = sc_b_idx + is + quarter * 2u;
|
||||
let sc = load_u32_at_src0(block_byte_base + 192u + sc_idx);
|
||||
let sc_val = get_byte_i32(sc, 0u);
|
||||
|
||||
let d = load_f16_at_src0(block_byte_base + 208u);
|
||||
|
||||
var q_val: f16;
|
||||
if (quarter == 0u) {
|
||||
q_val = q1;
|
||||
} else if (quarter == 1u) {
|
||||
q_val = q2;
|
||||
} else if (quarter == 2u) {
|
||||
q_val = q3;
|
||||
} else {
|
||||
q_val = q4;
|
||||
}
|
||||
|
||||
shmem[elem_idx] = d * f16(sc_val) * q_val;
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_Q6_K
|
||||
#endif // k-quants
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_IQ4_NL
|
||||
const BLOCK_SIZE = 32u;
|
||||
@@ -1155,48 +924,3 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_IQ3_S
|
||||
|
||||
#ifdef INIT_SRC0_SHMEM_MXFP4
|
||||
const BLOCK_SIZE = 32u;
|
||||
const BLOCK_SIZE_BYTES = 17u;
|
||||
// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types.
|
||||
override BLOCKS_K = TILE_K/BLOCK_SIZE;
|
||||
const NQ = 16u;
|
||||
const BYTES_PER_THREAD = 8u; // NQ(16) weights uses 8 bytes of q
|
||||
const BYTES_PER_INNER_LOOP = 4u; // == sizeof(q_packed)
|
||||
|
||||
fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) {
|
||||
for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) {
|
||||
let blck_idx = i / BLOCK_SIZE;
|
||||
let block_offset = (i % BLOCK_SIZE) / NQ;
|
||||
let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * BYTES_PER_THREAD;
|
||||
|
||||
let tile_m = blck_idx / BLOCKS_K;
|
||||
let global_m = offset_m + tile_m;
|
||||
let block_k = blck_idx % BLOCKS_K;
|
||||
let global_block_k = k_outer / BLOCK_SIZE + block_k;
|
||||
|
||||
if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) {
|
||||
let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k;
|
||||
let block_byte_base = src0_idx * BLOCK_SIZE_BYTES;
|
||||
let eu8 = get_byte(load_u32_at_src0(block_byte_base), 0);
|
||||
let e = ldexp(1.0, i32(eu8) - 128);
|
||||
|
||||
// store NQ(16) weights
|
||||
for (var j = 0u; j < BYTES_PER_THREAD / BYTES_PER_INNER_LOOP; j += 1) {
|
||||
|
||||
let q_byte_offset = block_byte_base + 1u + block_offset * BYTES_PER_THREAD + j * BYTES_PER_INNER_LOOP;
|
||||
let q_packed = load_u32_at_src0(q_byte_offset);
|
||||
|
||||
for (var k = 0u; k < BYTES_PER_INNER_LOOP; k++) {
|
||||
let q_byte = get_byte(q_packed, k);
|
||||
let q_hi = f32(kvalues_mxfp4[(q_byte >> 4) & 0xF]) * e;
|
||||
let q_lo = f32(kvalues_mxfp4[q_byte & 0xF]) * e;
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k] = f16(q_lo);
|
||||
shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = f16(q_hi);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // INIT_SRC0_SHMEM_MXFP4
|
||||
|
||||
@@ -43,12 +43,14 @@ struct Params {
|
||||
var<storage, read_write> src: array<f32>;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x >= params.ne) {
|
||||
fn main(
|
||||
@builtin(global_invocation_id) gid: vec3<u32>,
|
||||
@builtin(num_workgroups) num_wg: vec3<u32>) {
|
||||
let threads_per_group = u32(WG_SIZE);
|
||||
var i = gid.x + (num_wg.x * threads_per_group) * gid.y;
|
||||
if (i >= params.ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
var i = gid.x;
|
||||
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
|
||||
i = i % (params.ne2 * params.ne1 * params.ne0);
|
||||
let i2 = i / (params.ne1 * params.ne0);
|
||||
|
||||
@@ -66,11 +66,14 @@ fn erf_approx(x: TYPE) -> TYPE {
|
||||
}
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
if (gid.x >= params.ne) {
|
||||
fn main(@builtin(global_invocation_id) gid: vec3<u32>,
|
||||
@builtin(num_workgroups) num_wg: vec3<u32>) {
|
||||
let threads_per_group = u32(WG_SIZE);
|
||||
let flat_i = gid.x + (num_wg.x * threads_per_group) * gid.y;
|
||||
if (flat_i >= params.ne) {
|
||||
return;
|
||||
}
|
||||
var i = gid.x;
|
||||
var i = flat_i;
|
||||
let ne2 = params.ne2;
|
||||
#ifdef DIAG
|
||||
let ne1 = params.ne0;
|
||||
@@ -205,6 +208,6 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
#ifdef INPLACE
|
||||
src[params.offset_src + src_idx] = res;
|
||||
#else
|
||||
dst[params.offset_dst + gid.x] = res;
|
||||
dst[params.offset_dst + flat_i] = res;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -538,6 +538,8 @@ class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD = auto()
|
||||
TOKEN_EMBD_NORM = auto()
|
||||
MASKED_EMBD_CENTROIDS= auto()
|
||||
MASKED_EMBD_ORDERING = auto()
|
||||
TOKEN_TYPES = auto()
|
||||
POS_EMBD = auto()
|
||||
OUTPUT = auto()
|
||||
@@ -1087,6 +1089,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||
MODEL_TENSOR.MASKED_EMBD_CENTROIDS: "masked_embd_centroids",
|
||||
MODEL_TENSOR.MASKED_EMBD_ORDERING: "masked_embd_ordering",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
@@ -2586,6 +2590,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_ARCH.GEMMA4_ASSISTANT: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.MASKED_EMBD_CENTROIDS,
|
||||
MODEL_TENSOR.MASKED_EMBD_ORDERING,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.NEXTN_PROJ_PRE,
|
||||
MODEL_TENSOR.NEXTN_PROJ_POST,
|
||||
|
||||
@@ -37,6 +37,14 @@ class TensorNameMap:
|
||||
"model.embed", # talkie
|
||||
),
|
||||
|
||||
# Masked embeddings
|
||||
MODEL_TENSOR.MASKED_EMBD_CENTROIDS: (
|
||||
"masked_embedding.centroids", # gemma-4 E2B/E4B assistants
|
||||
),
|
||||
MODEL_TENSOR.MASKED_EMBD_ORDERING: (
|
||||
"masked_embedding.token_ordering", # gemma-4 E2B/E4B assistants
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
MODEL_TENSOR.TOKEN_TYPES: (
|
||||
"embeddings.token_type_embeddings", # bert nomic-bert
|
||||
|
||||
@@ -559,6 +559,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
||||
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
|
||||
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
|
||||
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
|
||||
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
|
||||
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
|
||||
};
|
||||
|
||||
// declare information about the model weight tensors:
|
||||
@@ -783,6 +785,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
// latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
|
||||
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
|
||||
};
|
||||
|
||||
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
||||
|
||||
@@ -566,8 +566,11 @@ enum llm_tensor {
|
||||
LLM_TENSOR_NEXTN_HNORM,
|
||||
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
|
||||
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
|
||||
LLM_TENSOR_MASKED_EMBD_CENTROIDS,
|
||||
LLM_TENSOR_MASKED_EMBD_ORDERING,
|
||||
};
|
||||
|
||||
|
||||
enum llm_tensor_layer {
|
||||
LLM_TENSOR_LAYER_INPUT,
|
||||
LLM_TENSOR_LAYER_REPEATING,
|
||||
|
||||
@@ -567,7 +567,10 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
}
|
||||
|
||||
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
// the kq mask guards on its own buffer: shared cells leave idxs unbacked while the mask stays live
|
||||
if (self_kq_mask && self_kq_mask->buffer) {
|
||||
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
// swa tensors may not be allocated if there are no SWA attention layers
|
||||
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
|
||||
@@ -575,7 +578,9 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||
}
|
||||
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
if (self_k_rot) {
|
||||
mctx->get_base()->set_input_k_rot(self_k_rot);
|
||||
@@ -607,7 +612,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
||||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
}
|
||||
|
||||
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
|
||||
if (self_kq_mask && self_kq_mask->buffer) {
|
||||
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
|
||||
}
|
||||
|
||||
// swa tensors may not be allocated if there are no SWA attention layers
|
||||
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
|
||||
@@ -615,7 +622,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
||||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
}
|
||||
|
||||
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
|
||||
if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
|
||||
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -39,6 +39,9 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
|
||||
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
||||
|
||||
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_CENTROIDS, "weight"), {}, TENSOR_NOT_REQUIRED);
|
||||
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_ORDERING), {}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
const int64_t n_embd_backbone = hparams.n_embd_inp();
|
||||
nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0);
|
||||
|
||||
|
||||
@@ -1393,6 +1393,9 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
|
||||
//
|
||||
void server_task_result_cmpl_partial::update(task_result_state & state) {
|
||||
is_updated = true;
|
||||
if (is_begin) {
|
||||
return; // begin marker only flushes headers, skip parsing
|
||||
}
|
||||
state.update_chat_msg(content, true, oaicompat_msg_diffs);
|
||||
|
||||
// Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
|
||||
|
||||
Reference in New Issue
Block a user