Compare commits


2 Commits

Author SHA1 Message Date
Masashi Yoshimura
a95a11e5b8 ggml-webgpu: Improve performance of mat-vec and mat-mat for MUL_MAT_ID (#22464)
* Add mat-vec fast path for MUL_MAT_ID.

* Add shared accumulation vector logic and support for the other types.

* Add i-quant mat-mat support for MUL_MAT_ID and fix some related parts

* Remove n_experts from shader_lib_context.
2026-04-30 14:19:10 -07:00
Reese Levine
5cbfb18075 Update llama-mmap to use ftello/fseeko (#22497)
* Update llama-mmap to work with 32-bit wasm and >2GB models

* Update to gguf.cpp style
2026-04-30 14:17:52 -07:00
6 changed files with 1790 additions and 1297 deletions

View File

@@ -664,7 +664,7 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
}
const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
- size_t bytes_per_kv = 0;
+ size_t bytes_per_kv = 0;
if (!key.kv_direct) {
bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v);
}
@@ -701,10 +701,10 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
(v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u);
const bool kv_vec_type_supported =
K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
- const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && (context.src0->ne[0] % 32 == 0) &&
- (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
- kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
- (context.src2->type == K->type);
+ const bool use_vec = context.supports_subgroups && (context.src0->ne[1] < 20) && (context.src0->ne[0] % 32 == 0) &&
+ (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
+ kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
+ (context.src2->type == K->type);
const bool use_tile = context.supports_subgroups && !context.supports_subgroup_matrix && K->type == GGML_TYPE_F16 &&
V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
(context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
@@ -862,9 +862,12 @@ struct ggml_webgpu_mul_mat_shader_decisions {
struct ggml_webgpu_mul_mat_id_pipeline_key {
ggml_type src0_type;
ggml_type src1_type;
+ uint32_t n_experts;
+ int vectorized;
bool operator==(const ggml_webgpu_mul_mat_id_pipeline_key & other) const {
- return src0_type == other.src0_type && src1_type == other.src1_type;
+ return src0_type == other.src0_type && src1_type == other.src1_type && n_experts == other.n_experts &&
+ vectorized == other.vectorized;
}
};
@@ -873,6 +876,8 @@ struct ggml_webgpu_mul_mat_id_pipeline_key_hash {
size_t seed = 0;
ggml_webgpu_hash_combine(seed, key.src0_type);
ggml_webgpu_hash_combine(seed, key.src1_type);
+ ggml_webgpu_hash_combine(seed, key.n_experts);
+ ggml_webgpu_hash_combine(seed, key.vectorized);
return seed;
}
};
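These two hunks have to stay in sync: every field that participates in operator== (now including n_experts and vectorized) must also feed the hash, or two equal keys could land in different buckets and the cache would silently compile duplicate pipelines. A self-contained sketch of the same pattern, assuming ggml_webgpu_hash_combine is the usual boost-style combiner:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>

// Boost-style combiner, assumed equivalent to ggml_webgpu_hash_combine.
template <typename T>
static void hash_combine(std::size_t & seed, const T & v) {
    seed ^= std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

struct pipeline_key {
    int      src0_type;  // stand-ins for ggml_type
    int      src1_type;
    uint32_t n_experts;
    int      vectorized;
    bool operator==(const pipeline_key & o) const {
        return src0_type == o.src0_type && src1_type == o.src1_type &&
               n_experts == o.n_experts && vectorized == o.vectorized;
    }
};

struct pipeline_key_hash {
    std::size_t operator()(const pipeline_key & k) const {
        std::size_t seed = 0;
        hash_combine(seed, k.src0_type);
        hash_combine(seed, k.src1_type);
        hash_combine(seed, k.n_experts);
        hash_combine(seed, k.vectorized);
        return seed;
    }
};

// One compiled pipeline cached per distinct (types, n_experts, vectorized) combination.
std::unordered_map<pipeline_key, int /* pipeline */, pipeline_key_hash> pipeline_cache;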
@@ -1023,6 +1028,8 @@ class ggml_webgpu_shader_lib {
std::unordered_map<int, webgpu_pipeline> mul_mat_id_gather_pipelines; // key is fixed
std::unordered_map<ggml_webgpu_mul_mat_id_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_id_pipeline_key_hash>
mul_mat_id_pipelines; // src0_type/src1_type
+ std::unordered_map<ggml_webgpu_mul_mat_id_pipeline_key, webgpu_pipeline, ggml_webgpu_mul_mat_id_pipeline_key_hash>
+ mul_mat_id_vec_pipelines; // src0_type/src1_type
std::unordered_map<ggml_webgpu_set_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_set_rows_pipeline_key_hash>
set_rows_pipelines;
@@ -1516,7 +1523,7 @@ class ggml_webgpu_shader_lib {
key.type = context.dst->type;
key.d_state = (int) context.src0->ne[0];
key.xbc_overlap = ggml_webgpu_tensor_overlap(context.src1, context.src4) &&
- ggml_webgpu_tensor_overlap(context.src1, context.src5);
+ ggml_webgpu_tensor_overlap(context.src1, context.src5);
auto it = ssm_scan_pipelines.find(key);
if (it != ssm_scan_pipelines.end()) {
@@ -1633,10 +1640,10 @@ class ggml_webgpu_shader_lib {
ggml_webgpu_mul_mat_vec_pipeline_key key = {};
key.src0_type = context.src0->type;
key.src1_type = context.src1->type;
- key.vectorized = (context.src0->ne[0] % 4 == 0 &&
+ key.vectorized = (context.src0->ne[0] % 4 == 0 &&
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
- 1 :
- 0;
+ 1 :
+ 0;
auto it = mul_mat_vec_pipelines.find(key);
if (it != mul_mat_vec_pipelines.end()) {
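The vectorized flag gates a shader variant that loads vec4 elements at a time, so the reduction length ne[0] must be a multiple of 4, and only plain F32/F16 sources qualify (quantized types go through block dequantization instead; the mat-mat MUL_MAT_ID key below additionally requires ne[1] % 4 == 0). A hypothetical helper restating that predicate:

#include <cstdint>

// Hypothetical restatement of the key.vectorized decision above: vec4 loads
// need the reduction length (ne0) to be a multiple of 4, and only unquantized
// float types can be read without per-block dequantization.
static bool mul_mat_vec_can_vectorize(int64_t ne0, bool src0_is_f32_or_f16) {
    return (ne0 % 4 == 0) && src0_is_f32_or_f16;
}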
@@ -2012,6 +2019,11 @@ class ggml_webgpu_shader_lib {
ggml_webgpu_mul_mat_id_pipeline_key key = {};
key.src0_type = context.src0->type;
key.src1_type = context.src1->type;
+ key.n_experts = context.src0->ne[2];
+ key.vectorized = (context.src0->ne[0] % 4 == 0 && context.src0->ne[1] % 4 == 0 &&
+ (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
+ 1 :
+ 0;
auto it = mul_mat_id_pipelines.find(key);
if (it != mul_mat_id_pipelines.end()) {
@@ -2041,14 +2053,12 @@ class ggml_webgpu_shader_lib {
switch (context.src0->type) {
case GGML_TYPE_F32:
defines.push_back("SRC0_INNER_TYPE=f32");
defines.push_back("FLOAT");
defines.push_back("INIT_SRC0_SHMEM_FLOAT");
defines.push_back("INIT_SRC1_SHMEM_FLOAT");
variant += "_f32";
break;
case GGML_TYPE_F16:
defines.push_back("SRC0_INNER_TYPE=f16");
defines.push_back("FLOAT");
defines.push_back("INIT_SRC0_SHMEM_FLOAT");
defines.push_back("INIT_SRC1_SHMEM_FLOAT");
variant += "_f16";
@@ -2064,12 +2074,32 @@ class ggml_webgpu_shader_lib {
defines.push_back("U32_DEQUANT_HELPERS");
defines.push_back("SRC0_INNER_TYPE=u32");
+ switch (context.src0->type) {
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_IQ4_NL:
+ case GGML_TYPE_IQ4_XS:
+ defines.push_back(type_upper + "_GRID");
+ break;
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ3_S:
+ defines.push_back(type_upper + "_GRID");
+ defines.push_back(type_upper + "_TABLES");
+ break;
+ default:
+ break;
+ }
variant += std::string("_") + src0_name;
break;
}
}
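The new switch pulls per-type constant data into the shader: i-quants decode through shared codebook grids, and some of them also need extra sign/scale tables, so the define list selects which guarded constant arrays the WGSL preprocessor keeps. A sketch of building such a define list (the helper name and the needs_tables split are illustrative, mirroring the cases above):

#include <algorithm>
#include <cctype>
#include <string>
#include <vector>

// Illustrative: "IQ2_XS_GRID"/"IQ2_XS_TABLES"-style defines are assumed to
// guard #ifdef sections holding the codebook constants in the WGSL source.
static std::vector<std::string> iquant_defines(const std::string & type_name,
                                               bool needs_tables) {
    std::string upper = type_name;
    std::transform(upper.begin(), upper.end(), upper.begin(), ::toupper);
    std::vector<std::string> defines = { upper + "_GRID" };
    if (needs_tables) {
        defines.push_back(upper + "_TABLES");  // e.g. iq2_xs sign/scale tables
    }
    return defines;
}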
defines.push_back("SCALAR");
// VEC/SCALAR controls
defines.push_back(key.vectorized ? "VEC" : "SCALAR");
// mul_mat_id is register-tile only.
const uint32_t tile_k =
@@ -2102,6 +2132,123 @@ class ggml_webgpu_shader_lib {
return mul_mat_id_pipelines[key];
}
webgpu_pipeline get_mul_mat_id_vec_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_mul_mat_id_pipeline_key key = {};
key.src0_type = context.src0->type;
key.src1_type = context.src1->type;
key.n_experts = context.src0->ne[2];
key.vectorized = (context.src0->ne[0] % 4 == 0 &&
(context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ?
1 :
0;
auto it = mul_mat_id_vec_pipelines.find(key);
if (it != mul_mat_id_vec_pipelines.end()) {
return it->second;
}
std::vector<std::string> defines;
std::string variant = "mul_mat_id_vec";
const char * shader_src = wgsl_mul_mat_id_vec;
// src1 type
switch (context.src1->type) {
case GGML_TYPE_F32:
defines.push_back("SRC1_INNER_TYPE=f32");
break;
case GGML_TYPE_F16:
defines.push_back("SRC1_INNER_TYPE=f16");
break;
default:
GGML_ABORT("Unsupported src1 type for mul_mat fast shader");
}
// src0 type
switch (context.src0->type) {
case GGML_TYPE_F32:
defines.push_back("SRC0_INNER_TYPE=f32");
defines.push_back("MUL_ACC_FLOAT");
variant += "_f32";
break;
case GGML_TYPE_F16:
defines.push_back("SRC0_INNER_TYPE=f16");
defines.push_back("MUL_ACC_FLOAT");
variant += "_f16";
break;
default:
{
// Quantized types: use helpers but accumulate in f16
const struct ggml_type_traits * src0_traits = ggml_get_type_traits(context.src0->type);
std::string src0_name = src0_traits->type_name;
std::string type_upper = src0_name;
variant += "_" + src0_name;
std::transform(type_upper.begin(), type_upper.end(), type_upper.begin(), ::toupper);
defines.push_back("BYTE_HELPERS");
defines.push_back("MUL_ACC_" + type_upper);
defines.push_back("U32_DEQUANT_HELPERS");
defines.push_back("SRC0_INNER_TYPE=u32");
switch (context.src0->type) {
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
defines.push_back(type_upper + "_GRID");
break;
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
defines.push_back(type_upper + "_GRID");
defines.push_back(type_upper + "_TABLES");
break;
default:
break;
}
break;
}
}
// VEC/SCALAR controls
defines.push_back(key.vectorized ? "VEC" : "SCALAR");
uint32_t wg_size = WEBGPU_MUL_MAT_VEC_WG_SIZE;
uint32_t outputs_per_wg = WEBGPU_MUL_MAT_VEC_FLOAT_OUTPUTS_PER_WG;
if (key.src0_type == GGML_TYPE_Q1_0) {
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
} else if (key.src0_type >= GGML_TYPE_Q2_K) {
outputs_per_wg = WEBGPU_MUL_MAT_VEC_K_Q_OUTPUTS_PER_WG;
} else if (key.src0_type >= GGML_TYPE_Q4_0) {
outputs_per_wg = WEBGPU_MUL_MAT_VEC_LEGACY_Q_OUTPUTS_PER_WG;
}
// variant suffix for src1 type
variant += std::string("_") + (context.src1->type == GGML_TYPE_F32 ? "f32" : "f16");
defines.push_back(std::string("WG_SIZE=") + std::to_string(wg_size));
defines.push_back(std::string("OUTPUTS_PER_WG=") + std::to_string(outputs_per_wg));
defines.push_back(context.supports_subgroups ? "USE_SUBGROUP_REDUCTION" : "USE_WORKGROUP_REDUCTION");
variant += context.supports_subgroups ? "_sg_reduce" : "_wg_reduce";
if (key.vectorized) {
variant += "_vectorized";
}
defines.push_back(std::string("N_EXPERTS=") + std::to_string(key.n_experts));
auto processed = preprocessor.preprocess(shader_src, defines);
auto decisions = std::make_shared<ggml_webgpu_mul_mat_vec_shader_decisions>();
decisions->wg_size = wg_size;
decisions->outputs_per_wg = outputs_per_wg;
webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant);
pipeline.context = decisions;
mul_mat_id_vec_pipelines[key] = pipeline;
return mul_mat_id_vec_pipelines[key];
}
webgpu_pipeline get_unary_pipeline(const ggml_webgpu_shader_lib_context & context) {
const bool is_unary = context.dst->op == GGML_OP_UNARY;
const int op = is_unary ? (int) ggml_get_unary_op(context.dst) : context.dst->op;

View File

@@ -1404,7 +1404,6 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
- case GGML_TYPE_Q8_1:
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
@@ -1527,11 +1526,74 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx,
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}
static webgpu_encoded_op ggml_webgpu_mul_mat_id_vec(webgpu_context & ctx,
ggml_tensor * src0,
ggml_tensor * src1,
ggml_tensor * src2,
ggml_tensor * dst) {
const uint32_t param_n_expert = (uint32_t) src0->ne[2];
const uint32_t param_n_expert_used = (uint32_t) dst->ne[1];
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.src0 = src0;
shader_lib_ctx.src1 = src1;
shader_lib_ctx.src2 = src2;
shader_lib_ctx.dst = dst;
shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups;
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
webgpu_pipeline pipeline = ctx->shader_lib->get_mul_mat_id_vec_pipeline(shader_lib_ctx);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src2) / ggml_type_size(src2->type)),
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
(uint32_t) src0->ne[0],
(uint32_t) src0->ne[1],
param_n_expert,
param_n_expert_used,
(uint32_t) src1->ne[1],
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)),
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
(uint32_t) (src0->nb[2] / ggml_type_size(src0->type)),
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
};
std::vector<wgpu::BindGroupEntry> entries = {
ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(src0), ggml_webgpu_tensor_align_offset(ctx, src0),
ggml_webgpu_tensor_binding_size(ctx, src0)),
ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(src1), ggml_webgpu_tensor_align_offset(ctx, src1),
ggml_webgpu_tensor_binding_size(ctx, src1)),
ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(src2), ggml_webgpu_tensor_align_offset(ctx, src2),
ggml_webgpu_tensor_binding_size(ctx, src2)),
ggml_webgpu_make_bind_group_entry(3, ggml_webgpu_tensor_buf(dst), ggml_webgpu_tensor_align_offset(ctx, dst),
ggml_webgpu_tensor_binding_size(ctx, dst)),
};
uint32_t wg_x = 1;
uint32_t wg_y = 1;
auto * decisions = static_cast<ggml_webgpu_mul_mat_vec_shader_decisions *>(pipeline.context.get());
const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg);
uint32_t total_wg = output_groups * param_n_expert_used;
compute_2d_workgroups(total_wg, max_wg_per_dim, wg_x, wg_y);
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
}
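Each workgroup produces OUTPUTS_PER_WG output rows, so the dispatch needs output_groups × n_expert_used workgroups in total; because that can exceed maxComputeWorkgroupsPerDimension, compute_2d_workgroups folds the count into an x/y grid and the shader linearizes it back as wg_id.y * num_wg.x + wg_id.x. A plausible implementation of such a split (the backend's actual helper may differ):

#include <cstdint>

// A plausible sketch of compute_2d_workgroups: split `total` workgroups
// across x/y so neither dimension exceeds the device limit. The shader
// re-linearizes with wg_id.y * num_wg.x + wg_id.x; any surplus workgroups
// in the rounded-up grid must be filtered by the shader's own bounds logic.
static void compute_2d_workgroups_sketch(uint32_t total, uint32_t max_per_dim,
                                         uint32_t & wg_x, uint32_t & wg_y) {
    if (total <= max_per_dim) {
        wg_x = total;
        wg_y = 1;
    } else {
        wg_x = max_per_dim;
        wg_y = (total + max_per_dim - 1) / max_per_dim;  // ceil-div
    }
}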
static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx,
ggml_tensor * src0,
ggml_tensor * src1,
ggml_tensor * src2,
ggml_tensor * dst) {
// single-token case (dst->ne[2] == 1): we can use the mat-vec fast path
if (dst->ne[2] == 1) {
return ggml_webgpu_mul_mat_id_vec(ctx, src0, src1, src2, dst);
}
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.src0 = src0;
shader_lib_ctx.src1 = src1;
@@ -3879,6 +3941,15 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ3_S:
+ case GGML_TYPE_IQ4_NL:
+ case GGML_TYPE_IQ4_XS:
supports_op = true;
break;
default:

View File

@@ -0,0 +1,154 @@
#ifdef USE_SUBGROUP_REDUCTION
enable subgroups;
#endif
enable f16;
#define DECLARE_BYTE_LOADERS_SRC0
#include "common_decls.tmpl"
#include "mul_mat_vec_acc.tmpl"
struct MulMatIdVecParams {
offset_src0: u32,
offset_src1: u32,
offset_ids: u32,
offset_dst: u32,
k: u32,
m: u32,
n_expert: u32,
n_expert_used: u32,
b_ne1: u32,
stride_01: u32,
stride_11: u32,
stride_02: u32,
stride_12: u32,
};
@group(0) @binding(0) var<storage, read_write> src0: array<SRC0_TYPE>; // [cols, rows, n_expert]
@group(0) @binding(1) var<storage, read_write> src1: array<SRC1_TYPE>; // [cols, b_ne1, n_tokens(1)]
@group(0) @binding(2) var<storage, read_write> ids: array<u32>; // [n_expert_used, n_tokens(1)]
@group(0) @binding(3) var<storage, read_write> dst: array<f32>; // [rows, n_expert_used, n_tokens(1)]
// "mul_mat_vec_acc.tmpl" requires params.k, params.m, params.stride_01
@group(0) @binding(4) var<uniform> params: MulMatIdVecParams;
// Flattened as [row][thread] to keep each row's reduction contiguous in memory.
var<workgroup> partial_sums: array<f32, OUTPUTS_PER_WG * WG_SIZE>;
fn partial_index(row: u32, thread: u32) -> u32 {
return row * WG_SIZE + thread;
}
var<workgroup> gathered_count_ids: array<u32, N_EXPERTS>;
var<workgroup> gathered_expert_used: array<u32, N_EXPERTS>;
@compute @workgroup_size(WG_SIZE)
fn main(
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(num_workgroups) num_wg: vec3<u32>
#ifdef USE_SUBGROUP_REDUCTION
, @builtin(subgroup_id) subgroup_id: u32,
@builtin(subgroup_invocation_id) subgroup_invocation_id: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_size) subgroup_size: u32
#endif
) {
let thread_id = local_id.x;
for (var i = thread_id; i < params.n_expert; i += WG_SIZE) {
gathered_count_ids[i] = 0;
}
workgroupBarrier();
// gather the selected experts for the target token.
for (var col = thread_id; col < params.n_expert_used; col += WG_SIZE) {
let expert = ids[params.offset_ids + col];
gathered_count_ids[expert] = 1;
gathered_expert_used[expert] = col;
}
workgroupBarrier();
let output_groups: u32 = (params.m + OUTPUTS_PER_WG - 1u) / OUTPUTS_PER_WG;
let wg_linear = wg_id.y * num_wg.x + wg_id.x;
var own_expert: u32 = 0;
var wg_in_batch: u32 = 0;
var wg_sum: u32 = 0;
for (var i = 0u; i < params.n_expert; i += 1) {
let wg_vec_count = gathered_count_ids[i]; // 1 or 0
let wg_per_matrix = output_groups * wg_vec_count;
if (wg_sum <= wg_linear && wg_linear < wg_sum + wg_per_matrix) {
own_expert = i;
wg_in_batch = wg_linear - wg_sum;
break;
}
wg_sum += wg_per_matrix;
}
let row_base = (wg_linear % output_groups) * OUTPUTS_PER_WG;
let dst1_stride = params.m;
let src0_batch_offset = params.offset_src0 + own_expert * params.stride_02;
let src1_idx_base = params.offset_src1 + (gathered_expert_used[own_expert] % params.b_ne1) * params.stride_11;
let dst_idx_base = params.offset_dst + gathered_expert_used[own_expert] * dst1_stride + row_base;
let acc = accumulate_vec_dot(thread_id, row_base, src0_batch_offset, src1_idx_base);
#ifdef USE_SUBGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
let subgroup_total = subgroupAdd(acc[row]);
if (subgroup_invocation_id == 0u) {
partial_sums[partial_index(row, subgroup_id)] = subgroup_total;
}
}
workgroupBarrier();
for (var row = subgroup_id; (row < OUTPUTS_PER_WG) && (row_base + row < params.m); row += num_subgroups) {
let output_row = row_base + row;
var row_acc = 0.0f;
for (var k = subgroup_invocation_id; k < num_subgroups; k += subgroup_size) {
row_acc += partial_sums[partial_index(row, k)];
}
let row_total = subgroupAdd(row_acc);
if (subgroup_invocation_id == 0) {
dst[dst_idx_base + row] = row_total;
}
}
#endif
#ifdef USE_WORKGROUP_REDUCTION
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] = acc[row];
}
workgroupBarrier();
var stride: u32 = WG_SIZE / 2u;
while (stride > 0) {
if (thread_id < stride) {
for (var row = 0u; row < OUTPUTS_PER_WG; row++) {
partial_sums[partial_index(row, thread_id)] += partial_sums[partial_index(row, thread_id + stride)];
}
}
workgroupBarrier();
stride = stride / 2;
}
if (thread_id < OUTPUTS_PER_WG) {
let output_row = row_base + thread_id;
if (output_row < params.m) {
dst[dst_idx_base + thread_id] = partial_sums[partial_index(thread_id, 0)];
}
}
#endif
}
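The subtlest part of the shader is mapping a linear workgroup id onto an (expert, output row group) pair: every expert selected for the token contributes exactly output_groups workgroups, unselected experts contribute zero, and the loop walks the expert list accumulating slice sizes until the id falls inside the current expert's slice. A host-side C++ mirror of that mapping, handy for spot-checking assignments (all names here are illustrative):

#include <cstdint>
#include <vector>

// Host-side mirror of the shader's workgroup -> (expert, row group) mapping.
// selected[e] is 1 if expert e was picked for this token, else 0.
static void map_workgroup(uint32_t wg_linear, uint32_t output_groups,
                          const std::vector<uint32_t> & selected,
                          uint32_t & own_expert, uint32_t & wg_in_batch) {
    uint32_t wg_sum = 0;
    own_expert  = 0;
    wg_in_batch = 0;
    for (uint32_t e = 0; e < selected.size(); e++) {
        const uint32_t wg_per_matrix = output_groups * selected[e];  // 0 or output_groups
        if (wg_sum <= wg_linear && wg_linear < wg_sum + wg_per_matrix) {
            own_expert  = e;
            wg_in_batch = wg_linear - wg_sum;
            return;
        }
        wg_sum += wg_per_matrix;
    }
}

// e.g. with output_groups = 4 and experts {1, 3} selected out of 8:
// wg_linear 0..3 -> expert 1, row groups 0..3; wg_linear 4..7 -> expert 3.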

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -40,6 +40,14 @@
#include <TargetConditionals.h>
#endif
+ #ifdef _WIN32
+ # define llama_mmap_ftell _ftelli64
+ # define llama_mmap_fseek _fseeki64
+ #else
+ # define llama_mmap_ftell ftello
+ # define llama_mmap_fseek fseeko
+ #endif
// TODO: consider moving to llama-impl.h if needed in more places
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
@@ -226,7 +234,7 @@ struct llama_file::impl {
size_t tell() const {
if (fd == -1) {
- long ret = std::ftell(fp);
+ off_t ret = llama_mmap_ftell(fp);
if (ret == -1) {
throw std::runtime_error(format("ftell error: %s", strerror(errno)));
}
@@ -244,7 +252,7 @@ struct llama_file::impl {
void seek(size_t offset, int whence) const {
off_t ret = 0;
if (fd == -1) {
- ret = std::fseek(fp, (long) offset, whence);
+ ret = llama_mmap_fseek(fp, offset, whence);
} else {
ret = lseek(fd, offset, whence);
}
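The wrapper macros exist because std::ftell/std::fseek traffic in long, which is 32 bits on Windows (even in 64-bit builds) and on 32-bit targets such as wasm32, capping file positions at 2 GiB; _ftelli64/_fseeki64 and ftello/fseeko use 64-bit offsets instead (on 32-bit POSIX-style targets this may also require building with _FILE_OFFSET_BITS=64 so that off_t is 64-bit). A minimal sketch of the same pattern, with hypothetical demo_* names:

#include <cstdio>
#include <stdexcept>

// Same pattern as the diff above: route tell/seek through 64-bit-offset
// functions so positions past 2 GiB survive the round trip.
#ifdef _WIN32
#    define demo_ftell _ftelli64
#    define demo_fseek _fseeki64
#else
#    define demo_ftell ftello
#    define demo_fseek fseeko
#endif

static size_t file_size_64(FILE * fp) {
    if (demo_fseek(fp, 0, SEEK_END) != 0) {
        throw std::runtime_error("seek failed");
    }
    const auto end = demo_ftell(fp);  // 64-bit even on 32-bit platforms
    if (end < 0) {
        throw std::runtime_error("tell failed");
    }
    return (size_t) end;
}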