mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-25 13:09:46 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f728adab68 | |||
| 3e61ea0e2f | |||
| fdbd6abee2 | |||
| e12a0128ab | |||
| b3ce5cedf4 | |||
| e9fb3b3fc0 | |||
| 9c10954865 | |||
| fdb2c11c70 |
@@ -222,6 +222,16 @@ if (LLAMA_BUILD_APP)
|
||||
add_subdirectory(app)
|
||||
endif()
|
||||
|
||||
# Standalone libmtmd build without pulling in the rest of the tools/ tree.
|
||||
# Useful when packaging just the mtmd library for language bindings (e.g. an
|
||||
# Apple XCFramework, or a WASM build). When the full tools build is enabled,
|
||||
# mtmd is already built by the tools/ subdirectory above; this hook only fires
|
||||
# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target.
|
||||
option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF)
|
||||
if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS))
|
||||
add_subdirectory(tools/mtmd)
|
||||
endif()
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
|
||||
@@ -13,6 +13,7 @@ LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
LLAMA_BUILD_SERVER=OFF
|
||||
LLAMA_BUILD_MTMD=ON
|
||||
GGML_METAL=ON
|
||||
GGML_METAL_EMBED_LIBRARY=ON
|
||||
GGML_BLAS_DEFAULT=ON
|
||||
@@ -39,6 +40,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER}
|
||||
-DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD}
|
||||
-DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY}
|
||||
-DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT}
|
||||
-DGGML_METAL=${GGML_METAL}
|
||||
@@ -126,6 +128,8 @@ setup_framework_structure() {
|
||||
cp ggml/include/ggml-cpu.h ${header_path}
|
||||
cp ggml/include/ggml-blas.h ${header_path}
|
||||
cp ggml/include/gguf.h ${header_path}
|
||||
cp tools/mtmd/mtmd.h ${header_path}
|
||||
cp tools/mtmd/mtmd-helper.h ${header_path}
|
||||
|
||||
# Create module map (common for all platforms)
|
||||
cat > ${module_path}module.modulemap << EOF
|
||||
@@ -247,6 +251,7 @@ combine_static_libraries() {
|
||||
"${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
|
||||
"${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a"
|
||||
)
|
||||
|
||||
# Create temporary directory for processing
|
||||
|
||||
@@ -413,6 +413,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
|
||||
|------------------|----------------------------------------|
|
||||
| Single device | --split-mode none --main-gpu DEVICE_ID |
|
||||
| Multiple devices | --split-mode layer (default) |
|
||||
| Multiple devices | --split-mode tensor (tensor parallelism) |
|
||||
|
||||
`--split-mode tensor` (tensor parallelism) shards each layer across the selected
|
||||
GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
|
||||
left at its default `auto`, so `--split-mode tensor` works out of the box.
|
||||
Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
|
||||
context creation. The default `f16` KV cache is recommended. Tensor parallelism
|
||||
is currently optimized for 2 GPUs; other device counts fall back to a generic
|
||||
all-reduce.
|
||||
|
||||
Examples:
|
||||
|
||||
@@ -715,6 +724,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c
|
||||
|------------------|----------------------------------------|
|
||||
| Single device | --split-mode none --main-gpu DEVICE_ID |
|
||||
| Multiple devices | --split-mode layer (default) |
|
||||
| Multiple devices | --split-mode tensor (tensor parallelism) |
|
||||
|
||||
`--split-mode tensor` (tensor parallelism) shards each layer across the selected
|
||||
GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is
|
||||
left at its default `auto`, so `--split-mode tensor` works out of the box.
|
||||
Passing `--flash-attn off` together with `--split-mode tensor` is rejected at
|
||||
context creation. The default `f16` KV cache is recommended. Tensor parallelism
|
||||
is currently optimized for 2 GPUs; other device counts fall back to a generic
|
||||
all-reduce.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -27,6 +27,14 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int de
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||
|
||||
// Tensor parallelism (--split-mode tensor): comm_init/free/allreduce_tensor
|
||||
// trio queried by the meta-backend via ggml_backend_reg_get_proc_address.
|
||||
// See typedefs in ggml/include/ggml-backend.h. Mirrors the CUDA backend's
|
||||
// pattern (ggml_backend_cuda_comm_*).
|
||||
GGML_BACKEND_API void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends);
|
||||
GGML_BACKEND_API void ggml_backend_sycl_comm_free(void * comm_ctx);
|
||||
GGML_BACKEND_API bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx, struct ggml_tensor ** tensors);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
|
||||
|
||||
@@ -34,26 +34,26 @@ template <float (*bin_op)(const float, const float),
|
||||
static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const src1_t * src1,
|
||||
dst_t * dst,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const uint32_t ne0,
|
||||
const uint32_t ne1,
|
||||
const uint32_t ne2,
|
||||
const uint3 ne3,
|
||||
const uint3 ne10,
|
||||
const uint3 ne11,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
/*const uint32_t s0,*/
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
ggml_cuda_pdl_lc();
|
||||
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
@@ -61,7 +61,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
|
||||
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
|
||||
|
||||
if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
|
||||
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t * src0,
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const uint32_t s0 = blockDim.x * gridDim.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
|
||||
for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) {
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
|
||||
// protect i0 from overflow
|
||||
if (ne0 - i0 <= s0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
const uint3 ne12,
|
||||
const uint3 ne13,
|
||||
/*const int s0,*/
|
||||
const int s1,
|
||||
const int s2,
|
||||
const int s3,
|
||||
const int s00,
|
||||
const int s01,
|
||||
const int s02,
|
||||
const int s03,
|
||||
const int s10,
|
||||
const int s11,
|
||||
const int s12,
|
||||
const int s13,
|
||||
const uint32_t s1,
|
||||
const uint32_t s2,
|
||||
const uint32_t s3,
|
||||
const uint32_t s00,
|
||||
const uint32_t s01,
|
||||
const uint32_t s02,
|
||||
const uint32_t s03,
|
||||
const uint32_t s10,
|
||||
const uint32_t s11,
|
||||
const uint32_t s12,
|
||||
const uint32_t s13,
|
||||
src1_ptrs... src1s) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const uint32_t i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
const uint32_t i3 = fastdiv(i, prod_012);
|
||||
const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
|
||||
@@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0,
|
||||
return;
|
||||
}
|
||||
|
||||
const int i11 = fastmodulo(i1, ne11);
|
||||
const int i12 = fastmodulo(i2, ne12);
|
||||
const int i13 = fastmodulo(i3, ne13);
|
||||
const uint32_t i11 = fastmodulo(i1, ne11);
|
||||
const uint32_t i12 = fastmodulo(i2, ne12);
|
||||
const uint32_t i13 = fastmodulo(i3, ne13);
|
||||
|
||||
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
|
||||
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
|
||||
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
|
||||
const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01;
|
||||
const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11;
|
||||
const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1;
|
||||
|
||||
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
|
||||
dst_t * dst_row = dst + i_dst;
|
||||
|
||||
const int i10 = fastmodulo(i0, ne10);
|
||||
const uint32_t i10 = fastmodulo(i0, ne10);
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
float result = src0_row ? (float) src0_row[i0*s00] : 0.0f;
|
||||
float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f;
|
||||
if constexpr (sizeof...(src1_ptrs) > 0) {
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10])));
|
||||
result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10])));
|
||||
} else {
|
||||
result = bin_op(result, (float)src1[i_src1 + i10*s10]);
|
||||
result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]);
|
||||
}
|
||||
|
||||
dst_row[i0] = (dst_t) result;
|
||||
@@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
size_t s02 = nb02 / sizeof(src0_t);
|
||||
size_t s03 = nb03 / sizeof(src0_t);
|
||||
|
||||
GGML_ASSERT(ne0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
//GGML_ASSERT(s0 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s2 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s3 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s00 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s01 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s02 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s03 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(s10 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s11 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s12 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(s13 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(cne1[0] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[1] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[2] <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(cne1[3] <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
|
||||
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
|
||||
@@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
|
||||
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
|
||||
|
||||
GGML_ASSERT(ne2 * ne3 <= std::numeric_limits<unsigned int>::max());
|
||||
|
||||
const int block_size = 128;
|
||||
|
||||
int64_t hne0 = std::max(ne0 / 2LL, 1LL);
|
||||
@@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
|
||||
|
||||
if (block_nums.z > 65535 || block_nums.y > 65535) {
|
||||
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
int64_t block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
|
||||
|
||||
GGML_ASSERT(block_num <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(block_num * block_size <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(ne0 * ne1 * ne2 <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
|
||||
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
|
||||
const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
|
||||
@@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
|
||||
s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits<uint32_t>::max());
|
||||
GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits<uint32_t>::max());
|
||||
|
||||
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
|
||||
{
|
||||
const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
|
||||
|
||||
@@ -10152,14 +10152,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
|
||||
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
||||
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
||||
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
||||
GGML_TENSOR_LOCALS(int, ne0, src0, ne);
|
||||
GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb);
|
||||
|
||||
const int nth = MIN(64, ne00);
|
||||
|
||||
@@ -10173,11 +10167,12 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &eps));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float)*nth, NULL));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
|
||||
@@ -24,6 +24,7 @@ kernel void kernel_norm(
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
ulong nb00,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
@@ -43,7 +44,8 @@ kernel void kernel_norm(
|
||||
// parallel sum
|
||||
sum[get_local_id(0)] = 0.0f;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
sum[get_local_id(0)] += x[i00];
|
||||
// this kernel handles float, nb00/4 translates byte offset to element offset
|
||||
sum[get_local_id(0)] += x[i00*nb00/4];
|
||||
}
|
||||
// reduce
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -60,7 +62,8 @@ kernel void kernel_norm(
|
||||
global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
sum[get_local_id(0)] = 0.0f;
|
||||
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
|
||||
y[i00] = x[i00] - mean;
|
||||
// this kernel handles float, nb00/4 translates byte offset to element offset
|
||||
y[i00] = x[i00*nb00/4] - mean;
|
||||
sum[get_local_id(0)] += y[i00] * y[i00];
|
||||
}
|
||||
|
||||
|
||||
@@ -103,8 +103,8 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
// allocate packed arrays: A_packed (k x m), B_packed (k x n)
|
||||
ggml_sycl_pool_alloc<float> A_packed_alloc(ctx.pool());
|
||||
ggml_sycl_pool_alloc<float> B_packed_alloc(ctx.pool());
|
||||
A_packed_alloc.alloc((size_t) knl_n_total * patch_total * sizeof(float));
|
||||
B_packed_alloc.alloc((size_t) knl_n_total * oc * sizeof(float));
|
||||
A_packed_alloc.alloc((size_t) knl_n_total * patch_total);
|
||||
B_packed_alloc.alloc((size_t) knl_n_total * oc);
|
||||
|
||||
float * A_packed = A_packed_alloc.get();
|
||||
float * B_packed = B_packed_alloc.get();
|
||||
@@ -115,10 +115,16 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
// Combined kernel: im2col -> pack A, and pack B simultaneously
|
||||
const char * src1_base = (const char *) src1->data;
|
||||
const char * src0_base = (const char *) src0->data;
|
||||
const int64_t src1_nb0 = src1->nb[0];
|
||||
const int64_t src1_nb1 = src1->nb[1];
|
||||
const int64_t src1_nb2 = src1->nb[2];
|
||||
const int64_t src1_nb3 = src1->nb[3];
|
||||
const int64_t src1_w = src1->ne[0];
|
||||
const int64_t src1_h = src1->ne[1];
|
||||
const int64_t src1_d = src1->ne[2];
|
||||
|
||||
const bool src0_is_f32 = (src0->type == GGML_TYPE_F32);
|
||||
|
||||
// Compute correct strides for src0 as (knl_n_total, oc) matrix
|
||||
const int64_t src0_packed_nb0 = kernel_type_size;
|
||||
@@ -165,7 +171,7 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
const int64_t sz = dst_z * s2 + kz * d2 - p2;
|
||||
|
||||
float val = 0.0f;
|
||||
if (sx >= 0 && sx < src1->ne[0] && sy >= 0 && sy < src1->ne[1] && sz >= 0 && sz < src1->ne[2]) {
|
||||
if (sx >= 0 && sx < src1_w && sy >= 0 && sy < src1_h && sz >= 0 && sz < src1_d) {
|
||||
const int64_t channel_idx = batch_idx * c + ic;
|
||||
const char * ptr = src1_base + sx * src1_nb0 + sy * src1_nb1 + sz * src1_nb2 + channel_idx * src1_nb3;
|
||||
val = *(const float *) ptr;
|
||||
@@ -184,9 +190,9 @@ void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
const int64_t row = t % k;
|
||||
const int64_t col = t / k;
|
||||
const char * src_ptr = (const char *) src0->data + row * src0_packed_nb0 + col * src0_packed_nb1;
|
||||
const char * src_ptr = src0_base + row * src0_packed_nb0 + col * src0_packed_nb1;
|
||||
float v;
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
if (src0_is_f32) {
|
||||
v = *(const float *) src_ptr;
|
||||
} else {
|
||||
v = sycl::vec<sycl::half, 1>(*(const sycl::half *) src_ptr).convert<float, sycl::rounding_mode::automatic>()[0];
|
||||
|
||||
@@ -5859,6 +5859,250 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
|
||||
return ctx->devices[index];
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
// Tensor parallelism (--split-mode tensor) for the SYCL backend.
|
||||
//
|
||||
// The meta-backend invokes these three entry points via get_proc_address:
|
||||
// * ggml_backend_sycl_comm_init - one-time per-graph setup
|
||||
// * ggml_backend_sycl_comm_allreduce_tensor - per-allreduce step
|
||||
// * ggml_backend_sycl_comm_free - tear-down
|
||||
//
|
||||
// For N=2 (dual-GPU), this is a degenerate ring allreduce with dual paths
|
||||
// chosen by tensor size:
|
||||
//
|
||||
// * Small (nelem < 32K): FP32 direct memcpy + per-device ADD
|
||||
// kernel. The kernel depends_on() its corresponding memcpy event
|
||||
// so it doesn't read partial data. Both devices run in parallel.
|
||||
//
|
||||
// * Large (nelem >= 32K): BF16-compressed. Each device compresses
|
||||
// its FP32 partial to BF16 locally, cross-device memcpys
|
||||
// to the peer (half the PCI bandwidth), where it is decompressed
|
||||
// and added into the local FP32 partial. 6 SYCL submissions per
|
||||
// allreduce (2 compress + 2 memcpy + 2 decompress-add) vs the
|
||||
// 4 for the small path, but the bandwidth saving > 6 GB/s PCIe x 2
|
||||
// dominates for larger tensors.
|
||||
//
|
||||
// Storage: A persistent uint8_t buffer per device, sized to
|
||||
// 4 * nelem bytes. Both paths reinterpret the same bytes (small path
|
||||
// as nelem floats; large path as outbox + inbox = 2*nelem uint16_t
|
||||
// each, using the full 4*nelem byte budget either way). Single
|
||||
// alloc+free per device keeps the SYCL pool's strict-LIFO invariant
|
||||
// trivial.
|
||||
//
|
||||
// For non-(N=2 FP32 contiguous) cases, comm_init or comm_allreduce_tensor
|
||||
// returns null/false, causing the meta-backend to use its generic
|
||||
// butterfly all-reduce fallback.
|
||||
// ==========================================================================
|
||||
|
||||
struct ggml_backend_sycl_comm_context {
|
||||
std::vector<ggml_backend_t> backends;
|
||||
// ONE persistent per-device byte buffer, 4*nelem bytes. Both the
|
||||
// FP32 small-tensor path and the BF16 large-tensor path share it
|
||||
// by reinterpreting.
|
||||
std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf0;
|
||||
std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>> buf1;
|
||||
int64_t buf_nelem = 0;
|
||||
};
|
||||
|
||||
void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends) try {
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if (!ggml_backend_is_sycl(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Initial version: N=2 only. For N!=2, returning null makes the
|
||||
// meta-backend skip this backend-specific allreduce entirely.
|
||||
if (n_backends != 2) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto * ctx = new ggml_backend_sycl_comm_context;
|
||||
ctx->backends.assign(backends, backends + n_backends);
|
||||
auto * sctx0 = (ggml_backend_sycl_context *) backends[0]->context;
|
||||
auto * sctx1 = (ggml_backend_sycl_context *) backends[1]->context;
|
||||
ctx->buf0 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx0->pool());
|
||||
ctx->buf1 = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(sctx1->pool());
|
||||
return ctx;
|
||||
}
|
||||
catch (const sycl::exception &) { return nullptr; }
|
||||
catch (...) { return nullptr; }
|
||||
|
||||
void ggml_backend_sycl_comm_free(void * comm_ctx_v) {
|
||||
auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
|
||||
if (comm_ctx == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Sync both per-device queues so the pool_alloc destructors don't
|
||||
// return memory still in use by the last kernel.
|
||||
if (comm_ctx->backends.size() == 2) {
|
||||
auto * sctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
|
||||
auto * sctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
|
||||
try {
|
||||
sctx0->stream()->wait();
|
||||
sctx1->stream()->wait();
|
||||
} catch (...) { /* best effort during shutdown */ }
|
||||
}
|
||||
|
||||
delete comm_ctx;
|
||||
}
|
||||
|
||||
bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) try {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto * comm_ctx = static_cast<ggml_backend_sycl_comm_context *>(comm_ctx_v);
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
|
||||
// Fast path: N=2, F32/F16, contiguous, matching shapes.
|
||||
if (n_backends != 2) {
|
||||
return false;
|
||||
}
|
||||
// Accept F32 or F16 inputs natively (types must match). F16 takes the
|
||||
// direct 2-byte memcpy + add path below; other types return false so the
|
||||
// meta-backend uses its generic all-reduce.
|
||||
if (tensors[0]->type != tensors[1]->type) {
|
||||
return false;
|
||||
}
|
||||
if (tensors[0]->type != GGML_TYPE_F32 && tensors[0]->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguous(tensors[0]) || !ggml_is_contiguous(tensors[1])) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(tensors[0]) != ggml_nelements(tensors[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int64_t nelem = ggml_nelements(tensors[0]);
|
||||
const size_t nbytes = ggml_nbytes(tensors[0]);
|
||||
if (nelem == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
auto * ctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context;
|
||||
auto * ctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context;
|
||||
queue_ptr q0 = ctx0->stream();
|
||||
queue_ptr q1 = ctx1->stream();
|
||||
|
||||
// Grow per-device byte buffers if needed (4 * nelem bytes each).
|
||||
if (comm_ctx->buf_nelem < nelem) {
|
||||
comm_ctx->buf0->realloc(nelem * 4);
|
||||
comm_ctx->buf1->realloc(nelem * 4);
|
||||
comm_ctx->buf_nelem = nelem;
|
||||
}
|
||||
uint8_t * buf0 = comm_ctx->buf0->get();
|
||||
uint8_t * buf1 = comm_ctx->buf1->get();
|
||||
|
||||
// F16 native path: direct 2-byte cross-device copy + add, skipping the
|
||||
// F32 round-trip the meta-backend fallback would force. Cross-device copies
|
||||
// go through dev2dev_memcpy because the two devices are in separate SYCL
|
||||
// contexts (a raw peer-USM q->memcpy would be a silent no-op).
|
||||
if (tensors[0]->type == GGML_TYPE_F16) {
|
||||
sycl::half * f16_out0 = (sycl::half *) tensors[0]->data;
|
||||
sycl::half * f16_out1 = (sycl::half *) tensors[1]->data;
|
||||
sycl::half * f16_tmp0 = (sycl::half *) buf0;
|
||||
sycl::half * f16_tmp1 = (sycl::half *) buf1;
|
||||
|
||||
q0->wait();
|
||||
q1->wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, f16_tmp0, tensors[1]->data, nbytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, f16_tmp1, tensors[0]->data, nbytes);
|
||||
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
f16_out0[i] = (sycl::half) ((float) f16_out0[i] + (float) f16_tmp0[i]);
|
||||
});
|
||||
});
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
f16_out1[i] = (sycl::half) ((float) f16_out1[i] + (float) f16_tmp1[i]);
|
||||
});
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
float * out0 = (float *) tensors[0]->data;
|
||||
float * out1 = (float *) tensors[1]->data;
|
||||
|
||||
// BF16 threshold: above this, the PCIe savings from halving the
|
||||
// cross-device bytes outweigh the 2 extra compress kernels.
|
||||
// Below: stay on the FP32 fast path. Threshold mirrors the CUDA
|
||||
// NCCL allreduce pattern for n_backends=2.
|
||||
static constexpr int64_t BF16_THRESHOLD = 32768;
|
||||
|
||||
if (nelem < BF16_THRESHOLD) {
|
||||
// FP32 small path: 4 SYCL submissions per allreduce.
|
||||
float * tmp0 = (float *) buf0;
|
||||
float * tmp1 = (float *) buf1;
|
||||
|
||||
// COMM-D2D-FIX: the two devices are in SEPARATE SYCL contexts, so a raw
|
||||
// q->memcpy of a peer USM pointer is a silent no-op. Route cross-device
|
||||
// copies through dev2dev_memcpy (L0 direct copy / host staging). It is
|
||||
// synchronous, so wait for the local partials to be produced first.
|
||||
q0->wait();
|
||||
q1->wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, tmp0, tensors[1]->data, nbytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, tmp1, tensors[0]->data, nbytes);
|
||||
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out0[i] += tmp0[i];
|
||||
});
|
||||
});
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out1[i] += tmp1[i];
|
||||
});
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
// BF16 large path: 6 SYCL submissions per allreduce, but the
|
||||
// cross-device memcpy is HALF the bytes. Pure bit-shift
|
||||
// conversion (no rounding) — matches ggml's truncating fp32->bf16.
|
||||
uint16_t * outbox0 = (uint16_t *) buf0;
|
||||
uint16_t * inbox0 = outbox0 + nelem;
|
||||
uint16_t * outbox1 = (uint16_t *) buf1;
|
||||
uint16_t * inbox1 = outbox1 + nelem;
|
||||
|
||||
// Phase A: compress each device's local partial in parallel.
|
||||
sycl::event c0 = q0->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
outbox0[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out0[i]) >> 16);
|
||||
});
|
||||
|
||||
sycl::event c1 = q1->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
outbox1[i] = (uint16_t) (sycl::bit_cast<uint32_t>(out1[i]) >> 16);
|
||||
});
|
||||
|
||||
// Phase B: COMM-D2D-FIX-BF16 cross-device copy of compressed bytes via
|
||||
// dev2dev_memcpy (separate SYCL contexts; sync copy after compress).
|
||||
const size_t bf16_bytes = nelem * sizeof(uint16_t);
|
||||
c0.wait();
|
||||
c1.wait();
|
||||
dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, inbox0, outbox1, bf16_bytes);
|
||||
dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, inbox1, outbox0, bf16_bytes);
|
||||
|
||||
// Phase C: decompress + add into local FP32 partial.
|
||||
q0->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out0[i] += sycl::bit_cast<float>(((uint32_t) inbox0[i]) << 16);
|
||||
});
|
||||
});
|
||||
|
||||
q1->submit([&](sycl::handler & h) {
|
||||
h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) {
|
||||
out1[i] += sycl::bit_cast<float>(((uint32_t) inbox1[i]) << 16);
|
||||
});
|
||||
});
|
||||
|
||||
return true;
|
||||
}
|
||||
catch (const sycl::exception &) { return false; }
|
||||
catch (...) { return false; }
|
||||
|
||||
static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
|
||||
GGML_UNUSED(reg);
|
||||
|
||||
@@ -5866,6 +6110,17 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
|
||||
return (void *)ggml_backend_sycl_split_buffer_type;
|
||||
}
|
||||
|
||||
// Tensor parallelism (--split-mode tensor) entry points.
|
||||
if (strcmp(name, "ggml_backend_comm_init") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_free") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) {
|
||||
return (void *)ggml_backend_sycl_comm_allreduce_tensor;
|
||||
}
|
||||
|
||||
// SYCL doesn't support registering host memory, left here for reference
|
||||
// "ggml_backend_register_host_buffer"
|
||||
// "ggml_backend_unregister_host_buffer"
|
||||
|
||||
+1
-1
@@ -847,7 +847,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vector<t
|
||||
qs.has_tied_embeddings = false;
|
||||
}
|
||||
}
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer();
|
||||
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer_all;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
@@ -146,6 +146,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
|
||||
|
||||
llama_synchronize(ctx.get());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
+23
-17
@@ -115,22 +115,28 @@ if (TARGET mtmd)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
||||
# Gate CLI binaries on LLAMA_BUILD_TOOLS so that standalone library-only
|
||||
# builds (LLAMA_BUILD_MTMD=ON with LLAMA_BUILD_TOOLS=OFF — e.g. Apple
|
||||
# XCFramework packaging) skip the executables entirely. LLAMA_BUILD_COMMON
|
||||
# defaults to ON in standalone builds, so we cannot rely on it for gating.
|
||||
if (LLAMA_BUILD_TOOLS)
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
add_executable(llama-gemma3-cli deprecation-warning.cpp)
|
||||
add_executable(llama-minicpmv-cli deprecation-warning.cpp)
|
||||
add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
|
||||
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable (${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
set(TARGET llama-mtmd-cli)
|
||||
add_executable (${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
# mtmd-debug tool
|
||||
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
|
||||
set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
|
||||
target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
|
||||
endif()
|
||||
target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
# mtmd-debug tool
|
||||
add_executable(llama-mtmd-debug debug/mtmd-debug.cpp)
|
||||
set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug)
|
||||
target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads)
|
||||
target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17)
|
||||
|
||||
+10
-1
@@ -14,6 +14,7 @@
|
||||
import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
|
||||
import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';
|
||||
import { chatStore } from '$lib/stores/chat.svelte';
|
||||
import { config } from '$lib/stores/settings.svelte';
|
||||
import { RouterService } from '$lib/services/router.service';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import { TooltipSide } from '$lib/enums';
|
||||
@@ -34,6 +35,14 @@
|
||||
|
||||
const isStripExpanded = $derived(isExpandedMode || hoveredTooltip !== null);
|
||||
const isOnMobile = $derived(isMobile.current);
|
||||
const alwaysShowOnDesktop = $derived(config().alwaysShowSidebarOnDesktop as boolean);
|
||||
|
||||
// Keep the sidebar expanded on desktop when the user pins it open
|
||||
$effect(() => {
|
||||
if (alwaysShowOnDesktop && !isOnMobile) {
|
||||
isExpandedMode = true;
|
||||
}
|
||||
});
|
||||
|
||||
function toggleExpandedMode() {
|
||||
isExpandedMode = !isExpandedMode;
|
||||
@@ -183,7 +192,7 @@
|
||||
/>
|
||||
</div>
|
||||
|
||||
{#if isExpandedMode || isOnMobile}
|
||||
{#if isOnMobile || (isExpandedMode && !alwaysShowOnDesktop)}
|
||||
<div
|
||||
class="flex items-center transition-all duration-150 ease-out {isMobile.current &&
|
||||
!isExpandedMode
|
||||
|
||||
@@ -33,8 +33,6 @@
|
||||
import { SETTINGS_KEYS } from '$lib/constants';
|
||||
|
||||
let { children } = $props();
|
||||
let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
|
||||
let isDesktop = $derived(!isMobile.current);
|
||||
let innerHeight = $state<number | undefined>();
|
||||
let innerWidth = $state(browser ? window.innerWidth : 0);
|
||||
|
||||
@@ -164,12 +162,6 @@
|
||||
updateFavicon();
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
if (alwaysShowSidebarOnDesktop && isDesktop) {
|
||||
return;
|
||||
}
|
||||
});
|
||||
|
||||
// Initialize server properties on app load (run once)
|
||||
$effect(() => {
|
||||
// Only fetch if we don't already have props
|
||||
|
||||
Reference in New Issue
Block a user