mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-21 09:10:58 +02:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
29f1482221 | ||
|
|
e6b4acfe86 | ||
|
|
e2b129e1bf | ||
|
|
7e50ef7d79 | ||
|
|
5028447384 | ||
|
|
585080d310 | ||
|
|
57ebaf4edd | ||
|
|
871b0b70f8 | ||
|
|
b39a7bf1b0 |
@@ -58,6 +58,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full && \
|
||||
cp build/bin/* /app/full/ && \
|
||||
cp *.py /app/full/ && \
|
||||
cp -r conversion /app/full/ && \
|
||||
cp -r gguf-py /app/full/ && \
|
||||
cp -r requirements /app/full/ && \
|
||||
cp requirements.txt /app/full/
|
||||
|
||||
@@ -30,6 +30,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -41,6 +41,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -81,6 +81,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/ReleaseOV/bin/* /app/full/ \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -53,6 +53,7 @@ RUN mkdir -p /app/lib \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \
|
||||
|
||||
COPY *.py /opt/llama.cpp/bin
|
||||
COPY .devops/tools.sh /opt/llama.cpp/bin
|
||||
COPY conversion /opt/llama.cpp/conversion
|
||||
|
||||
COPY gguf-py /opt/llama.cpp/gguf-py
|
||||
COPY requirements.txt /opt/llama.cpp/gguf-py
|
||||
@@ -47,9 +48,10 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements
|
||||
FROM scratch AS collector
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
|
||||
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
|
||||
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
|
||||
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
|
||||
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion
|
||||
|
||||
|
||||
### Base image
|
||||
@@ -107,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
|
||||
|
||||
COPY --from=collector /llama.cpp/bin /app
|
||||
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
|
||||
COPY --from=collector /llama.cpp/conversion /app/conversion
|
||||
|
||||
RUN pip install --no-cache-dir --break-system-packages \
|
||||
-r /app/gguf-py/requirements.txt
|
||||
|
||||
@@ -26,6 +26,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
android-ndk-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
linux-iot-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
@@ -104,12 +104,13 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
||||
|
||||
# extra artifacts
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" OFF)
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
|
||||
# Backward compat: when old var is set but new one isn't, forward the value
|
||||
if(DEFINED LLAMA_BUILD_WEBUI)
|
||||
@@ -120,8 +121,9 @@ if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
|
||||
set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
|
||||
message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
|
||||
endif()
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
|
||||
@@ -226,6 +228,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
|
||||
add_subdirectory(tools)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BUILD_APP)
|
||||
add_subdirectory(app)
|
||||
endif()
|
||||
|
||||
# Automatically add all files from the 'licenses' directory
|
||||
file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
|
||||
|
||||
|
||||
11
app/CMakeLists.txt
Normal file
11
app/CMakeLists.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
set(TARGET llama-app)
|
||||
|
||||
add_executable(${TARGET} llama.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE llama-server-impl llama-cli-impl llama-completion-impl llama-bench-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
67
app/llama.cpp
Normal file
67
app/llama.cpp
Normal file
@@ -0,0 +1,67 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int llama_server(int argc, char ** argv);
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
// hidden
|
||||
int llama_completion(int argc, char ** argv);
|
||||
int llama_bench(int argc, char ** argv);
|
||||
static int help(int argc, char ** argv);
|
||||
|
||||
struct command {
|
||||
const char * name;
|
||||
const char * desc;
|
||||
std::vector<std::string> aliases;
|
||||
bool hidden;
|
||||
int (*func)(int, char **);
|
||||
};
|
||||
|
||||
static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmarking tool", {}, true, llama_bench },
|
||||
{"help", "Show available commands", {}, true, help },
|
||||
};
|
||||
|
||||
static int help(int argc, char ** argv) {
|
||||
const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
|
||||
|
||||
printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (show_all || !cmd.hidden) {
|
||||
printf(" %-15s %s\n", cmd.name, cmd.desc);
|
||||
}
|
||||
}
|
||||
printf("\nRun 'llama <command> --help' for command-specific usage.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool matches(const std::string & arg, const command & cmd) {
|
||||
if (arg == cmd.name) {
|
||||
return true;
|
||||
}
|
||||
for (const auto & alias : cmd.aliases) {
|
||||
if (arg == alias) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
const std::string arg = argc >= 2 ? argv[1] : "help";
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (matches(arg, cmd)) {
|
||||
return cmd.func(argc - 1, argv + 1);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "error: unknown command '%s'\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
@@ -10,8 +10,8 @@
|
||||
"ANDROID_ABI": "arm64-v8a",
|
||||
"ANDROID_PLATFORM": "android-31",
|
||||
"CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
@@ -59,8 +59,8 @@
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake",
|
||||
"CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
|
||||
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
|
||||
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
|
||||
@@ -359,7 +359,9 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_K:
|
||||
return 8;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 2;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return 8;
|
||||
default:
|
||||
|
||||
@@ -1897,7 +1897,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_pad_%s", ggml_type_name(op->src[0]->type));
|
||||
// note: this is slower
|
||||
//const bool is_c4 = op->src[0]->ne[0] % 4 == 0 && op->ne[0] % 4 == 0;
|
||||
const bool is_c4 = false;
|
||||
|
||||
snprintf(base, 256, "kernel_pad_%s%s", ggml_type_name(op->src[0]->type), is_c4 ? "_4" : "");
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
|
||||
@@ -1907,6 +1911,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
|
||||
|
||||
res.c4 = is_c4;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
@@ -816,9 +816,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
|
||||
} else {
|
||||
const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
|
||||
const int nth = MIN(args.ne00, nth_max);
|
||||
|
||||
const int nk0 = (args.ne00 + nth - 1)/nth;
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne01, ne02, ne03, nth, 1, 1);
|
||||
@@ -1863,7 +1861,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
|
||||
nk0 = ne00/ggml_blck_size(op->type);
|
||||
}
|
||||
|
||||
int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
int nth = std::min<int>(nk0*ne01, 256);
|
||||
|
||||
// when rows are small, we can batch them together in a single threadgroup
|
||||
int nrptg = 1;
|
||||
@@ -1874,7 +1872,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
|
||||
nrptg = (nth + nk0 - 1)/nk0;
|
||||
nth = nk0;
|
||||
|
||||
if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
if (nrptg*nth > 256) {
|
||||
nrptg--;
|
||||
}
|
||||
}
|
||||
@@ -4039,14 +4037,21 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
|
||||
|
||||
const int nth = std::min(1024, ne0);
|
||||
if (pipeline.c4) {
|
||||
args.ne00 = ne00/4;
|
||||
args.ne0 = ne0/4;
|
||||
}
|
||||
|
||||
const int nth_max = MIN(64, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
const int nth = MIN(args.ne0, nth_max);
|
||||
const int nk0 = (args.ne0 + 1024 - 1)/1024; // note: 1024 is hardcoded in the kernel!
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2643,7 +2643,7 @@ kernel void kernel_gated_delta_net_impl(
|
||||
b_ptr += args.ne21;
|
||||
g_ptr += args.ne21*G;
|
||||
|
||||
if (K > 1u) {
|
||||
if (K > 1) {
|
||||
const int target_slot = (int)t - shift;
|
||||
if (target_slot >= 0 && target_slot < (int)K) {
|
||||
device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base;
|
||||
@@ -2655,7 +2655,7 @@ kernel void kernel_gated_delta_net_impl(
|
||||
}
|
||||
}
|
||||
|
||||
if (K == 1u) {
|
||||
if (K == 1) {
|
||||
device float * dst_state = (device float *) (dst) + attn_size + state_out_base;
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
@@ -5104,7 +5104,7 @@ kernel void kernel_upscale_bilinear_f32(
|
||||
for (int64_t sx = x_min; sx < x_max; ++sx) {
|
||||
const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0);
|
||||
const float w = wx * wy;
|
||||
const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
|
||||
device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
|
||||
sum += (*src_ptr) * w;
|
||||
wsum += w;
|
||||
}
|
||||
@@ -5286,7 +5286,7 @@ kernel void kernel_upscale_bicubic_f32(
|
||||
const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx));
|
||||
const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3;
|
||||
|
||||
const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
|
||||
device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
|
||||
sum += (*src_ptr) * wx * wy;
|
||||
}
|
||||
}
|
||||
@@ -5329,42 +5329,46 @@ kernel void kernel_roll_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_pad_f32(
|
||||
template <typename T>
|
||||
kernel void kernel_pad_impl(
|
||||
constant ggml_metal_kargs_pad & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int32_t i3 = tgpig.z;
|
||||
const int32_t i2 = tgpig.y;
|
||||
const int32_t k0 = tgpig.x/args.ne1;
|
||||
const int32_t i1 = tgpig.x - k0*args.ne1;
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
const int32_t i03 = i3;
|
||||
const int32_t i02 = i2;
|
||||
const int32_t i01 = i1;
|
||||
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i01 = i1;
|
||||
device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);
|
||||
|
||||
if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
if (i0 < args.ne00) {
|
||||
dst_ptr[i0] = src0_ptr[i0];
|
||||
} else {
|
||||
dst_ptr[i0] = 0.0f;
|
||||
}
|
||||
for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) {
|
||||
const int32_t i0 = k0*1024 + tpitg.x + l0;
|
||||
if (i0 >= args.ne0) {
|
||||
break;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
dst_ptr[i0] = 0.0f;
|
||||
if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
|
||||
dst_ptr[i0] = src0_ptr[i0];
|
||||
} else {
|
||||
dst_ptr[i0] = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_pad_impl<float>) kernel_pad_t;
|
||||
|
||||
template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl<float>;
|
||||
template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl<float4>;
|
||||
|
||||
// TODO: this is slow - optimize
|
||||
kernel void kernel_pad_reflect_1d_f32(
|
||||
constant ggml_metal_kargs_pad_reflect_1d & args,
|
||||
device const char * src0,
|
||||
@@ -7328,23 +7332,27 @@ kernel void kernel_cpy_t_t(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i03 = tgpig[2];
|
||||
const int i02 = tgpig[1];
|
||||
const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0];
|
||||
const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
const int32_t i03 = tgpig[2];
|
||||
const int32_t i02 = tgpig[1];
|
||||
const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
|
||||
const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
|
||||
|
||||
const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
|
||||
const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
|
||||
const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
|
||||
const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
|
||||
const int32_t i3 = n/(args.ne2*args.ne1*args.ne0);
|
||||
const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
|
||||
const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
|
||||
const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
|
||||
|
||||
device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.ne00; ) {
|
||||
for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) {
|
||||
device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
|
||||
dst_data[i00] = (T1) src[0];
|
||||
break;
|
||||
@@ -7376,23 +7384,27 @@ kernel void kernel_cpy_f32_q(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i03 = tgpig[2];
|
||||
const int i02 = tgpig[1];
|
||||
const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0];
|
||||
const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
const int32_t i03 = tgpig[2];
|
||||
const int32_t i02 = tgpig[1];
|
||||
const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
|
||||
const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
|
||||
|
||||
const int64_t i3 = n / (args.ne2*args.ne1*args.ne0);
|
||||
const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
|
||||
const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
|
||||
const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK;
|
||||
const int32_t i3 = n / (args.ne2*args.ne1*args.ne0);
|
||||
const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
|
||||
const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
|
||||
const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK;
|
||||
|
||||
device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) {
|
||||
for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) {
|
||||
device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00);
|
||||
|
||||
quantize_func(src, dst_data[i00]);
|
||||
@@ -7417,24 +7429,28 @@ kernel void kernel_cpy_q_f32(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i03 = tgpig[2];
|
||||
const int i02 = tgpig[1];
|
||||
const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0];
|
||||
const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
const int32_t i03 = tgpig[2];
|
||||
const int32_t i02 = tgpig[1];
|
||||
const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
|
||||
const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
|
||||
|
||||
const int64_t i3 = n/(args.ne2*args.ne1*args.ne0);
|
||||
const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
|
||||
const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
|
||||
const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
|
||||
const int32_t i3 = n/(args.ne2*args.ne1*args.ne0);
|
||||
const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
|
||||
const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
|
||||
const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
|
||||
|
||||
device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) {
|
||||
for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) {
|
||||
T4x4 temp;
|
||||
dequantize_func(src_data + i00/nl, i00%nl, temp);
|
||||
dst_data[i00] = temp;
|
||||
|
||||
@@ -562,13 +562,13 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
}
|
||||
|
||||
const int64_t D = S_v * S_v * H_v;
|
||||
const int64_t K = (int64_t) cparams.n_rs_seq + 1;
|
||||
const int64_t K = cparams.n_rs_seq + 1;
|
||||
|
||||
// TODO: remove pad + simplify
|
||||
ggml_tensor * state_in_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
|
||||
ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0);
|
||||
ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
|
||||
ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0);
|
||||
|
||||
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d);
|
||||
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad);
|
||||
if (n_seq_tokens > 1) {
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
} else {
|
||||
|
||||
@@ -1,9 +1,19 @@
|
||||
set(TARGET llama-cli)
|
||||
add_executable(${TARGET} cli.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
# llama-cli-impl: CLI logic, reusable by app
|
||||
|
||||
include_directories(../server)
|
||||
set(TARGET llama-cli-impl)
|
||||
|
||||
add_library(${TARGET} STATIC cli.cpp)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
|
||||
target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
# llama-cli executable
|
||||
|
||||
set(TARGET llama-cli)
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-cli-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
|
||||
@@ -342,7 +342,10 @@ static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std:
|
||||
|
||||
static constexpr size_t FILE_GLOB_MAX_RESULTS = 100;
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
int llama_cli(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs
|
||||
|
||||
5
tools/cli/main.cpp
Normal file
5
tools/cli/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_cli(argc, argv);
|
||||
}
|
||||
@@ -1,6 +1,18 @@
|
||||
# llama-completion-impl: completion logic, reusable by app
|
||||
|
||||
set(TARGET llama-completion-impl)
|
||||
|
||||
add_library(${TARGET} STATIC completion.cpp)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
# llama-completion executable
|
||||
|
||||
set(TARGET llama-completion)
|
||||
add_executable(${TARGET} completion.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-completion-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
@@ -84,7 +84,10 @@ static void sigint_handler(int signo) {
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_completion(int argc, char ** argv);
|
||||
|
||||
int llama_completion(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
|
||||
5
tools/completion/main.cpp
Normal file
5
tools/completion/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_completion(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_completion(argc, argv);
|
||||
}
|
||||
@@ -1,6 +1,18 @@
|
||||
# llama-bench-impl: benchmark logic, reusable by app
|
||||
|
||||
set(TARGET llama-bench-impl)
|
||||
|
||||
add_library(${TARGET} STATIC llama-bench.cpp)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
# llama-bench executable
|
||||
|
||||
set(TARGET llama-bench)
|
||||
add_executable(${TARGET} llama-bench.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-bench-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
@@ -2136,7 +2136,10 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_bench(int argc, char ** argv);
|
||||
|
||||
int llama_bench(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
// try to set locale for unicode characters in markdown
|
||||
std::setlocale(LC_CTYPE, ".UTF-8");
|
||||
|
||||
5
tools/llama-bench/main.cpp
Normal file
5
tools/llama-bench/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_bench(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_bench(argc, argv);
|
||||
}
|
||||
@@ -162,8 +162,14 @@ struct clip_ctx {
|
||||
|
||||
bool debug_output_embeddings = false;
|
||||
|
||||
// for measuring memory usage
|
||||
bool no_alloc = false;
|
||||
std::map<ggml_backend_dev_t, size_t> mem_usage;
|
||||
std::map<ggml_backend_dev_t, size_t> mem_compute;
|
||||
|
||||
clip_ctx(clip_context_params & ctx_params) {
|
||||
flash_attn_type = ctx_params.flash_attn_type;
|
||||
no_alloc = ctx_params.no_alloc;
|
||||
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
if (!backend_cpu) {
|
||||
throw std::runtime_error("failed to initialize CPU backend");
|
||||
@@ -1688,6 +1694,8 @@ struct clip_model_loader {
|
||||
ggml_set_name(data_tensor, cur->name);
|
||||
loaded_tensor_names.insert(name);
|
||||
cur = data_tensor;
|
||||
// add to weight memory counter
|
||||
ctx_clip.mem_usage[ggml_backend_get_device(ctx_clip.backend)] += ggml_nbytes(cur);
|
||||
}
|
||||
return cur;
|
||||
};
|
||||
@@ -2602,7 +2610,7 @@ struct clip_model_loader {
|
||||
}
|
||||
|
||||
// load data
|
||||
{
|
||||
if (!ctx_clip.no_alloc) {
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
// alloc memory and offload data
|
||||
@@ -2676,7 +2684,7 @@ struct clip_model_loader {
|
||||
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
|
||||
// try to enable flash attention to see if it's supported
|
||||
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
|
||||
info = alloc_compute_meta(ctx_clip, batch);
|
||||
info = reserve_compute_meta(ctx_clip, batch);
|
||||
if (!info.fattn && info.fattn_op) {
|
||||
auto op = info.fattn_op;
|
||||
LOG_WRN("%s: *****************************************************************\n", __func__);
|
||||
@@ -2695,10 +2703,10 @@ struct clip_model_loader {
|
||||
LOG_WRN("%s: please report this on github as an issue\n", __func__);
|
||||
LOG_WRN("%s: *****************************************************************\n", __func__);
|
||||
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
|
||||
alloc_compute_meta(ctx_clip, batch);
|
||||
reserve_compute_meta(ctx_clip, batch);
|
||||
}
|
||||
} else {
|
||||
info = alloc_compute_meta(ctx_clip, batch);
|
||||
info = reserve_compute_meta(ctx_clip, batch);
|
||||
if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
|
||||
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
|
||||
}
|
||||
@@ -2737,12 +2745,14 @@ struct clip_model_loader {
|
||||
}
|
||||
}
|
||||
|
||||
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
|
||||
// only initialize backend buffers, but do not allocate them yet
|
||||
static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
|
||||
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
|
||||
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
|
||||
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
|
||||
|
||||
ctx_clip.mem_compute.clear();
|
||||
for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
|
||||
ggml_backend_t backend = ctx_clip.backend_ptrs[i];
|
||||
ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
|
||||
@@ -2752,6 +2762,7 @@ struct clip_model_loader {
|
||||
ggml_backend_buft_name(buft),
|
||||
size / 1024.0 / 1024.0);
|
||||
}
|
||||
ctx_clip.mem_compute[ggml_backend_get_device(backend)] += size;
|
||||
}
|
||||
|
||||
const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
|
||||
@@ -4266,22 +4277,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
|
||||
return ctx->model.hparams.minicpmv_version;
|
||||
}
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV4_6) {
|
||||
return 46;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
// TODO: remove this function
|
||||
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
|
||||
}
|
||||
|
||||
bool clip_is_llava(const struct clip_ctx * ctx) {
|
||||
return ctx->model.hparams.has_llava_projector;
|
||||
}
|
||||
@@ -4330,6 +4325,14 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
|
||||
return &ctx->model.hparams;
|
||||
}
|
||||
|
||||
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx) {
|
||||
std::map<ggml_backend_dev_t, size_t> result = ctx->mem_usage;
|
||||
for (auto & [dev, size] : ctx->mem_compute) {
|
||||
result[dev] += size;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//
|
||||
// API for debugging
|
||||
//
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <map>
|
||||
|
||||
// !!! Internal header, to be used by mtmd only !!!
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
@@ -40,6 +42,7 @@ struct clip_context_params {
|
||||
bool warmup;
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
bool no_alloc;
|
||||
};
|
||||
|
||||
struct clip_init_result {
|
||||
@@ -102,8 +105,6 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
|
||||
int clip_is_minicpmv(const struct clip_ctx * ctx);
|
||||
bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
bool clip_is_llava(const struct clip_ctx * ctx);
|
||||
// note for contributor: this clip_is_(model) pattern is deprecated
|
||||
// do NOT add new functions like this
|
||||
@@ -116,6 +117,8 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
|
||||
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
|
||||
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
|
||||
|
||||
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
|
||||
|
||||
struct clip_cap {
|
||||
bool has_vision;
|
||||
bool has_audio;
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <climits>
|
||||
#include <vector>
|
||||
|
||||
// represents raw image data, layout is RGBRGBRGB...
|
||||
@@ -139,13 +140,13 @@ mtmd_context_params mtmd_context_params_default() {
|
||||
struct mtmd_context {
|
||||
struct clip_ctx * ctx_v; // vision
|
||||
struct clip_ctx * ctx_a; // audio
|
||||
const struct llama_model * text_model;
|
||||
std::vector<float> image_embd_v; // image embedding vector
|
||||
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
std::string media_marker;
|
||||
const int n_embd_text;
|
||||
const int n_embd_text = -1; // -1 means llm context not provided, skip checking this
|
||||
const llama_vocab * vocab = nullptr; // can be nullptr if text_model is not provided
|
||||
mtmd_pos_type pos_type;
|
||||
|
||||
// these are not token, but strings used to mark the beginning and end of image/audio embeddings
|
||||
@@ -178,12 +179,13 @@ struct mtmd_context {
|
||||
|
||||
mtmd_context(const char * mmproj_fname,
|
||||
const llama_model * text_model,
|
||||
const mtmd_context_params & ctx_params) :
|
||||
text_model (text_model),
|
||||
const mtmd_context_params & ctx_params,
|
||||
bool no_alloc = false) :
|
||||
print_timings(ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
media_marker (ctx_params.media_marker),
|
||||
n_embd_text (llama_model_n_embd_inp(text_model))
|
||||
n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1),
|
||||
vocab (text_model ? llama_model_get_vocab(text_model) : nullptr)
|
||||
{
|
||||
if (ctx_params.image_marker != nullptr) {
|
||||
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
||||
@@ -193,21 +195,23 @@ struct mtmd_context {
|
||||
throw std::runtime_error("media_marker must not be empty");
|
||||
}
|
||||
|
||||
auto decoder_rope_type = llama_model_rope_type(text_model);
|
||||
switch (decoder_rope_type) {
|
||||
case LLAMA_ROPE_TYPE_NONE:
|
||||
case LLAMA_ROPE_TYPE_NORM:
|
||||
case LLAMA_ROPE_TYPE_NEOX:
|
||||
{
|
||||
pos_type = MTMD_POS_TYPE_NORMAL;
|
||||
} break;
|
||||
case LLAMA_ROPE_TYPE_MROPE:
|
||||
case LLAMA_ROPE_TYPE_IMROPE:
|
||||
{
|
||||
pos_type = MTMD_POS_TYPE_MROPE;
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type));
|
||||
if (text_model) {
|
||||
auto decoder_rope_type = llama_model_rope_type(text_model);
|
||||
switch (decoder_rope_type) {
|
||||
case LLAMA_ROPE_TYPE_NONE:
|
||||
case LLAMA_ROPE_TYPE_NORM:
|
||||
case LLAMA_ROPE_TYPE_NEOX:
|
||||
{
|
||||
pos_type = MTMD_POS_TYPE_NORMAL;
|
||||
} break;
|
||||
case LLAMA_ROPE_TYPE_MROPE:
|
||||
case LLAMA_ROPE_TYPE_IMROPE:
|
||||
{
|
||||
pos_type = MTMD_POS_TYPE_MROPE;
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("unsupported decoder rope type: %d\n", decoder_rope_type));
|
||||
}
|
||||
}
|
||||
|
||||
clip_context_params ctx_clip_params {
|
||||
@@ -218,6 +222,7 @@ struct mtmd_context {
|
||||
/* warmup */ ctx_params.warmup,
|
||||
/* cb_eval */ ctx_params.cb_eval,
|
||||
/* cb_eval_user_data */ ctx_params.cb_eval_user_data,
|
||||
/* no_alloc */ no_alloc,
|
||||
};
|
||||
|
||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||
@@ -241,7 +246,7 @@ struct mtmd_context {
|
||||
// since we already validate n_embd of vision and audio mmproj,
|
||||
// we can safely assume that they are the same
|
||||
int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
|
||||
if (n_embd_text != n_embd_clip) {
|
||||
if (n_embd_text > 0 && n_embd_text != n_embd_clip) {
|
||||
throw std::runtime_error(string_format(
|
||||
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
|
||||
"hint: you may be using wrong mmproj\n",
|
||||
@@ -279,7 +284,7 @@ struct mtmd_context {
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
{
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
int minicpmv_version = clip_get_hparams(ctx_v)->minicpmv_version;
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
@@ -594,7 +599,11 @@ struct mtmd_context {
|
||||
|
||||
private:
|
||||
llama_token lookup_token(const std::string & token_text) {
|
||||
const llama_vocab * vocab = llama_model_get_vocab(text_model);
|
||||
if (vocab == nullptr) {
|
||||
// TODO @ngxson : this case is currently hit by mtmd_get_memory_usage
|
||||
// but we should reconsider this if this case is needed in other places in the future
|
||||
return LLAMA_TOKEN_NULL;
|
||||
}
|
||||
const int n_vocab = llama_vocab_n_tokens(vocab);
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
if (token_to_piece(vocab, i, true) == token_text) {
|
||||
@@ -605,6 +614,9 @@ private:
|
||||
}
|
||||
|
||||
std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error("llama_vocab is not provided");
|
||||
}
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
||||
const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
|
||||
@@ -653,7 +665,7 @@ struct mtmd_tokenizer {
|
||||
add_special = text->add_special;
|
||||
parse_special = text->parse_special;
|
||||
input_text = text->text;
|
||||
vocab = llama_model_get_vocab(ctx->text_model);
|
||||
vocab = ctx->vocab;
|
||||
}
|
||||
|
||||
int32_t tokenize(mtmd_input_chunks * output) {
|
||||
@@ -679,27 +691,29 @@ struct mtmd_tokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && llama_vocab_get_add_bos(vocab)) {
|
||||
// if first chunk is text, we add BOS token to first text chunk
|
||||
// otherwise, create a new text chunk with BOS token
|
||||
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
// add BOS token to the beginning of first text chunk
|
||||
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
|
||||
} else {
|
||||
// create a new text chunk with BOS token at the beginning
|
||||
mtmd_input_chunk bos_chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
{llama_vocab_bos(vocab)},
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
|
||||
if (vocab != nullptr) {
|
||||
if (add_special && llama_vocab_get_add_bos(vocab)) {
|
||||
// if first chunk is text, we add BOS token to first text chunk
|
||||
// otherwise, create a new text chunk with BOS token
|
||||
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
// add BOS token to the beginning of first text chunk
|
||||
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
|
||||
} else {
|
||||
// create a new text chunk with BOS token at the beginning
|
||||
mtmd_input_chunk bos_chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
{llama_vocab_bos(vocab)},
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && llama_vocab_get_add_eos(vocab)) {
|
||||
// if last chunk is text, we add EOS token to it
|
||||
add_text({llama_vocab_eos(vocab)});
|
||||
if (add_special && llama_vocab_get_add_eos(vocab)) {
|
||||
// if last chunk is text, we add EOS token to it
|
||||
add_text({llama_vocab_eos(vocab)});
|
||||
}
|
||||
}
|
||||
|
||||
if (i_bm != bitmaps.size()) {
|
||||
@@ -714,6 +728,9 @@ struct mtmd_tokenizer {
|
||||
}
|
||||
|
||||
void add_text(const std::string & txt, bool parse_special) {
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error("llama_vocab is not provided");
|
||||
}
|
||||
LOG_DBG("%s: %s\n", __func__, txt.c_str());
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
|
||||
add_text(tokens);
|
||||
@@ -1002,10 +1019,16 @@ struct mtmd_tokenizer {
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error("llama_vocab is not provided");
|
||||
}
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens == std::numeric_limits<int32_t>::min()) {
|
||||
throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
|
||||
}
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
@@ -1067,8 +1090,8 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
||||
bool ok = false;
|
||||
|
||||
if (clip_is_llava(ctx_clip)
|
||||
|| clip_is_minicpmv(ctx_clip)
|
||||
|| clip_is_glm(ctx_clip)
|
||||
|| proj_type == PROJECTOR_TYPE_MINICPMV
|
||||
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| proj_type == PROJECTOR_TYPE_INTERNVL) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
@@ -1542,3 +1565,36 @@ void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector<float> &
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void stub_log_callback(enum ggml_log_level, const char *, void *) {
|
||||
// do nothing
|
||||
}
|
||||
|
||||
std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(const char * mmproj_fname,
|
||||
struct mtmd_context_params ctx_params) {
|
||||
mtmd::context_ptr ctx;
|
||||
auto saved_log_callback = g_logger_state.log_callback;
|
||||
auto saved_log_user_data = g_logger_state.log_callback_user_data;
|
||||
try {
|
||||
mtmd_log_set(stub_log_callback, nullptr); // suppress logging
|
||||
ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params));
|
||||
mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback
|
||||
std::map<ggml_backend_dev_t, size_t> total_mem;
|
||||
auto merge = [&](const struct clip_ctx * c) {
|
||||
for (auto & [dev, size] : clip_get_mem_usage(c)) {
|
||||
total_mem[dev] += size;
|
||||
}
|
||||
};
|
||||
if (ctx->ctx_v) {
|
||||
merge(ctx->ctx_v);
|
||||
}
|
||||
if (ctx->ctx_a) {
|
||||
merge(ctx->ctx_a);
|
||||
}
|
||||
return total_mem;
|
||||
} catch (const std::exception & e) {
|
||||
mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback
|
||||
LOG_ERR("%s: error: %s\n", __func__, e.what());
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <cinttypes>
|
||||
@@ -261,6 +262,14 @@ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
// Get memory usage of the current model in bytes, per backend device
|
||||
// Note: this is an unstable API, used internally by fit_params; it WILL be removed or changed without deprecation
|
||||
#ifdef __cplusplus
|
||||
MTMD_API std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(
|
||||
const char * mmproj_fname,
|
||||
struct mtmd_context_params ctx_params);
|
||||
#endif
|
||||
|
||||
//
|
||||
// C++ wrappers
|
||||
//
|
||||
|
||||
@@ -27,12 +27,11 @@ target_include_directories(${TARGET} PRIVATE ../mtmd)
|
||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
# llama-server-impl: server logic, reusable by app
|
||||
|
||||
# llama-server executable
|
||||
set(TARGET llama-server-impl)
|
||||
|
||||
set(TARGET llama-server)
|
||||
|
||||
set(TARGET_SRCS
|
||||
add_library(${TARGET} STATIC
|
||||
server.cpp
|
||||
server-http.cpp
|
||||
server-http.h
|
||||
@@ -40,11 +39,16 @@ set(TARGET_SRCS
|
||||
server-models.h
|
||||
)
|
||||
|
||||
add_executable(${TARGET} ${TARGET_SRCS})
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC server-context llama-ui cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
# llama-server executable
|
||||
|
||||
set(TARGET llama-server)
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ../mtmd)
|
||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PRIVATE server-context llama-ui PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE llama-server-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
5
tools/server/main.cpp
Normal file
5
tools/server/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_server(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_server(argc, argv);
|
||||
}
|
||||
@@ -746,6 +746,46 @@ private:
|
||||
|
||||
params_base = params;
|
||||
|
||||
std::string & mmproj_path = params_base.mmproj.path;
|
||||
bool has_mmproj = !mmproj_path.empty();
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
if (has_mmproj) {
|
||||
mparams.use_gpu = params_base.mmproj_use_gpu;
|
||||
mparams.print_timings = false;
|
||||
mparams.n_threads = params_base.cpuparams.n_threads;
|
||||
mparams.flash_attn_type = params_base.flash_attn_type;
|
||||
mparams.warmup = params_base.warmup;
|
||||
mparams.image_min_tokens = params_base.image_min_tokens;
|
||||
mparams.image_max_tokens = params_base.image_max_tokens;
|
||||
mparams.media_marker = get_media_marker();
|
||||
}
|
||||
|
||||
// optionally get the memory usage of mmproj
|
||||
if (has_mmproj && params_base.fit_params) {
|
||||
auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams);
|
||||
if (!mmproj_mem.empty()) {
|
||||
size_t total = 0;
|
||||
for (auto & [dev, size] : mmproj_mem) {
|
||||
total += size;
|
||||
}
|
||||
SRV_INF("[mtmd] estimated memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0));
|
||||
GGML_ASSERT(!params_base.fit_params_target.empty());
|
||||
for (auto & [dev, size] : mmproj_mem) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
if (ggml_backend_dev_get(i) == dev) {
|
||||
if (i < params_base.fit_params_target.size()) {
|
||||
SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev));
|
||||
params_base.fit_params_target[i] += size;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
SRV_ERR("%s", "[mtmd] failed to get memory usage of mmproj\n");
|
||||
}
|
||||
}
|
||||
|
||||
llama_init = common_init_from_params(params_base);
|
||||
|
||||
model_tgt = llama_init->model();
|
||||
@@ -830,18 +870,10 @@ private:
|
||||
params_base.speculative.draft.ctx_dft = ctx_dft.get();
|
||||
}
|
||||
|
||||
std::string & mmproj_path = params_base.mmproj.path;
|
||||
if (!mmproj_path.empty()) {
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
|
||||
mparams.use_gpu = params_base.mmproj_use_gpu;
|
||||
mparams.print_timings = false;
|
||||
mparams.n_threads = params_base.cpuparams.n_threads;
|
||||
mparams.flash_attn_type = params_base.flash_attn_type;
|
||||
mparams.warmup = params_base.warmup;
|
||||
mparams.image_min_tokens = params_base.image_min_tokens;
|
||||
mparams.image_max_tokens = params_base.image_max_tokens;
|
||||
mparams.media_marker = get_media_marker();
|
||||
if (has_mmproj) {
|
||||
if (!is_resume) {
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
}
|
||||
|
||||
mctx = mtmd_init_from_file(mmproj_path.c_str(), model_tgt, mparams);
|
||||
if (mctx == nullptr) {
|
||||
|
||||
@@ -71,7 +71,10 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t
|
||||
};
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_server(int argc, char ** argv);
|
||||
|
||||
int llama_server(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
// own arguments required by this example
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { IsMobile } from '$lib/hooks/is-mobile.svelte';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import ChatFormActionAddDropdown from './ChatFormActionAddDropdown.svelte';
|
||||
import ChatFormActionAddSheet from './ChatFormActionAddSheet.svelte';
|
||||
import ChatFormActionAddButton from './ChatFormActionAddButton.svelte';
|
||||
@@ -31,8 +31,6 @@
|
||||
onMcpSettingsClick,
|
||||
onSystemPromptClick
|
||||
}: Props = $props();
|
||||
|
||||
const isMobile = new IsMobile();
|
||||
</script>
|
||||
|
||||
{#if isMobile.current}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
import { modelsStore, modelOptions, selectedModelId } from '$lib/stores/models.svelte';
|
||||
import { isRouterMode, serverError } from '$lib/stores/server.svelte';
|
||||
import { ModelsSelectorDropdown, ModelsSelectorSheet } from '$lib/components/app';
|
||||
import { IsMobile } from '$lib/hooks/is-mobile.svelte';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import { activeMessages } from '$lib/stores/conversations.svelte';
|
||||
|
||||
interface Props {
|
||||
@@ -152,8 +152,6 @@
|
||||
let selectorModelRef: ModelsSelectorDropdown | ModelsSelectorSheet | undefined =
|
||||
$state(undefined);
|
||||
|
||||
let isMobile = new IsMobile();
|
||||
|
||||
export function open() {
|
||||
selectorModelRef?.open();
|
||||
}
|
||||
|
||||
@@ -41,12 +41,16 @@
|
||||
});
|
||||
</script>
|
||||
|
||||
<div class="pointer-events-auto relative z-50 mx-auto mb-4 flex max-w-[48rem] justify-center">
|
||||
<div
|
||||
class="pointer-events-{show
|
||||
? 'auto'
|
||||
: 'none'} relative z-50 mx-auto mb-4 flex max-w-[48rem] justify-center"
|
||||
>
|
||||
<Button
|
||||
onclick={scrollToBottom}
|
||||
variant="secondary"
|
||||
size="icon"
|
||||
class="absolute h-10 w-10 rounded-full bg-background/80 shadow-lg backdrop-blur-sm transition-all duration-200 hover:bg-muted/80"
|
||||
class="pointer-events-all absolute h-10 w-10 rounded-full bg-background/80 shadow-lg backdrop-blur-sm transition-all duration-200 hover:bg-muted/80"
|
||||
style="bottom: {buttonBottom}; transform: translateY({show ? '0' : '2rem'}); opacity: {show
|
||||
? 1
|
||||
: 0};"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { IsMobile } from '$lib/hooks/is-mobile.svelte.js';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte.js';
|
||||
import { getContext, setContext } from 'svelte';
|
||||
import { SIDEBAR_KEYBOARD_SHORTCUT, SIDEBAR_MIN_WIDTH } from './constants.js';
|
||||
|
||||
@@ -27,19 +27,17 @@ class SidebarState {
|
||||
sidebarWidth = $state(SIDEBAR_MIN_WIDTH);
|
||||
isResizing = $state(false);
|
||||
setOpen: SidebarStateProps['setOpen'];
|
||||
#isMobile: IsMobile;
|
||||
state = $derived.by(() => (this.open ? 'expanded' : 'collapsed'));
|
||||
|
||||
constructor(props: SidebarStateProps) {
|
||||
this.setOpen = props.setOpen;
|
||||
this.#isMobile = new IsMobile();
|
||||
this.props = props;
|
||||
}
|
||||
|
||||
// Convenience getter for checking if the sidebar is mobile
|
||||
// without this, we would need to use `sidebar.isMobile.current` everywhere
|
||||
get isMobile() {
|
||||
return this.#isMobile.current;
|
||||
return isMobile.current;
|
||||
}
|
||||
|
||||
// Event handler to apply to the `<svelte:window>`
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
import { DEFAULT_MOBILE_BREAKPOINT } from '$lib/constants';
|
||||
import { MediaQuery } from 'svelte/reactivity';
|
||||
|
||||
export class IsMobile extends MediaQuery {
|
||||
constructor(breakpoint: number = DEFAULT_MOBILE_BREAKPOINT) {
|
||||
super(`max-width: ${breakpoint - 1}px`);
|
||||
}
|
||||
}
|
||||
@@ -824,26 +824,6 @@ export class ChatService {
|
||||
|
||||
const contentParts: ApiChatMessageContentPart[] = [];
|
||||
|
||||
if (message.content) {
|
||||
contentParts.push({
|
||||
type: ContentPartType.TEXT,
|
||||
text: message.content
|
||||
});
|
||||
}
|
||||
|
||||
// Include images from all messages
|
||||
const imageFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile =>
|
||||
extra.type === AttachmentType.IMAGE
|
||||
);
|
||||
|
||||
for (const image of imageFiles) {
|
||||
contentParts.push({
|
||||
type: ContentPartType.IMAGE_URL,
|
||||
image_url: { url: image.base64Url }
|
||||
});
|
||||
}
|
||||
|
||||
const textFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraTextFile =>
|
||||
extra.type === AttachmentType.TEXT
|
||||
@@ -869,6 +849,26 @@ export class ChatService {
|
||||
});
|
||||
}
|
||||
|
||||
if (message.content) {
|
||||
contentParts.push({
|
||||
type: ContentPartType.TEXT,
|
||||
text: message.content
|
||||
});
|
||||
}
|
||||
|
||||
// Include images from all messages
|
||||
const imageFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile =>
|
||||
extra.type === AttachmentType.IMAGE
|
||||
);
|
||||
|
||||
for (const image of imageFiles) {
|
||||
contentParts.push({
|
||||
type: ContentPartType.IMAGE_URL,
|
||||
image_url: { url: image.base64Url }
|
||||
});
|
||||
}
|
||||
|
||||
const audioFiles = message.extra.filter(
|
||||
(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraAudioFile =>
|
||||
extra.type === AttachmentType.AUDIO
|
||||
|
||||
@@ -41,8 +41,7 @@ import {
|
||||
SETTINGS_KEYS,
|
||||
USER_OVERRIDES_LOCALSTORAGE_KEY
|
||||
} from '$lib/constants';
|
||||
|
||||
import { IsMobile } from '$lib/hooks/is-mobile.svelte';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
import { ParameterSyncService } from '$lib/services/parameter-sync.service';
|
||||
import { serverStore } from '$lib/stores/server.svelte';
|
||||
import {
|
||||
@@ -132,7 +131,7 @@ class SettingsStore {
|
||||
|
||||
// Default sendOnEnter to false on mobile when the user has no saved preference
|
||||
if (!(SETTINGS_KEYS.SEND_ON_ENTER in savedVal)) {
|
||||
if (new IsMobile().current) {
|
||||
if (isMobile.current) {
|
||||
this.config[SETTINGS_KEYS.SEND_ON_ENTER] = false;
|
||||
}
|
||||
}
|
||||
|
||||
9
tools/ui/src/lib/stores/viewport.svelte.ts
Normal file
9
tools/ui/src/lib/stores/viewport.svelte.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import { browser } from '$app/environment';
|
||||
import { DEFAULT_MOBILE_BREAKPOINT } from '$lib/constants/viewport';
|
||||
import { MediaQuery } from 'svelte/reactivity';
|
||||
|
||||
export const viewport = $state({
|
||||
width: browser ? window.innerWidth : 0
|
||||
});
|
||||
|
||||
export const isMobile = new MediaQuery(`max-width: ${DEFAULT_MOBILE_BREAKPOINT - 1}px`);
|
||||
@@ -26,18 +26,18 @@
|
||||
import { modelsStore } from '$lib/stores/models.svelte';
|
||||
import { mcpStore } from '$lib/stores/mcp.svelte';
|
||||
import { TOOLTIP_DELAY_DURATION } from '$lib/constants';
|
||||
import { IsMobile } from '$lib/hooks/is-mobile.svelte';
|
||||
import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte';
|
||||
import { useSettingsNavigation } from '$lib/hooks/use-settings-navigation.svelte';
|
||||
import { conversations } from '$lib/stores/conversations.svelte';
|
||||
import { isMobile } from '$lib/stores/viewport.svelte';
|
||||
|
||||
let { children } = $props();
|
||||
let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
|
||||
let isMobile = new IsMobile();
|
||||
let isDesktop = $derived(!isMobile.current);
|
||||
let sidebarOpen = $state(false);
|
||||
let mounted = $state(false);
|
||||
let innerHeight = $state<number | undefined>();
|
||||
let innerWidth = $state(browser ? window.innerWidth : 0);
|
||||
|
||||
let chatSidebar:
|
||||
| {
|
||||
@@ -278,4 +278,4 @@
|
||||
</Sidebar.Provider>
|
||||
</Tooltip.Provider>
|
||||
|
||||
<svelte:window onkeydown={handleKeydown} bind:innerHeight />
|
||||
<svelte:window onkeydown={handleKeydown} bind:innerHeight bind:innerWidth />
|
||||
|
||||
Reference in New Issue
Block a user