mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-04 18:12:57 +02:00
Compare commits
262 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
65ef50a0a4 | ||
|
|
3d1998634e | ||
|
|
e8c54893f2 | ||
|
|
3c7450cee1 | ||
|
|
f478f1b6d7 | ||
|
|
94a220cd67 | ||
|
|
166fe29492 | ||
|
|
c8d6a00636 | ||
|
|
a731805ced | ||
|
|
ee4cf705bb | ||
|
|
9e58d4d692 | ||
|
|
3571fa5435 | ||
|
|
f8f0a47a55 | ||
|
|
06938ac129 | ||
|
|
d545a2a993 | ||
|
|
4da6370d43 | ||
|
|
e3666269f9 | ||
|
|
63e66fdd23 | ||
|
|
5c394fdc8b | ||
|
|
4fb16eccce | ||
|
|
bfb4308b05 | ||
|
|
2187e00337 | ||
|
|
0b7154066e | ||
|
|
60130d18f9 | ||
|
|
a468b89018 | ||
|
|
d5ab0834ab | ||
|
|
69cea5b669 | ||
|
|
f8e67fc583 | ||
|
|
2365315955 | ||
|
|
f7a0777a5c | ||
|
|
4f3a4beb8d | ||
|
|
8f7f3bf141 | ||
|
|
d178a11818 | ||
|
|
354ebac8cb | ||
|
|
1fd5f48037 | ||
|
|
210a6570ce | ||
|
|
b8275a8acc | ||
|
|
5dcb711666 | ||
|
|
5aa3a64596 | ||
|
|
27d9ed8397 | ||
|
|
335abed17d | ||
|
|
de6f727aae | ||
|
|
95b8b8ec1a | ||
|
|
55ac0909e5 | ||
|
|
bef69f1306 | ||
|
|
5aba5364d9 | ||
|
|
8e6fff84de | ||
|
|
02a57017f6 | ||
|
|
48b88c3b00 | ||
|
|
19620004f5 | ||
|
|
f8c0a19d46 | ||
|
|
5254a7994d | ||
|
|
e22b0de60d | ||
|
|
a51142497a | ||
|
|
4162522688 | ||
|
|
44e211cecf | ||
|
|
af6528e6df | ||
|
|
6f165c1c64 | ||
|
|
399739d5c5 | ||
|
|
d4c8e2c29c | ||
|
|
3292da09f6 | ||
|
|
e6123e2080 | ||
|
|
22cadc1944 | ||
|
|
d749821db3 | ||
|
|
aa46bda89b | ||
|
|
d6588daa80 | ||
|
|
d38d50e7ff | ||
|
|
8b0e0db606 | ||
|
|
2d9b7c8e98 | ||
|
|
e674b1279b | ||
|
|
4c4e91b799 | ||
|
|
d48a56effb | ||
|
|
6e093b80ea | ||
|
|
337528571d | ||
|
|
d4204b03a5 | ||
|
|
1738129bee | ||
|
|
0821c5fcfd | ||
|
|
151f3a98e9 | ||
|
|
b22da25889 | ||
|
|
689a9a470e | ||
|
|
5a46b46acd | ||
|
|
22d66b567e | ||
|
|
2084434e66 | ||
|
|
764f1e64a1 | ||
|
|
b5f52280fb | ||
|
|
dc71236b6c | ||
|
|
06d26dfdff | ||
|
|
da3f990a47 | ||
|
|
6ed481eea4 | ||
|
|
cb47092b00 | ||
|
|
1f0aa2a696 | ||
|
|
031ddb2e08 | ||
|
|
fe12e422ad | ||
|
|
ea02bc37f5 | ||
|
|
b000431a0b | ||
|
|
eef59a7642 | ||
|
|
98e480a32e | ||
|
|
241cbd41d2 | ||
|
|
33c718db1f | ||
|
|
19e92c33ef | ||
|
|
751ebd17a5 | ||
|
|
c8914ad4f4 | ||
|
|
408ae2b9e5 | ||
|
|
3ef2369551 | ||
|
|
2f6c815dc4 | ||
|
|
445b7cef62 | ||
|
|
479a9a1b03 | ||
|
|
0b56d283bf | ||
|
|
d6be3158e1 | ||
|
|
dd1557907a | ||
|
|
7fb1e70b59 | ||
|
|
d374e71e55 | ||
|
|
30af6e2b98 | ||
|
|
d7be46189f | ||
|
|
bc81d47aba | ||
|
|
0b246862b9 | ||
|
|
a919001134 | ||
|
|
48e7078ee0 | ||
|
|
bb771cbd2b | ||
|
|
7c48fb81ce | ||
|
|
91eb8f4fa0 | ||
|
|
d205df6812 | ||
|
|
e8d2567429 | ||
|
|
09e7b76c93 | ||
|
|
48e7eae41c | ||
|
|
c5229087a5 | ||
|
|
e31cdaa0eb | ||
|
|
491c4d7d2e | ||
|
|
939a7dd648 | ||
|
|
8ad8aef447 | ||
|
|
f12cc6d0fa | ||
|
|
aa50b2c2ae | ||
|
|
c40006a62e | ||
|
|
c6e4088376 | ||
|
|
b36eefc1b3 | ||
|
|
837bb6b447 | ||
|
|
ba4dd0bc67 | ||
|
|
617255d437 | ||
|
|
87b0a60cdd | ||
|
|
fda8528aa8 | ||
|
|
2d0656fbdd | ||
|
|
6b4e4bd582 | ||
|
|
9f0e4b14d2 | ||
|
|
b3a739c9b6 | ||
|
|
4d8cc0c56f | ||
|
|
0d227ec358 | ||
|
|
1d971bba36 | ||
|
|
9777256c31 | ||
|
|
7085492c6f | ||
|
|
b4c0549a49 | ||
|
|
0d18aaa9d1 | ||
|
|
08bc21b459 | ||
|
|
35a74c8fb9 | ||
|
|
5190c2ea8d | ||
|
|
7799d31e68 | ||
|
|
3a3ed153d9 | ||
|
|
ef66bfab68 | ||
|
|
678d43d720 | ||
|
|
ef41a69179 | ||
|
|
3dc7684f39 | ||
|
|
dbe9c0c8ce | ||
|
|
6fe90deffa | ||
|
|
581d020b12 | ||
|
|
7623de11d9 | ||
|
|
c9d98295a3 | ||
|
|
1506d39e76 | ||
|
|
54121f7325 | ||
|
|
192d8ae8b8 | ||
|
|
35c9b1f39e | ||
|
|
4bead4e30d | ||
|
|
302e2c2652 | ||
|
|
328874d054 | ||
|
|
c1f1e28d29 | ||
|
|
5a4126adc1 | ||
|
|
a4d2d4ae41 | ||
|
|
d161ea7071 | ||
|
|
45158f460e | ||
|
|
22307b3e8b | ||
|
|
ce5890b5f7 | ||
|
|
b251f74f49 | ||
|
|
fa97041524 | ||
|
|
ae251b5ff2 | ||
|
|
66efd13375 | ||
|
|
6c4cbdc70b | ||
|
|
5fdf07e33b | ||
|
|
062d3115aa | ||
|
|
314e729347 | ||
|
|
d55fb97174 | ||
|
|
826539ce59 | ||
|
|
b96487645c | ||
|
|
9627d0f540 | ||
|
|
e2ef8fe42c | ||
|
|
6d57c26ef8 | ||
|
|
28123a3937 | ||
|
|
549b9d8433 | ||
|
|
5d246a792d | ||
|
|
63248fc3e3 | ||
|
|
83eebe9d08 | ||
|
|
fff63b5108 | ||
|
|
f3061116ff | ||
|
|
1c0f6db545 | ||
|
|
cec51c7a7d | ||
|
|
b22ff4b7b4 | ||
|
|
c0c7e147e7 | ||
|
|
b0df4c0cfd | ||
|
|
a497476330 | ||
|
|
95405ac65f | ||
|
|
0f3cb3fc8b | ||
|
|
1acee6bf89 | ||
|
|
ef570f6308 | ||
|
|
cc9e331213 | ||
|
|
bcfd1989e9 | ||
|
|
56f16f235c | ||
|
|
8cc67efcd4 | ||
|
|
95feeab52e | ||
|
|
99d4026b11 | ||
|
|
9c92e96a64 | ||
|
|
afcda09d15 | ||
|
|
bbce619adb | ||
|
|
4f0e43da6f | ||
|
|
bb28c1fe24 | ||
|
|
ee7c30578a | ||
|
|
47c0eda9d4 | ||
|
|
5306f4b3b5 | ||
|
|
40d5358d3c | ||
|
|
b65bb4baae | ||
|
|
a1a69f777a | ||
|
|
52fb93a2bd | ||
|
|
c9021714e8 | ||
|
|
1d7ab2b947 | ||
|
|
12e5d99078 | ||
|
|
7ea23ddf7b | ||
|
|
2fc8d1851e | ||
|
|
5e932a1c8d | ||
|
|
2754ce1b3e | ||
|
|
eeeaf6180b | ||
|
|
0be84685bd | ||
|
|
ce02093fdd | ||
|
|
6a257d4463 | ||
|
|
3a479c9132 | ||
|
|
ad27757261 | ||
|
|
3a6db741a8 | ||
|
|
510b5c2a35 | ||
|
|
a8681a0ed2 | ||
|
|
acd604fb27 | ||
|
|
6ce96713de | ||
|
|
c9872a2575 | ||
|
|
e947228222 | ||
|
|
29f1482221 | ||
|
|
e6b4acfe86 | ||
|
|
e2b129e1bf | ||
|
|
7e50ef7d79 | ||
|
|
5028447384 | ||
|
|
585080d310 | ||
|
|
57ebaf4edd | ||
|
|
871b0b70f8 | ||
|
|
b39a7bf1b0 | ||
|
|
b28a2f372a | ||
|
|
17d22a35b2 | ||
|
|
67ace021da | ||
|
|
a8078675a6 | ||
|
|
57cb35c886 |
@@ -58,6 +58,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full && \
|
||||
cp build/bin/* /app/full/ && \
|
||||
cp *.py /app/full/ && \
|
||||
cp -r conversion /app/full/ && \
|
||||
cp -r gguf-py /app/full/ && \
|
||||
cp -r requirements /app/full/ && \
|
||||
cp requirements.txt /app/full/
|
||||
|
||||
@@ -30,6 +30,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -36,6 +36,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -41,6 +41,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
glibc,
|
||||
config,
|
||||
stdenv,
|
||||
stdenvNoCC,
|
||||
runCommand,
|
||||
cmake,
|
||||
ninja,
|
||||
@@ -19,6 +20,8 @@
|
||||
openssl,
|
||||
shaderc,
|
||||
spirv-headers,
|
||||
nodejs,
|
||||
importNpmLock,
|
||||
useBlas ?
|
||||
builtins.all (x: !x) [
|
||||
useCuda
|
||||
@@ -130,7 +133,31 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
src = lib.cleanSource ../../.;
|
||||
};
|
||||
|
||||
postPatch = ''
|
||||
# Builds the webui locally, taking care not to require updating any sha256 hash.
|
||||
webui = stdenvNoCC.mkDerivation {
|
||||
pname = "webui";
|
||||
version = llamaVersion;
|
||||
src = lib.cleanSource ../../tools/ui;
|
||||
|
||||
nativeBuildInputs = [
|
||||
nodejs
|
||||
importNpmLock.linkNodeModulesHook
|
||||
];
|
||||
|
||||
# no sha256 required when using buildNodeModules
|
||||
npmDeps = importNpmLock.buildNodeModules {
|
||||
npmRoot = ../../tools/ui;
|
||||
inherit nodejs;
|
||||
};
|
||||
|
||||
installPhase = ''
|
||||
LLAMA_UI_OUT_DIR=$out npm run build --offline
|
||||
'';
|
||||
};
|
||||
|
||||
postPatch = lib.optionalString useWebUi ''
|
||||
cp -r ${finalAttrs.webui} tools/ui/dist
|
||||
chmod -R u+w tools/ui/dist
|
||||
'';
|
||||
|
||||
# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
|
||||
|
||||
@@ -81,6 +81,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/ReleaseOV/bin/* /app/full/ \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -53,6 +53,7 @@ RUN mkdir -p /app/lib \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \
|
||||
|
||||
COPY *.py /opt/llama.cpp/bin
|
||||
COPY .devops/tools.sh /opt/llama.cpp/bin
|
||||
COPY conversion /opt/llama.cpp/conversion
|
||||
|
||||
COPY gguf-py /opt/llama.cpp/gguf-py
|
||||
COPY requirements.txt /opt/llama.cpp/gguf-py
|
||||
@@ -47,9 +48,10 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements
|
||||
FROM scratch AS collector
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
|
||||
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
|
||||
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
|
||||
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
|
||||
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion
|
||||
|
||||
|
||||
### Base image
|
||||
@@ -107,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
|
||||
|
||||
COPY --from=collector /llama.cpp/bin /app
|
||||
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
|
||||
COPY --from=collector /llama.cpp/conversion /app/conversion
|
||||
|
||||
RUN pip install --no-cache-dir --break-system-packages \
|
||||
-r /app/gguf-py/requirements.txt
|
||||
|
||||
@@ -26,6 +26,7 @@ RUN mkdir -p /app/lib && \
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
|
||||
101
.devops/zendnn.Dockerfile
Normal file
101
.devops/zendnn.Dockerfile
Normal file
@@ -0,0 +1,101 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
|
||||
|
||||
ENV CC=gcc-13 CXX=g++-13
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libnuma1 curl \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
4
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
4
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
@@ -100,8 +100,8 @@ body:
|
||||
label: Relevant log output
|
||||
description: >
|
||||
Please copy and paste any relevant log output, including the command that you entered and any generated text.
|
||||
For very long logs (thousands of lines), preferably upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
|
||||
On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
|
||||
4
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
4
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
@@ -88,8 +88,8 @@ body:
|
||||
description: >
|
||||
If applicable, please copy and paste any relevant log output, including any generated text.
|
||||
If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
|
||||
For very long logs (thousands of lines), please upload them as files instead.
|
||||
On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
|
||||
For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
|
||||
On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
|
||||
value: |
|
||||
<details>
|
||||
<summary>Logs</summary>
|
||||
|
||||
22
.github/actions/ccache-clear/action.yml
vendored
Normal file
22
.github/actions/ccache-clear/action.yml
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
name: "ccache-clear"
|
||||
description: "Delete all GitHub Actions caches matching a key prefix"
|
||||
inputs:
|
||||
key:
|
||||
description: "Cache key prefix to match and delete"
|
||||
required: true
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Clear caches
|
||||
shell: bash
|
||||
run: |
|
||||
CACHES=$(gh cache list --key "ccache-${{ inputs.key }}" --json id,key --jq '.[] | "\(.id) \(.key)"' 2>/dev/null)
|
||||
if [ -z "$CACHES" ]; then
|
||||
echo "No caches found with key prefix: ${{ inputs.key }}"
|
||||
exit 0
|
||||
fi
|
||||
while read -r id key; do
|
||||
echo "Deleting cache: $id ($key)"
|
||||
gh cache delete "$id"
|
||||
done <<< "$CACHES"
|
||||
@@ -15,6 +15,6 @@ runs:
|
||||
id: setup
|
||||
uses: ./.github/actions/unarchive-tar
|
||||
with:
|
||||
url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
|
||||
url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
|
||||
path: ${{ inputs.path }}
|
||||
strip: 1
|
||||
|
||||
2
.github/actions/unarchive-tar/action.yml
vendored
2
.github/actions/unarchive-tar/action.yml
vendored
@@ -24,4 +24,4 @@ runs:
|
||||
run: |
|
||||
mkdir -p ${{ inputs.path }}
|
||||
cd ${{ inputs.path }}
|
||||
curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
|
||||
curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
|
||||
|
||||
31
.github/actions/windows-setup-cuda/action.yml
vendored
31
.github/actions/windows-setup-cuda/action.yml
vendored
@@ -96,3 +96,34 @@ runs:
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 13.3
|
||||
if: ${{ inputs.cuda_version == '13.3' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.3.29-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.5.1.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.3.29-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.3.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.3.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cccl/windows-x86_64/cccl-windows-x86_64-13.3.3.3.1-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_crt-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_cudart-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvcc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvrtc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libcublas-windows-x86_64-13.5.1.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libnvvm-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvtx-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_profiler_api-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\visual_studio_integration-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cccl-windows-x86_64-13.3.3.3.1-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_3=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
6
.github/workflows/build-3rd-party.yml
vendored
6
.github/workflows/build-3rd-party.yml
vendored
@@ -22,9 +22,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-llguidance:
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
android-ndk-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
linux-iot-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
71
.github/workflows/build-android.yml
vendored
71
.github/workflows/build-android.yml
vendored
@@ -27,12 +27,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
android:
|
||||
default:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -58,7 +58,7 @@ jobs:
|
||||
cd examples/llama.android
|
||||
./gradlew build --no-daemon
|
||||
|
||||
android-ndk:
|
||||
ndk:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
@@ -73,6 +73,11 @@ jobs:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential
|
||||
|
||||
- name: Build
|
||||
id: ndk_build
|
||||
run: |
|
||||
@@ -86,3 +91,59 @@ jobs:
|
||||
with:
|
||||
name: llama-cpp-android-arm64-cpu
|
||||
path: pkg-adb/llama.cpp
|
||||
|
||||
arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
|
||||
# for some reason, the ccache does not improve the build time in this case
|
||||
# example:
|
||||
# cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
|
||||
# cache on: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
|
||||
#
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: android-ubuntu-arm64
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
96
.github/workflows/build-apple.yml
vendored
96
.github/workflows/build-apple.yml
vendored
@@ -32,12 +32,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-ios:
|
||||
macos-latest-arm64:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -48,7 +48,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-ios
|
||||
key: apple-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -56,18 +56,58 @@ jobs:
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build -G Xcode \
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -E "test-llama-archs" --verbose --timeout 900
|
||||
|
||||
macos-latest-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macos-latest-ios-xcode:
|
||||
runs-on: macos-latest
|
||||
@@ -89,6 +129,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -115,7 +156,7 @@ jobs:
|
||||
xcodebuild -downloadPlatform iOS
|
||||
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
macOS-latest-tvos:
|
||||
macos-latest-tvos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -123,10 +164,11 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-tvos
|
||||
key: apple-tvos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -138,6 +180,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -147,7 +190,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-visionos:
|
||||
macos-latest-visionos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -155,6 +198,14 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-visionos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@@ -163,6 +214,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -172,7 +224,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-swift:
|
||||
macos-latest-swift:
|
||||
runs-on: macos-latest
|
||||
needs: macos-latest-ios-xcode
|
||||
|
||||
@@ -185,10 +237,11 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-swift
|
||||
key: apple-swift
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -206,6 +259,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
||||
8
.github/workflows/build-cache.yml
vendored
8
.github/workflows/build-cache.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
# - name: Setup SpacemiT Toolchain
|
||||
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
|
||||
140
.github/workflows/build-cann.yml
vendored
140
.github/workflows/build-cann.yml
vendored
@@ -29,74 +29,76 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
openEuler-latest-cann:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [x86, aarch64]
|
||||
chip_type: ['910b', '310p']
|
||||
build: ['Release']
|
||||
use_acl_graph: ['on', 'off']
|
||||
exclude:
|
||||
# 310P does not support USE_ACL_GRAPH=on
|
||||
- chip_type: '310p'
|
||||
use_acl_graph: 'on'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# openEuler-latest-cann:
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash -el {0}
|
||||
# strategy:
|
||||
# matrix:
|
||||
# arch: [x86, aarch64]
|
||||
# chip_type: ['910b', '310p']
|
||||
# build: ['Release']
|
||||
# use_acl_graph: ['on', 'off']
|
||||
# exclude:
|
||||
# # 310P does not support USE_ACL_GRAPH=on
|
||||
# - chip_type: '310p'
|
||||
# use_acl_graph: 'on'
|
||||
# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Free up disk space
|
||||
# uses: ggml-org/free-disk-space@v1.3.1
|
||||
# with:
|
||||
# tool-cache: true
|
||||
#
|
||||
# - name: Set container image
|
||||
# id: cann-image
|
||||
# run: |
|
||||
# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
# echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
#
|
||||
# - name: Pull container image
|
||||
# run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
#
|
||||
# - name: Build
|
||||
# env:
|
||||
# BUILD_TYPE: ${{ matrix.build }}
|
||||
# SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
# run: |
|
||||
# HOST_UID=$(id -u)
|
||||
# HOST_GID=$(id -g)
|
||||
#
|
||||
# docker run --rm \
|
||||
# -v "${PWD}:/workspace" \
|
||||
# -w /workspace \
|
||||
# -e SOC_TYPE=${SOC_TYPE} \
|
||||
# -e BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
# "${{ steps.cann-image.outputs.image }}" \
|
||||
# bash -lc '
|
||||
# set -e
|
||||
# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
# yum clean all && rm -rf /var/cache/yum
|
||||
# git config --global --add safe.directory "/workspace"
|
||||
# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
# cmake -S . -B build \
|
||||
# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -DGGML_CANN=on \
|
||||
# -DSOC_TYPE=${SOC_TYPE} \
|
||||
# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
# cmake --build build -j $(nproc)
|
||||
#
|
||||
# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
# '
|
||||
|
||||
18
.github/workflows/build-cmake-pkg.yml
vendored
18
.github/workflows/build-cmake-pkg.yml
vendored
@@ -5,23 +5,23 @@ on:
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, Linux, CPU]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y build-essential tcl cmake
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
PREFIX="$(pwd)"/inst
|
||||
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix "$PREFIX" --config Release
|
||||
|
||||
|
||||
215
.github/workflows/build-cpu.yml
vendored
Normal file
215
.github/workflows/build-cpu.yml
vendored
Normal file
@@ -0,0 +1,215 @@
|
||||
name: CI (cpu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cpu.yml',
|
||||
'.github/workflows/build-cmake-pkg.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cpu.yml',
|
||||
'.github/workflows/build-cmake-pkg.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
build-cmake-pkg:
|
||||
uses: ./.github/workflows/build-cmake-pkg.yml
|
||||
|
||||
ubuntu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cpu-${{ matrix.os }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
windows:
|
||||
runs-on: windows-2025
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.4.313.2
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64-cpu-static'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
|
||||
- build: 'x64-openblas'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'x64-vulkan'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cpu-windows-2025-${{ matrix.build }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'x64-vulkan' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.arch == 'x64' }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
# TODO: disabled for now, consider adding tests for all CPU variants instead
|
||||
# - name: Test (Intel SDE)
|
||||
# id: cmake_test_sde
|
||||
# if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
# run: |
|
||||
# curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# # for some weird reason windows tar doesn't like sde tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
# $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
# cd build
|
||||
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
4
.github/workflows/build-cross.yml
vendored
4
.github/workflows/build-cross.yml
vendored
@@ -277,7 +277,7 @@ jobs:
|
||||
|
||||
env:
|
||||
# Make sure this is in sync with build-cache.yml
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
@@ -287,7 +287,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup SpacemiT Toolchain
|
||||
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
|
||||
134
.github/workflows/build-cuda-ubuntu.yml
vendored
Normal file
134
.github/workflows/build-cuda-ubuntu.yml
vendored
Normal file
@@ -0,0 +1,134 @@
|
||||
name: CI (CUDA, ubuntu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cuda-ubuntu.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cuda-ubuntu.yml',
|
||||
'ggml/src/ggml-cuda/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
cuda:
|
||||
runs-on: ubuntu-24.04
|
||||
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
DEBIAN_FRONTEND: noninteractive
|
||||
run: |
|
||||
apt update
|
||||
apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-24.04-cuda
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with CMake
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
cmake -S . -B build -G Ninja \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_CUDA_ARCHITECTURES=89-real \
|
||||
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CUDA=ON \
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
cmake --build build
|
||||
|
||||
hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.1.2
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-22.04-hip
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with native CMake HIP support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGPU_TARGETS="gfx1030" \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential git cmake libssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-22.04-musa
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with native CMake MUSA support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DGGML_MUSA=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
162
.github/workflows/build-cuda-windows.yml
vendored
Normal file
162
.github/workflows/build-cuda-windows.yml
vendored
Normal file
@@ -0,0 +1,162 @@
|
||||
name: CI (CUDA, windows)
|
||||
|
||||
# TODO: this workflow is only triggered manually because it is very heavy on the CI
|
||||
# when we provision dedicated windows runners, we can enable it for pushes too
|
||||
# note: running this workflow manually will populate the ccache for the release builds
|
||||
# this can be used before merging a PR to speed up the release workflow
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
|
||||
# note: this will run in queue with the release workflow
|
||||
concurrency:
|
||||
group: release
|
||||
queue: max
|
||||
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
cuda:
|
||||
runs-on: windows-2022
|
||||
|
||||
permissions:
|
||||
actions: write
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '13.3']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DLLAMA_BUILD_SERVER=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DGGML_RPC=ON ^
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
|
||||
|
||||
hip:
|
||||
runs-on: windows-2022
|
||||
|
||||
permissions:
|
||||
actions: write
|
||||
|
||||
env:
|
||||
# Make sure this is in sync with build-cache.yml
|
||||
HIPSDK_INSTALLER_VERSION: "26.Q1"
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# sync with release.yml
|
||||
- name: "radeon"
|
||||
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Grab rocWMMA package
|
||||
id: grab_rocwmma
|
||||
run: |
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
|
||||
7z x rocwmma.deb
|
||||
7z x data.tar
|
||||
|
||||
- name: Use ROCm Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/windows-setup-rocm
|
||||
with:
|
||||
version: ${{ env.HIPSDK_INSTALLER_VERSION }}
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
# Find and test ROCm installation
|
||||
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
|
||||
if (-not $clangPath) {
|
||||
Write-Error "ROCm installation not found"
|
||||
exit 1
|
||||
}
|
||||
& $clangPath.FullName --version
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
# TODO: this build does not match the build in release.yml, so we use a different cache key
|
||||
# ideally, the builds should match, similar to the CUDA build above so that we would be able
|
||||
# to populate the ccache for the release with manual runs of this workflow
|
||||
#key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON `
|
||||
-DROCM_DIR="${env:HIP_PATH}" `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGPU_TARGETS="gfx1100" `
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
with:
|
||||
#key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
150
.github/workflows/build-ibm.yml
vendored
Normal file
150
.github/workflows/build-ibm.yml
vendored
Normal file
@@ -0,0 +1,150 @@
|
||||
name: CI (ibm)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-ibm.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-ibm.yml',
|
||||
'ggml/src/ggml-cpu/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-s390x:
|
||||
runs-on: ubuntu-24.04-s390x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Swap Endianness
|
||||
id: endianness
|
||||
run: |
|
||||
for f in models/*.gguf; do
|
||||
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
|
||||
done
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c (s390x)
|
||||
id: llama2c_test_s390x
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch llama2c big-endian model"
|
||||
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
|
||||
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-24-ppc64le:
|
||||
runs-on: ubuntu-24.04-ppc64le
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
8
.github/workflows/build-msys.yml
vendored
8
.github/workflows/build-msys.yml
vendored
@@ -15,9 +15,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-msys2:
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: windows-msys2
|
||||
# key: msys-windows-2025-x64
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
82
.github/workflows/build-opencl.yml
vendored
Normal file
82
.github/workflows/build-opencl.yml
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
name: CI (opencl)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-opencl.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-opencl.yml',
|
||||
'ggml/src/ggml-opencl/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-2025-opencl-adreno:
|
||||
runs-on: windows-2025
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: opencl-windows-2025-x64
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
cmake -B build `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build --target install
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||
cd OpenCL-ICD-Loader
|
||||
cmake -B build-arm64-release `
|
||||
-A arm64 `
|
||||
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
54
.github/workflows/build-openvino.yml
vendored
54
.github/workflows/build-openvino.yml
vendored
@@ -29,30 +29,18 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-openvino:
|
||||
name: ubuntu-24-openvino-${{ matrix.openvino_device }}
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
|
||||
group: openvino-gpu-${{ github.head_ref || github.ref }}
|
||||
cancel-in-progress: false
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- variant: cpu
|
||||
runner: '"ubuntu-24.04"'
|
||||
openvino_device: "CPU"
|
||||
- variant: gpu
|
||||
runner: '["self-hosted","Linux","Intel","OpenVINO"]'
|
||||
openvino_device: "GPU"
|
||||
|
||||
runs-on: ${{ fromJSON(matrix.runner) }}
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.0"
|
||||
@@ -63,14 +51,6 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
@@ -78,16 +58,7 @@ jobs:
|
||||
sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
|
||||
sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
|
||||
|
||||
- name: Use OpenVINO Toolkit Cache
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: actions/cache@v5
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/linux-setup-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
@@ -109,12 +80,17 @@ jobs:
|
||||
-DGGML_OPENVINO=ON
|
||||
time cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
- name: Test (CPU)
|
||||
id: cmake_test_cpu
|
||||
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||
run: |
|
||||
cd ${{ github.workspace }}
|
||||
if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
|
||||
export GGML_OPENVINO_DEVICE=GPU
|
||||
fi
|
||||
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
|
||||
|
||||
- name: Test (GPU)
|
||||
id: cmake_test_gpu
|
||||
# TODO: fix and re-enable the `test-llama-archs` test below
|
||||
run: |
|
||||
cd ${{ github.workspace }}
|
||||
export GGML_OPENVINO_DEVICE=GPU
|
||||
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
|
||||
|
||||
92
.github/workflows/build-riscv.yml
vendored
92
.github/workflows/build-riscv.yml
vendored
@@ -29,11 +29,84 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-cpu-riscv64-native:
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# Install necessary packages
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libssl-dev
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: Check environment
|
||||
run: |
|
||||
uname -a
|
||||
gcc --version
|
||||
g++ --version
|
||||
ldd --version
|
||||
cmake --version
|
||||
rustc --version
|
||||
env
|
||||
echo "nproc=$(nproc)"
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organization
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: riscv-ubuntu-native
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=ON \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-riscv64-native-sanitizer:
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
@@ -62,12 +135,13 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organization
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: riscv-ubuntu-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
66
.github/workflows/build-rpc.yml
vendored
Normal file
66
.github/workflows/build-rpc.yml
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
name: CI (rpc)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-rpc.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-rpc.yml',
|
||||
'ggml/src/ggml-rpc/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-rpc:
|
||||
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev ninja-build
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
51
.github/workflows/build-sanitize.yml
vendored
51
.github/workflows/build-sanitize.yml
vendored
@@ -22,66 +22,65 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
ctest:
|
||||
runs-on: [self-hosted, X64, CPU, Linux]
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
# with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
|
||||
- name: Build (undefined)
|
||||
id: cmake_build_undefined
|
||||
if: ${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config Debug -j $(nproc)
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
if: ${{ matrix.sanitizer == 'ADDRESS' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# skip run in Debug - very slow
|
||||
if: ${{ matrix.sanitizer != 'UNDEFINED' }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
ctest -L main -E tokenizer --verbose --timeout 900
|
||||
|
||||
185
.github/workflows/build-self-hosted.yml
vendored
185
.github/workflows/build-self-hosted.yml
vendored
@@ -50,29 +50,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
determine-tag:
|
||||
name: Determine tag name
|
||||
runs-on: ubuntu-slim
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
ggml-ci-nvidia-cuda:
|
||||
needs: determine-tag
|
||||
gpu-cuda:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -82,14 +65,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-nvidia-cm:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -99,14 +79,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm2:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-nvidia-cm2:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
@@ -116,14 +93,12 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-webgpu:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
gpu-webgpu-nvidia:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, X64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -149,10 +124,10 @@ jobs:
|
||||
GG_BUILD_WEBGPU=1 \
|
||||
GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
#cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
|
||||
# steps:
|
||||
@@ -163,10 +138,10 @@ jobs:
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-vulkan:
|
||||
# amd-vulkan:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
@@ -178,10 +153,10 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-rocm:
|
||||
# amd-rocm:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
@@ -193,10 +168,9 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
needs: determine-tag
|
||||
gpu-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -206,13 +180,10 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
needs: determine-tag
|
||||
gpu-webgpu-apple:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -235,14 +206,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-apple:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -252,14 +220,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-intel-linux:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
steps:
|
||||
@@ -271,14 +236,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-intel-windows:
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
@@ -293,15 +255,13 @@ jobs:
|
||||
MSYSTEM: UCRT64
|
||||
CHERE_INVOKING: 1
|
||||
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
|
||||
# a valid python environment for testing
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
needs: determine-tag
|
||||
gpu-openvino-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
@@ -333,8 +293,99 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-x64-high-perf:
|
||||
runs-on: [self-hosted, Linux, X64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-arm64-high-perf-graviton4:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-arm64-graviton4-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_KLEIDIAI=1 \
|
||||
GG_BUILD_EXTRA_TESTS_0=1 \
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
252
.github/workflows/build-sycl.yml
vendored
252
.github/workflows/build-sycl.yml
vendored
@@ -29,132 +29,134 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-ubuntu-24-${{ matrix.build }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
# TODO: this build is disabled to save GitHub Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-latest-sycl:
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-windows-latest
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
51
.github/workflows/build-vulkan.yml
vendored
51
.github/workflows/build-vulkan.yml
vendored
@@ -31,26 +31,56 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-vulkan-llvmpipe:
|
||||
runs-on: ubuntu-24.04
|
||||
ubuntu-arm64:
|
||||
runs-on: ubuntu-24.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-vulkan-llvmpipe
|
||||
key: vulkan-ubuntu-24.04-arm-new
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Configure
|
||||
id: cmake_configure
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_VULKAN=ON
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
time cmake --build build -j $(nproc)
|
||||
|
||||
ubuntu-llvmpipe:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
@@ -68,7 +98,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
@@ -77,6 +107,13 @@ jobs:
|
||||
path: ./vulkan_sdk
|
||||
version: ${{ env.VULKAN_SDK_VERSION }}
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: vulkan-ubuntu-24.04-llvmpipe
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
|
||||
173
.github/workflows/build-webgpu.yml
vendored
Normal file
173
.github/workflows/build-webgpu.yml
vendored
Normal file
@@ -0,0 +1,173 @@
|
||||
name: CI (webgpu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-webgpu.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-webgpu.yml',
|
||||
'ggml/src/ggml-webgpu/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-macos-latest
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
export CMAKE_PREFIX_PATH=dawn
|
||||
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-ubuntu-24.04
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
export Dawn_DIR=dawn/lib64/cmake/Dawn
|
||||
cmake -B build \
|
||||
-DGGML_WEBGPU=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
# test-backend-ops is too slow on llvmpipe, skip it
|
||||
ctest -L main -E test-backend-ops --verbose --timeout 900
|
||||
|
||||
ubuntu-wasm:
|
||||
runs-on: ubuntu-24.04-arm
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-ubuntu-24.04-arm-wasm
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install Emscripten
|
||||
run: |
|
||||
git clone https://github.com/emscripten-core/emsdk.git
|
||||
cd emsdk
|
||||
./emsdk install latest
|
||||
./emsdk activate latest
|
||||
|
||||
- name: Fetch emdawnwebgpu
|
||||
run: |
|
||||
DAWN_TAG="v20260317.182325"
|
||||
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
|
||||
echo "Downloading ${EMDAWN_PKG}"
|
||||
curl -L -o emdawn.zip \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
|
||||
unzip emdawn.zip
|
||||
|
||||
- name: Build WASM WebGPU
|
||||
run: |
|
||||
source emsdk/emsdk_env.sh
|
||||
emcmake cmake -B build-wasm \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_WEBGPU=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
|
||||
|
||||
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
|
||||
1110
.github/workflows/build.yml
vendored
1110
.github/workflows/build.yml
vendored
File diff suppressed because it is too large
Load Diff
2
.github/workflows/check-vendor.yml
vendored
2
.github/workflows/check-vendor.yml
vendored
@@ -19,7 +19,7 @@ on:
|
||||
|
||||
jobs:
|
||||
check-vendor:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2
.github/workflows/code-style.yml
vendored
2
.github/workflows/code-style.yml
vendored
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
model-naming:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- name: Check model naming conventions
|
||||
|
||||
2
.github/workflows/editorconfig.yml
vendored
2
.github/workflows/editorconfig.yml
vendored
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
editorconfig:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
|
||||
|
||||
8
.github/workflows/hip-quality-check.yml
vendored
8
.github/workflows/hip-quality-check.yml
vendored
@@ -28,9 +28,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-22-hip-quality-check:
|
||||
@@ -50,7 +50,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-hip-quality-check
|
||||
key: hip-quality-check-ubuntu-22.04
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
|
||||
16
.github/workflows/pre-tokenizer-hashes.yml
vendored
16
.github/workflows/pre-tokenizer-hashes.yml
vendored
@@ -3,16 +3,16 @@ name: Check Pre-Tokenizer Hashes
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'conversion/base.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'conversion/base.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
|
||||
jobs:
|
||||
pre-tokenizer-hashes:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
@@ -30,16 +30,16 @@ jobs:
|
||||
|
||||
- name: Update pre-tokenizer hashes
|
||||
run: |
|
||||
cp convert_hf_to_gguf.py /tmp
|
||||
cp conversion/base.py /tmp
|
||||
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
|
||||
|
||||
- name: Check if committed pre-tokenizer hashes matches generated version
|
||||
run: |
|
||||
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
|
||||
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
|
||||
if ! diff -q conversion/base.py /tmp/base.py; then
|
||||
echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes"
|
||||
echo "Differences found:"
|
||||
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
|
||||
diff conversion/base.py /tmp/base.py || true
|
||||
exit 1
|
||||
fi
|
||||
echo "Model pre-tokenizer hashes are up to date."
|
||||
|
||||
@@ -20,7 +20,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-check-requirements:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, CPU, fast]
|
||||
name: check-requirements
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
2
.github/workflows/python-lint.yml
vendored
2
.github/workflows/python-lint.yml
vendored
@@ -21,7 +21,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
flake8-lint:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: Lint
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
2
.github/workflows/python-type-check.yml
vendored
2
.github/workflows/python-type-check.yml
vendored
@@ -22,7 +22,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-type-check:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: python type-check
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
888
.github/workflows/release.yml
vendored
888
.github/workflows/release.yml
vendored
File diff suppressed because it is too large
Load Diff
36
.github/workflows/server-sanitize.yml
vendored
36
.github/workflows/server-sanitize.yml
vendored
@@ -26,10 +26,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -37,7 +37,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: [self-hosted, CPU, Linux, llama-server]
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -46,19 +46,19 @@ jobs:
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libssl-dev
|
||||
#- name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get -y install \
|
||||
# build-essential \
|
||||
# xxd \
|
||||
# git \
|
||||
# cmake \
|
||||
# curl \
|
||||
# wget \
|
||||
# language-pack-en \
|
||||
# libssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
|
||||
181
.github/workflows/server-self-hosted.yml
vendored
181
.github/workflows/server-self-hosted.yml
vendored
@@ -29,10 +29,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -42,22 +42,65 @@ jobs:
|
||||
server-metal:
|
||||
runs-on: [self-hosted, llama-server, macOS, ARM64]
|
||||
|
||||
name: server-metal (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2"
|
||||
wf_name: "GPUx2"
|
||||
- build_type: Release
|
||||
extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx2, backend-sampling"
|
||||
fail-fast: false
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
|
||||
- name: Tests (GPUx1)
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx1, backend-sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx2)
|
||||
id: server_integration_tests_gpu2
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export GGML_METAL_DEVICES=2
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Tests (GPUx2, backend-sampling)
|
||||
id: server_integration_tests_gpu2_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -67,83 +110,40 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests (GPUx1)
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
# TODO: provision CUDA runner
|
||||
# server-cuda:
|
||||
# runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
#
|
||||
# name: server-cuda (${{ matrix.wf_name }})
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build_type: [Release]
|
||||
# wf_name: ["GPUx1"]
|
||||
# include:
|
||||
# - build_type: Release
|
||||
# extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
# wf_name: "GPUx1, backend-sampling"
|
||||
# fail-fast: false
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
# ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
# cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
#
|
||||
# - name: Tests
|
||||
# id: server_integration_tests
|
||||
# if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
# run: |
|
||||
# cd tools/server/tests
|
||||
# python3 -m venv venv
|
||||
# source venv/bin/activate
|
||||
# pip install -r requirements.txt
|
||||
# export ${{ matrix.extra_args }}
|
||||
# pytest -v -x -m "not slow"
|
||||
- name: Tests (GPUx1, backend-sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
name: server-kleidiai (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
|
||||
extra_args: ""
|
||||
wf_name: "CPUx1, kleidiai"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@@ -182,16 +182,21 @@ jobs:
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
source venv/bin/activate
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
84
.github/workflows/server.yml
vendored
84
.github/workflows/server.yml
vendored
@@ -44,32 +44,18 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["default"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: ""
|
||||
wf_name: "default"
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "backend-sampling"
|
||||
fail-fast: false
|
||||
ubuntu:
|
||||
runs-on: ubuntu-24.04-arm
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
@@ -93,20 +79,19 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
key: server-ubuntu-24.04-arm
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
cmake --build build --config Release -j $(nproc) --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
@@ -117,22 +102,34 @@ jobs:
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export ${{ matrix.extra_args }}
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
|
||||
server-windows:
|
||||
runs-on: windows-2022
|
||||
- name: Tests (Backend sampling)
|
||||
id: server_integration_tests_backend_sampling
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
- name: Slow tests (Backend sampling)
|
||||
id: server_integration_tests_slow_backend_sampling
|
||||
if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
export LLAMA_ARG_BACKEND_SAMPLING=1
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
|
||||
windows:
|
||||
runs-on: windows-2025
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -142,16 +139,24 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
node-version: "24"
|
||||
key: server-windows-2025-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
cmake -B build -G "Ninja Multi-Config" ^
|
||||
-DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
@@ -162,7 +167,6 @@ jobs:
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
@@ -170,7 +174,7 @@ jobs:
|
||||
|
||||
- name: Slow tests
|
||||
id: server_integration_tests_slow
|
||||
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||
if: ${{ github.event.schedule || github.event.inputs.slow_tests == 'true' }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
$env:SLOW_TESTS = "1"
|
||||
|
||||
43
.github/workflows/ui-build-self-hosted.yml
vendored
Normal file
43
.github/workflows/ui-build-self-hosted.yml
vendored
Normal file
@@ -0,0 +1,43 @@
|
||||
name: UI Build (self-hosted)
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: [self-hosted, fast]
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd tools/ui/dist
|
||||
for f in *; do
|
||||
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
|
||||
done
|
||||
|
||||
- name: Upload built UI
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist/
|
||||
retention-days: 1
|
||||
5
.github/workflows/ui-build.yml
vendored
5
.github/workflows/ui-build.yml
vendored
@@ -5,7 +5,6 @@ on:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: Build static output
|
||||
runs-on: ubuntu-slim
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
@@ -31,7 +30,7 @@ jobs:
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd build/tools/ui/dist
|
||||
cd tools/ui/dist
|
||||
for f in *; do
|
||||
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
|
||||
done
|
||||
@@ -40,5 +39,5 @@ jobs:
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
retention-days: 1
|
||||
|
||||
8
.github/workflows/ui-publish.yml
vendored
8
.github/workflows/ui-publish.yml
vendored
@@ -20,7 +20,7 @@ jobs:
|
||||
publish:
|
||||
name: Publish UI Static Output
|
||||
needs: build
|
||||
runs-on: ubuntu-24.04-arm
|
||||
runs-on: ubuntu-slim
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install Hugging Face Hub CLI
|
||||
run: pip install -U huggingface_hub
|
||||
@@ -49,12 +49,12 @@ jobs:
|
||||
- name: Sync built files to Hugging Face bucket (version tag)
|
||||
run: |
|
||||
# Upload the built files to the Hugging Face bucket under the release version
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
|
||||
- name: Sync built files to Hugging Face bucket (latest)
|
||||
run: |
|
||||
# Also upload to the 'latest' directory for fallback downloads
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
|
||||
- name: Verify upload
|
||||
run: |
|
||||
|
||||
118
.github/workflows/ui-self-hosted.yml
vendored
Normal file
118
.github/workflows/ui-self-hosted.yml
vendored
Normal file
@@ -0,0 +1,118 @@
|
||||
name: UI (self-hosted)
|
||||
|
||||
# these are the same as ui.yml, but with self-hosted runners
|
||||
# the runners come with pre-installed Playwright browsers version: 1.56.1
|
||||
# the jobs are much lighter because they don't need to install node and playwright browsers
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-self-hosted.yml',
|
||||
'.github/workflows/ui-build-self-hosted.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-self-hosted.yml',
|
||||
'.github/workflows/ui-build-self-hosted.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build static output
|
||||
uses: ./.github/workflows/ui-build-self-hosted.yml
|
||||
|
||||
ui-checks:
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run linting
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run lint
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
@@ -1,4 +1,4 @@
|
||||
name: CI (UI)
|
||||
name: UI
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -11,23 +11,25 @@ on:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -39,7 +41,7 @@ jobs:
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
ui-checks:
|
||||
name: UI Checks
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: true
|
||||
4
.github/workflows/update-ops-docs.yml
vendored
4
.github/workflows/update-ops-docs.yml
vendored
@@ -3,18 +3,20 @@ name: Update Operations Documentation
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- '.github/workflows/update-ops-docs.yml'
|
||||
- 'docs/ops.md'
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/update-ops-docs.yml'
|
||||
- 'docs/ops.md'
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
|
||||
jobs:
|
||||
update-ops-docs:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
You are a coding agent. Here are some very important rules that you must follow:
|
||||
|
||||
General:
|
||||
- By very precise and concise when writing code, comments, explanations, etc.
|
||||
- Be very precise and concise when writing code, comments, explanations, etc.
|
||||
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
|
||||
- Don't try to build or run the code unless you are explicitly asked to do so
|
||||
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
|
||||
@@ -16,7 +16,8 @@ Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
|
||||
- Ask the user to tell you what model was used and write it in place of [MODEL]
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
|
||||
@@ -104,24 +104,16 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
||||
|
||||
# extra artifacts
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
|
||||
# Backward compat: when old var is set but new one isn't, forward the value
|
||||
if(DEFINED LLAMA_BUILD_WEBUI)
|
||||
set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
|
||||
message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
|
||||
endif()
|
||||
if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
|
||||
set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
|
||||
message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
|
||||
endif()
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
|
||||
@@ -226,17 +218,8 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
|
||||
add_subdirectory(tools)
|
||||
endif()
|
||||
|
||||
# Automatically add all files from the 'licenses' directory
|
||||
file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
|
||||
|
||||
foreach(FILE_PATH ${EXTRA_LICENSES})
|
||||
get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
|
||||
string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
|
||||
license_add_file("${NAME}" "${FILE_PATH}")
|
||||
endforeach()
|
||||
|
||||
if (LLAMA_BUILD_COMMON)
|
||||
license_generate(llama-common)
|
||||
if (LLAMA_BUILD_APP)
|
||||
add_subdirectory(app)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
@@ -49,7 +49,6 @@
|
||||
/examples/parallel/ @ggerganov
|
||||
/examples/passkey/ @ggerganov
|
||||
/examples/retrieval/ @ggerganov
|
||||
/examples/save-load-state/ @ggerganov
|
||||
/examples/speculative-simple/ @ggerganov
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
|
||||
@@ -63,6 +63,7 @@ After submitting your PR:
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Let other maintainers merge their own PRs
|
||||
- When merging a PR, make sure you have a good understanding of the changes
|
||||
- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
|
||||
|
||||
@@ -27,6 +27,7 @@ LLM inference in C/C++
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).
|
||||
|
||||
----
|
||||
|
||||
@@ -142,6 +143,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
||||
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
|
||||
- [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)
|
||||
- [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum)
|
||||
|
||||
#### Multimodal
|
||||
|
||||
@@ -290,7 +292,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
|
||||
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
||||
| [WebGPU](docs/build.md#webgpu) | All |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
|
||||
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
|
||||
|
||||
10
SECURITY.md
10
SECURITY.md
@@ -12,16 +12,16 @@
|
||||
|
||||
## Reporting a vulnerability
|
||||
|
||||
> [!IMPORTANT]
|
||||
> The private security disclosure program is disabled until further notice. Please submit patches with fixes directly to the repo as public PRs. Emails will be ignored.
|
||||
|
||||
If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
|
||||
|
||||
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
|
||||
|
||||
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
|
||||
|
||||
## Requirements
|
||||
### Requirements
|
||||
|
||||
Before submitting your report, ensure you meet the following requirements:
|
||||
|
||||
@@ -31,7 +31,7 @@ Before submitting your report, ensure you meet the following requirements:
|
||||
|
||||
Maintainers reserve the right to close the report if these requirements are not fulfilled.
|
||||
|
||||
## Covered Topics
|
||||
### Covered Topics
|
||||
|
||||
Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
|
||||
|
||||
|
||||
31
app/CMakeLists.txt
Normal file
31
app/CMakeLists.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
set(TARGET llama-app)
|
||||
|
||||
add_executable(${TARGET} llama.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
llama-server-impl
|
||||
llama-cli-impl
|
||||
llama-completion-impl
|
||||
llama-bench-impl
|
||||
llama-batched-bench-impl
|
||||
llama-fit-params-impl
|
||||
llama-quantize-impl
|
||||
llama-perplexity-impl
|
||||
)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
# Automatically add all files from the 'licenses' directory
|
||||
file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
|
||||
|
||||
foreach(FILE_PATH ${EXTRA_LICENSES})
|
||||
get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
|
||||
string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
|
||||
license_add_file("${NAME}" "${FILE_PATH}")
|
||||
endforeach()
|
||||
|
||||
license_generate(${TARGET})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
endif()
|
||||
127
app/llama.cpp
Normal file
127
app/llama.cpp
Normal file
@@ -0,0 +1,127 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// embedded data generated by cmake
|
||||
extern const char * LICENSES[];
|
||||
|
||||
// visible
|
||||
int llama_server(int argc, char ** argv);
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
// hidden
|
||||
int llama_completion(int argc, char ** argv);
|
||||
int llama_bench(int argc, char ** argv);
|
||||
int llama_batched_bench(int argc, char ** argv);
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
|
||||
// Hands the update over to the install script, which downloads and swaps the binary.
//
// Both arguments are ignored. Returns 0 when the installer command reports success
// and a non-zero value otherwise, so the result is safe to use as a process exit code.
static int llama_update(int argc, char ** argv) {
    (void) argc;
    (void) argv;

#if defined(_WIN32)
    // here system() yields the value returned by the command interpreter
    return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\"");
#else
    // on POSIX system() yields a wait status (exit code in the high byte) or -1;
    // handing that to the caller as an exit code keeps only the low 8 bits, so a
    // failed install (e.g. status 256) would be reported as success
    const int status = system("curl -fsSL https://llama.app/install.sh | sh");
    return status == 0 ? 0 : 1;
#endif
}
|
||||
|
||||
static const char * progname;
|
||||
|
||||
static int help(int argc, char ** argv);
|
||||
static int version(int argc, char ** argv);
|
||||
static int licenses(int argc, char ** argv);
|
||||
|
||||
// Describes one sub-command of the unified binary (one row of the cmds table).
struct command {
|
||||
const char * name; // primary name, compared against the first CLI argument
|
||||
const char * desc; // one-line description printed by help()
|
||||
std::vector<std::string> aliases; // alternative names accepted by matches()
|
||||
bool hidden; // when true, help() lists the command only for 'help all'
|
||||
int (*func)(int, char **); // entry point; main() calls it with (argc - 1, argv + 1)
|
||||
};
|
||||
|
||||
// Table of all sub-commands. Columns: name, description, aliases, hidden, entry point.
// main() scans the table in order and dispatches to the first entry that matches;
// help() prints the entries in the same order.
static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"update", "Update llama to the latest release", {}, false, llama_update },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
|
||||
{"quantize", "Quantize a model", {}, true, llama_quantize },
|
||||
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
|
||||
{"version", "Show version", {}, false, version },
|
||||
{"licenses", "Show third-party licenses", {"credits"}, false, licenses },
|
||||
{"help", "Show available commands", {}, false, help },
|
||||
};
|
||||
|
||||
// Prints the string returned by llama_build_info() followed by a newline.
//
// Both arguments are unused; the signature exists so the function can be stored
// in command::func. Always returns 0.
static int version(int argc, char ** argv) {
    (void) argc;
    (void) argv;

    printf("%s\n", llama_build_info());
    return 0;
}
|
||||
|
||||
// Prints every entry of the LICENSES array, one per line, stopping at the first
// null pointer.
//
// Both arguments are unused; the signature exists so the function can be stored
// in command::func. Always returns 0.
static int licenses(int argc, char ** argv) {
    (void) argc;
    (void) argv;

    for (int i = 0; LICENSES[i] != nullptr; ++i) {
        printf("%s\n", LICENSES[i]);
    }
    return 0;
}
|
||||
|
||||
static int help(int argc, char ** argv) {
|
||||
const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
|
||||
|
||||
printf("Usage: %s <command> [options]\n\nAvailable commands:\n", progname);
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (show_all || !cmd.hidden) {
|
||||
printf(" %-15s %s\n", cmd.name, cmd.desc);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
if (!show_all) {
|
||||
printf("Run '%s help all' to show additional commands.\n", progname);
|
||||
}
|
||||
printf("Run '%s <command> --help' for command-specific usage.\n", progname);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool matches(const std::string & arg, const command & cmd) {
|
||||
if (arg == cmd.name) {
|
||||
return true;
|
||||
}
|
||||
for (const auto & alias : cmd.aliases) {
|
||||
if (arg == alias) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
progname = argv[0];
|
||||
|
||||
const std::string arg = argc >= 2 ? argv[1] : "help";
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (matches(arg, cmd)) {
|
||||
// keep cmd.name so the router's child processes re-invoke correctly
|
||||
#ifdef _WIN32
|
||||
_putenv_s("LLAMA_APP_CMD", cmd.name);
|
||||
#else
|
||||
setenv("LLAMA_APP_CMD", cmd.name, 1);
|
||||
#endif
|
||||
return cmd.func(argc - 1, argv + 1);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "error: unknown command '%s'\n", arg.c_str());
|
||||
return 1;
|
||||
}
|
||||
@@ -7,6 +7,8 @@ VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_APP=OFF
|
||||
LLAMA_BUILD_COMMON=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
@@ -31,6 +33,8 @@ COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
|
||||
-DLLAMA_BUILD_COMMON=${LLAMA_BUILD_COMMON}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
@@ -414,7 +418,7 @@ cmake -B build-ios-sim -G Xcode \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -- -quiet
|
||||
cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
echo "Building for iOS devices..."
|
||||
cmake -B build-ios-device -G Xcode \
|
||||
@@ -428,7 +432,7 @@ cmake -B build-ios-device -G Xcode \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -- -quiet
|
||||
cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
echo "Building for macOS..."
|
||||
cmake -B build-macos -G Xcode \
|
||||
@@ -439,7 +443,7 @@ cmake -B build-macos -G Xcode \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-macos --config Release -- -quiet
|
||||
cmake --build build-macos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
echo "Building for visionOS..."
|
||||
cmake -B build-visionos -G Xcode \
|
||||
@@ -454,7 +458,7 @@ cmake -B build-visionos -G Xcode \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-S .
|
||||
cmake --build build-visionos --config Release -- -quiet
|
||||
cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
echo "Building for visionOS simulator..."
|
||||
cmake -B build-visionos-sim -G Xcode \
|
||||
@@ -469,7 +473,7 @@ cmake -B build-visionos-sim -G Xcode \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-S .
|
||||
cmake --build build-visionos-sim --config Release -- -quiet
|
||||
cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS)
|
||||
echo "Building for tvOS simulator..."
|
||||
@@ -485,7 +489,7 @@ cmake -B build-tvos-sim -G Xcode \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-sim --config Release -- -quiet
|
||||
cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
echo "Building for tvOS devices..."
|
||||
cmake -B build-tvos-device -G Xcode \
|
||||
@@ -500,7 +504,7 @@ cmake -B build-tvos-device -G Xcode \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-S .
|
||||
cmake --build build-tvos-device --config Release -- -quiet
|
||||
cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet
|
||||
|
||||
# Setup frameworks and copy binaries and headers
|
||||
echo "Setting up framework structures..."
|
||||
|
||||
23
ci/run.sh
23
ci/run.sh
@@ -66,6 +66,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
@@ -114,10 +116,7 @@ fi
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
|
||||
# if on Mac, disable METAL
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
|
||||
MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
|
||||
MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
|
||||
if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
|
||||
@@ -133,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
|
||||
|
||||
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
|
||||
if [ -z "${CMAKE_PREFIX_PATH}" ]; then
|
||||
@@ -167,6 +166,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||
@@ -238,7 +239,7 @@ function gg_run_ctest_debug {
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -461,10 +462,10 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -700,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {
|
||||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
export LLAMA_ARG_LOG_PREFIX=1
|
||||
export LLAMA_ARG_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
|
||||
|
||||
134
common/arg.cpp
134
common/arg.cpp
@@ -50,8 +50,6 @@
|
||||
|
||||
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
|
||||
extern const char * LICENSES[];
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
using namespace common_arg_utils;
|
||||
|
||||
@@ -342,9 +340,7 @@ struct handle_model_result {
|
||||
};
|
||||
|
||||
static handle_model_result common_params_handle_model(struct common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
bool offline,
|
||||
bool search_mtp = false) {
|
||||
const common_download_opts & opts) {
|
||||
handle_model_result result;
|
||||
|
||||
if (!model.docker_repo.empty()) {
|
||||
@@ -356,10 +352,8 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
model.hf_file = model.path;
|
||||
model.path = "";
|
||||
}
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = bearer_token;
|
||||
opts.offline = offline;
|
||||
auto download_result = common_download_model(model, opts, true, search_mtp);
|
||||
common_download_opts hf_opts = opts;
|
||||
auto download_result = common_download_model(model, hf_opts);
|
||||
|
||||
if (download_result.model_path.empty()) {
|
||||
throw std::runtime_error("failed to download model from Hugging Face");
|
||||
@@ -384,9 +378,6 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||
}
|
||||
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = bearer_token;
|
||||
opts.offline = offline;
|
||||
auto download_result = common_download_model(model, opts);
|
||||
if (download_result.model_path.empty()) {
|
||||
throw std::runtime_error("failed to download model from " + model.url);
|
||||
@@ -443,35 +434,50 @@ static bool parse_bool_value(const std::string & value) {
|
||||
// CLI argument parsing functions
|
||||
//
|
||||
|
||||
void common_params_handle_models(common_params & params, llama_example curr_ex) {
|
||||
bool common_params_handle_models(common_params & params, llama_example curr_ex) {
|
||||
const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
|
||||
params.speculative.types.end(),
|
||||
COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
|
||||
|
||||
auto res = common_params_handle_model(params.model, params.hf_token, params.offline, spec_type_draft_mtp);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
// optionally, handle mmproj model when -hf is specified
|
||||
params.mmproj = res.mmproj;
|
||||
}
|
||||
// only download mmproj if the current example is using it
|
||||
for (const auto & ex : mmproj_examples) {
|
||||
if (curr_ex == ex) {
|
||||
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
|
||||
break;
|
||||
common_download_opts opts;
|
||||
opts.bearer_token = params.hf_token;
|
||||
opts.offline = params.offline;
|
||||
opts.skip_download = params.skip_download;
|
||||
opts.download_mtp = spec_type_draft_mtp;
|
||||
opts.download_mmproj = !params.no_mmproj;
|
||||
|
||||
try {
|
||||
auto res = common_params_handle_model(params.model, opts);
|
||||
if (params.no_mmproj) {
|
||||
params.mmproj = {};
|
||||
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
|
||||
// optionally, handle mmproj model when -hf is specified
|
||||
params.mmproj = res.mmproj;
|
||||
}
|
||||
// only download mmproj if the current example is using it
|
||||
for (const auto & ex : mmproj_examples) {
|
||||
if (curr_ex == ex) {
|
||||
common_params_handle_model(params.mmproj, opts);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// when --spec-type mtp is set and no draft model was provided explicitly,
|
||||
// fall back to the MTP head discovered alongside the -hf model
|
||||
if (spec_type_draft_mtp && res.found_mtp &&
|
||||
params.speculative.draft.mparams.path.empty() &&
|
||||
params.speculative.draft.mparams.hf_repo.empty() &&
|
||||
params.speculative.draft.mparams.url.empty()) {
|
||||
params.speculative.draft.mparams.path = res.mtp.path;
|
||||
}
|
||||
common_params_handle_model(params.speculative.draft.mparams, opts);
|
||||
common_params_handle_model(params.vocoder.model, opts);
|
||||
return true;
|
||||
} catch (const common_skip_download_exception &) {
|
||||
return false;
|
||||
} catch (const std::exception &) {
|
||||
throw;
|
||||
}
|
||||
// when --spec-type mtp is set and no draft model was provided explicitly,
|
||||
// fall back to the MTP head discovered alongside the -hf model
|
||||
if (spec_type_draft_mtp && res.found_mtp &&
|
||||
params.speculative.draft.mparams.path.empty() &&
|
||||
params.speculative.draft.mparams.hf_repo.empty() &&
|
||||
params.speculative.draft.mparams.url.empty()) {
|
||||
params.speculative.draft.mparams.path = res.mtp.path;
|
||||
}
|
||||
common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
|
||||
}
|
||||
|
||||
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
|
||||
@@ -1035,11 +1041,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
// we define here to make sure it's included in llama-gen-docs
|
||||
if (ex == LLAMA_EXAMPLE_COMPLETION) {
|
||||
params.use_jinja = false; // disable jinja by default
|
||||
|
||||
} else if (ex == LLAMA_EXAMPLE_MTMD) {
|
||||
params.use_jinja = false; // disable jinja by default
|
||||
params.sampling.temp = 0.2; // lower temp by default for better quality
|
||||
|
||||
} else if (ex == LLAMA_EXAMPLE_SERVER) {
|
||||
params.n_parallel = -1; // auto by default
|
||||
}
|
||||
@@ -1060,7 +1064,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
sampler_type_names.pop_back(); // remove last semicolon
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* filter options by example
|
||||
* rules:
|
||||
@@ -1074,7 +1077,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
add_opt(common_arg(
|
||||
{"-h", "--help", "--usage"},
|
||||
"print usage and exit",
|
||||
@@ -1091,16 +1093,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--license"},
|
||||
"show source code license and dependencies",
|
||||
[](common_params &) {
|
||||
for (int i = 0; LICENSES[i]; ++i) {
|
||||
printf("%s\n", LICENSES[i]);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"-cl", "--cache-list"},
|
||||
"show list of models in cache",
|
||||
@@ -1334,12 +1326,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"-cpent", "--checkpoint-every-n-tokens"}, "N",
|
||||
string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
|
||||
{"-cms", "--checkpoint-min-step"}, "N",
|
||||
string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step),
|
||||
[](common_params & params, int value) {
|
||||
params.checkpoint_every_nt = value;
|
||||
if (value < 0) {
|
||||
throw std::invalid_argument("checkpoint-min-step must be non-negative");
|
||||
}
|
||||
params.checkpoint_min_step = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-cram", "--cache-ram"}, "N",
|
||||
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
|
||||
@@ -2995,7 +2990,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
key_file.close();
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--ssl-key-file"}, "FNAME",
|
||||
"path to file a PEM-encoded SSL private key",
|
||||
@@ -3023,7 +3018,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.default_template_kwargs[item.key()] = item.value().dump();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
|
||||
add_opt(common_arg(
|
||||
{"-to", "--timeout"}, "N",
|
||||
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
||||
@@ -3032,6 +3027,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.timeout_write = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
|
||||
add_opt(common_arg(
|
||||
{"--sse-ping-interval"}, "N",
|
||||
string_format("server SSE ping interval in seconds (-1 = disabled, default: %d)", params.sse_ping_interval),
|
||||
[](common_params & params, int value) {
|
||||
params.sse_ping_interval = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSE_PING_INTERVAL"));
|
||||
add_opt(common_arg(
|
||||
{"--threads-http"}, "N",
|
||||
string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
||||
@@ -3324,7 +3326,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params &, const std::string & value) {
|
||||
common_log_set_file(common_log_main(), value.c_str());
|
||||
}
|
||||
).set_env("LLAMA_LOG_FILE"));
|
||||
).set_env("LLAMA_ARG_LOG_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--log-colors"}, "[on|off|auto]",
|
||||
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
|
||||
@@ -3341,7 +3343,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_LOG_COLORS"));
|
||||
).set_env("LLAMA_ARG_LOG_COLORS"));
|
||||
add_opt(common_arg(
|
||||
{"-v", "--verbose", "--log-verbose"},
|
||||
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
||||
@@ -3356,7 +3358,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.offline = true;
|
||||
}
|
||||
).set_env("LLAMA_OFFLINE"));
|
||||
).set_env("LLAMA_ARG_OFFLINE"));
|
||||
add_opt(common_arg(
|
||||
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
||||
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
|
||||
@@ -3364,13 +3366,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
" - 1: error\n"
|
||||
" - 2: warning\n"
|
||||
" - 3: info\n"
|
||||
" - 4: debug\n"
|
||||
" - 4: trace (more info)\n"
|
||||
" - 5: debug\n"
|
||||
"(default: %d)\n", params.verbosity),
|
||||
[](common_params & params, int value) {
|
||||
params.verbosity = value;
|
||||
common_log_set_verbosity_thold(value);
|
||||
}
|
||||
).set_env("LLAMA_LOG_VERBOSITY"));
|
||||
).set_env("LLAMA_ARG_LOG_VERBOSITY"));
|
||||
add_opt(common_arg(
|
||||
{"--log-prefix"},
|
||||
{"--no-log-prefix"},
|
||||
@@ -3590,6 +3593,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.draft.p_min = std::stof(value);
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-backend-sampling"},
|
||||
{"--no-spec-draft-backend-sampling"},
|
||||
string_format("offload draft sampling to the backend (default: %s)",
|
||||
params.speculative.draft.backend_sampling ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.speculative.draft.backend_sampling = value;
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING"));
|
||||
add_opt(common_arg(
|
||||
{"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
|
||||
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
|
||||
@@ -4072,7 +4084,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sampling.top_k = 0;
|
||||
params.sampling.min_p = 0.01f;
|
||||
params.use_jinja = true;
|
||||
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
@@ -4091,7 +4102,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.sampling.top_k = 0;
|
||||
params.sampling.min_p = 0.01f;
|
||||
params.use_jinja = true;
|
||||
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
|
||||
@@ -129,8 +129,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
// see: https://github.com/ggml-org/llama.cpp/issues/18163
|
||||
void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
// Populate model paths (main model, mmproj, etc) from -hf if necessary
|
||||
void common_params_handle_models(common_params & params, llama_example curr_ex);
|
||||
// populate model paths (main model, mmproj, etc) from -hf if necessary
|
||||
// return true if the model is ready to use
|
||||
// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc)
|
||||
// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed)
|
||||
bool common_params_handle_models(common_params & params, llama_example curr_ex);
|
||||
|
||||
// initialize argument parser context - used by test-arg-parser and preset
|
||||
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
@@ -310,6 +310,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
|
||||
|
||||
namespace autoparser {
|
||||
|
||||
static const std::string ERR_TMPL = "#**ERROR**#";
|
||||
|
||||
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
|
||||
generation_params tmpl_params;
|
||||
tmpl_params.messages = params.messages;
|
||||
@@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par
|
||||
return common_chat_template_direct_apply(tmpl, tmpl_params);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_DBG("Template application failed: %s\n", e.what());
|
||||
return "";
|
||||
return ERR_TMPL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -347,7 +349,7 @@ std::optional<compare_variants_result> compare_variants(
|
||||
std::string output_B = apply_template(tmpl, params_B);
|
||||
|
||||
// Check for template application failures
|
||||
if (output_A.empty() || output_B.empty()) {
|
||||
if (output_A == ERR_TMPL || output_B == ERR_TMPL) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
|
||||
@@ -377,6 +377,8 @@ struct analyze_tools : analyze_base {
|
||||
|
||||
struct autoparser {
|
||||
jinja::caps jinja_caps;
|
||||
std::string user_start;
|
||||
std::string assistant_start;
|
||||
analyze_reasoning reasoning;
|
||||
analyze_content content;
|
||||
analyze_tools tools;
|
||||
@@ -387,6 +389,10 @@ struct autoparser {
|
||||
|
||||
autoparser() = default;
|
||||
|
||||
// Find the starting marker for the user message and assistant message
|
||||
std::string detect_user_start_marker(const common_chat_template & tmpl);
|
||||
std::string detect_assistant_start_marker(const common_chat_template & tmpl);
|
||||
|
||||
// Run full differential analysis on a template
|
||||
void analyze_template(const common_chat_template & tmpl);
|
||||
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
|
||||
#define ANSI_RESET "\033[0m"
|
||||
#define ANSI_PURPLE "\033[1m\x1b[38;5;126m"
|
||||
@@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S";
|
||||
static const std::string ARG_FIRST = "AA_ARG_FST_AA";
|
||||
static const std::string ARG_SECOND = "BB_ARG_SND_BB";
|
||||
static const std::string USER_MSG = "U_USER_MSG Hello END_U";
|
||||
static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V";
|
||||
static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
|
||||
static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
|
||||
static const std::string CALL_ID_001 = "call00001";
|
||||
@@ -71,6 +75,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
analysis.content.end = "<|END_OF_TURN_TOKEN|>";
|
||||
analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>");
|
||||
analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>");
|
||||
analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
@@ -108,7 +113,59 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
analysis.tools.function.close = "```";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
|
||||
}
|
||||
}
|
||||
},
|
||||
// Nemotron Nano v2
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos &&
|
||||
tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) {
|
||||
|
||||
analysis.tools.format.mode = tool_format::JSON_NATIVE;
|
||||
analysis.tools.format.section_start = "";
|
||||
analysis.tools.format.section_end = "";
|
||||
analysis.tools.format.per_call_start = "<TOOLCALL>";
|
||||
analysis.tools.format.per_call_end = "</TOOLCALL>";
|
||||
analysis.content.mode = content_mode::PLAIN;
|
||||
analysis.content.start = "";
|
||||
analysis.content.end = "";
|
||||
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
|
||||
analysis.reasoning.start = "<think>\n\n";
|
||||
analysis.reasoning.end = "</think>";
|
||||
analysis.assistant_start = "<SPECIAL_11>Assistant";
|
||||
analysis.user_start = "<SPECIAL_11>User";
|
||||
analysis.preserved_tokens.clear();
|
||||
analysis.preserved_tokens.push_back("<SPECIAL_12>");
|
||||
analysis.preserved_tokens.push_back("<SPECIAL_11>");
|
||||
analysis.preserved_tokens.push_back("</think>");
|
||||
analysis.preserved_tokens.push_back("<TOOLCALL>");
|
||||
analysis.preserved_tokens.push_back("</TOOLCALL>");
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Fireworks
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'"
|
||||
" + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) {
|
||||
analysis.assistant_start = "<|start_header_id|>assistant<|end_header_id|>";
|
||||
analysis.user_start = "<|start_header_id|>user<|end_header_id|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Solar Open
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) {
|
||||
analysis.assistant_start = "<|begin|>assistant";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Apriel 1.6
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) {
|
||||
analysis.user_start = "<|begin_user|>";
|
||||
analysis.assistant_start = "<|begin_assistant|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
|
||||
});
|
||||
|
||||
// Common JSON structures
|
||||
@@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
||||
reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls);
|
||||
content = analyze_content(tmpl, reasoning);
|
||||
tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools());
|
||||
assistant_start = detect_assistant_start_marker(tmpl);
|
||||
user_start = detect_user_start_marker(tmpl);
|
||||
collect_preserved_tokens();
|
||||
|
||||
for (auto & workaround : workarounds) {
|
||||
@@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
||||
}
|
||||
|
||||
LOG_DBG("\n--- Reasoning & Content Structure ---\n");
|
||||
LOG_DBG("user_msg_start: %s\n", user_start.c_str());
|
||||
LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str());
|
||||
LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str());
|
||||
LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str());
|
||||
LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str());
|
||||
@@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() {
|
||||
add_token(tools.call_id.suffix);
|
||||
}
|
||||
|
||||
std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) {
|
||||
json user_msg = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
json assistant_no_reasoning = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({ user_msg });
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg, assistant_no_reasoning });
|
||||
}
|
||||
);
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__);
|
||||
return "";
|
||||
}
|
||||
|
||||
auto usermsg = comparison->diff.right;
|
||||
if (usermsg.find(ASSISTANT_MSG) == std::string::npos) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__);
|
||||
}
|
||||
|
||||
auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG));
|
||||
if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) {
|
||||
ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start)));
|
||||
}
|
||||
if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) {
|
||||
ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end)));
|
||||
}
|
||||
return trim_whitespace(ast_prefix);
|
||||
}
|
||||
|
||||
std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) {
|
||||
json user_msg = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
json assistant = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
};
|
||||
|
||||
json user_msg_two = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG_TWO }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({});
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg });
|
||||
}
|
||||
);
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__);
|
||||
params.messages = json::array({ user_msg_two, assistant });
|
||||
comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg_two, assistant, user_msg });
|
||||
}
|
||||
);
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
auto usermsg = comparison->diff.right;
|
||||
if (usermsg.find(USER_MSG) == std::string::npos) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__);
|
||||
}
|
||||
|
||||
if (usermsg.find(ASSISTANT_MSG) != std::string::npos) {
|
||||
usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size());
|
||||
}
|
||||
|
||||
auto candidate = usermsg.substr(0, usermsg.find(USER_MSG));
|
||||
auto candidate_split = segmentize_markers(candidate);
|
||||
std::stringstream result;
|
||||
bool encountered_marker = false;
|
||||
for (const auto & mrk : candidate_split) {
|
||||
std::string lower_mrk = std::string(mrk.value);
|
||||
std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
// heuristic to weed out potential end markers, but only at the start
|
||||
if (mrk.type == segment_type::MARKER && !encountered_marker &&
|
||||
(lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) {
|
||||
continue;
|
||||
}
|
||||
if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) {
|
||||
continue;
|
||||
}
|
||||
encountered_marker |= mrk.type == segment_type::MARKER;
|
||||
result << mrk.value;
|
||||
}
|
||||
return trim_whitespace(result.str());
|
||||
}
|
||||
|
||||
analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools)
|
||||
: analyze_base(tmpl) {
|
||||
LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET);
|
||||
|
||||
@@ -90,6 +90,45 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
|
||||
return text;
|
||||
}
|
||||
|
||||
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
|
||||
if (delims.empty() || prompt.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
|
||||
std::vector<std::string> all_delims;
|
||||
std::vector<common_peg_parser> tagged_messages;
|
||||
|
||||
all_delims.reserve(delims.size());
|
||||
tagged_messages.reserve(delims.size());
|
||||
for (const auto & d : delims) {
|
||||
all_delims.push_back(d.delimiter);
|
||||
}
|
||||
|
||||
auto any_delim = p.until_one_of(all_delims);
|
||||
for (const auto & d : delims) {
|
||||
tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
|
||||
}
|
||||
|
||||
return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
|
||||
});
|
||||
|
||||
common_peg_parse_context ctx(prompt);
|
||||
const auto result = parser.parse(ctx);
|
||||
if (!result.success()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<common_chat_msg_span> spans;
|
||||
ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
|
||||
if (!node.tag.empty()) {
|
||||
spans.push_back({ node.tag, node.start, node.end - node.start });
|
||||
}
|
||||
});
|
||||
|
||||
return spans;
|
||||
}
|
||||
|
||||
json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
|
||||
if (!content.empty() && !content_parts.empty()) {
|
||||
throw std::runtime_error("Cannot specify both content and content_parts");
|
||||
@@ -1042,6 +1081,14 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
|
||||
data.prompt = prompt;
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
data.message_spans = common_chat_split_by_role(prompt, {
|
||||
{ "assistant", "<|start|>assistant" },
|
||||
{ "user", "<|start|>user" },
|
||||
{ "system", "<|start|>developer" },
|
||||
{ "system", "<|start|>system" },
|
||||
{ "tool", "<|start|>functions" },
|
||||
});
|
||||
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
|
||||
@@ -1181,6 +1228,11 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
data.message_spans = common_chat_split_by_role(data.prompt, {
|
||||
{ "user", "<|turn>user\n" },
|
||||
{ "assistant", "<|turn>model\n" },
|
||||
});
|
||||
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
|
||||
data.supports_thinking = true;
|
||||
data.thinking_start_tag = "<|channel>thought";
|
||||
@@ -2393,6 +2445,19 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
struct autoparser::autoparser autoparser;
|
||||
autoparser.analyze_template(tmpl);
|
||||
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
|
||||
|
||||
std::vector<common_chat_msg_delimiter> delimiters;
|
||||
if (!autoparser.assistant_start.empty()) {
|
||||
delimiters.push_back({ "assistant", autoparser.assistant_start });
|
||||
}
|
||||
if (!autoparser.user_start.empty()) {
|
||||
delimiters.push_back({ "user", autoparser.user_start });
|
||||
}
|
||||
|
||||
if (!delimiters.empty()) {
|
||||
auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
|
||||
}
|
||||
|
||||
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
|
||||
if (auto_params.supports_thinking) {
|
||||
auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
|
||||
|
||||
@@ -143,6 +143,17 @@ struct common_chat_msg_diff {
|
||||
}
|
||||
};
|
||||
|
||||
// Location of one message inside a rendered prompt, labelled with its role.
// Filled by common_chat_split_by_role() from the tagged nodes of the parsed prompt.
struct common_chat_msg_span {
|
||||
std::string role; // tag of the matched node (the role of the delimiter that opened the message)
|
||||
std::size_t pos = 0; // start of the matched node; presumably an offset into the prompt string - TODO confirm units
|
||||
std::size_t len = 0; // node end minus node start
|
||||
};
|
||||
|
||||
// Pairs a role name with the literal text that opens a message of that role in a rendered prompt.
// Consumed by common_chat_split_by_role().
struct common_chat_msg_delimiter {
|
||||
std::string role; // role used to tag messages opened by this delimiter
|
||||
std::string delimiter; // literal text matched at the start of such a message
|
||||
};
|
||||
|
||||
struct common_chat_tool {
|
||||
std::string name;
|
||||
std::string description;
|
||||
@@ -208,6 +219,7 @@ struct common_chat_params {
|
||||
std::vector<std::string> preserved_tokens;
|
||||
std::vector<std::string> additional_stops;
|
||||
std::string parser;
|
||||
std::vector<common_chat_msg_span> message_spans;
|
||||
};
|
||||
|
||||
// per-message parsing syntax
|
||||
@@ -219,6 +231,7 @@ struct common_chat_parser_params {
|
||||
bool reasoning_in_content = false;
|
||||
std::string generation_prompt;
|
||||
bool parse_tool_calls = true;
|
||||
bool is_continuation = false;
|
||||
bool echo = false; // Include assistant prefilled msg in output
|
||||
bool debug = false; // Enable debug output for PEG parser
|
||||
common_peg_arena parser = {};
|
||||
@@ -303,6 +316,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const std::string & src,
|
||||
autoparser::generation_params & params);
|
||||
|
||||
|
||||
// specialized per-task preset
|
||||
struct common_chat_prompt_preset {
|
||||
std::string system;
|
||||
@@ -310,3 +324,6 @@ struct common_chat_prompt_preset {
|
||||
};
|
||||
|
||||
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
|
||||
|
||||
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
|
||||
|
||||
|
||||
@@ -445,6 +445,27 @@ std::string string_strip(const std::string & str) {
|
||||
return str.substr(start, end - start);
|
||||
}
|
||||
|
||||
// Return the longest common substring (contiguous run of characters) of a and b.
//
// When several substrings share the maximum length, the one that ends first in `a` is
// returned. Returns an empty string if either input is empty or nothing is shared.
//
// Classic dynamic programming over suffix-match lengths. Only the previous row is ever
// read, so two rolling rows of size |b|+1 are kept instead of the full (|a|+1) x (|b|+1) table.
std::string string_lcs(std::string_view a, std::string_view b) {
    if (a.empty() || b.empty()) {
        return {};
    }

    // prev[j] / curr[j]: length of the common suffix of a[0..i-1) / a[0..i) and b[0..j)
    std::vector<size_t> prev(b.size() + 1, 0);
    std::vector<size_t> curr(b.size() + 1, 0);

    size_t best_len   = 0;
    size_t best_end_a = 0;

    for (size_t i = 1; i <= a.size(); ++i) {
        for (size_t j = 1; j <= b.size(); ++j) {
            if (a[i - 1] == b[j - 1]) {
                curr[j] = prev[j - 1] + 1;
                if (curr[j] > best_len) {
                    best_len   = curr[j];
                    best_end_a = i;
                }
            } else {
                // rows are reused, so stale values must be cleared explicitly
                curr[j] = 0;
            }
        }
        prev.swap(curr);
    }

    return std::string(a.substr(best_end_a - best_len, best_len));
}
|
||||
|
||||
std::string string_get_sortable_timestamp() {
|
||||
using clock = std::chrono::system_clock;
|
||||
|
||||
@@ -1368,8 +1389,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
|
||||
if (params.warmup) {
|
||||
LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
llama_set_warmup(lctx, true);
|
||||
|
||||
std::vector<llama_token> tmp;
|
||||
llama_token bos = llama_vocab_bos(vocab);
|
||||
llama_token eos = llama_vocab_eos(vocab);
|
||||
@@ -1400,7 +1419,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
|
||||
llama_memory_clear(llama_get_memory(lctx), true);
|
||||
llama_synchronize(lctx);
|
||||
llama_perf_context_reset(lctx);
|
||||
llama_set_warmup(lctx, false);
|
||||
|
||||
// reset samplers to reset RNG state after warmup to the seeded state
|
||||
res->reset_samplers();
|
||||
@@ -1542,6 +1560,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
cparams.n_ctx = params.n_ctx;
|
||||
cparams.n_seq_max = params.n_parallel;
|
||||
cparams.n_rs_seq = params.speculative.need_n_rs_seq();
|
||||
cparams.n_outputs_max = std::max(params.n_outputs_max, 0);
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_ubatch = params.n_ubatch;
|
||||
cparams.n_threads = params.cpuparams.n_threads;
|
||||
@@ -1963,36 +1982,37 @@ bool common_replay_last_token(struct llama_context * ctx, llama_token last_token
|
||||
|
||||
bool common_prompt_batch_decode(
|
||||
struct llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens,
|
||||
const std::vector<llama_token> & all_tokens,
|
||||
int n_new,
|
||||
int & n_past,
|
||||
int n_batch,
|
||||
std::string_view state_path,
|
||||
bool save_state) {
|
||||
const int n_eval = tokens.size();
|
||||
if (n_eval == 0) {
|
||||
if (n_new == 0) {
|
||||
return true;
|
||||
}
|
||||
const int offset = all_tokens.size() - n_new;
|
||||
|
||||
if (save_state && n_eval > 1) {
|
||||
const int n_tokens_before_last = n_eval - 1;
|
||||
if (save_state && n_new > 1) {
|
||||
const int n_tokens_before_last = n_new - 1;
|
||||
|
||||
GGML_ASSERT(n_eval <= n_batch);
|
||||
GGML_ASSERT(n_new <= n_batch);
|
||||
|
||||
// Decode all but the last token so we can save the memory state before decoding the last token.
|
||||
// This is done so we can restore the session state later and replay the last token.
|
||||
// Memory implementations in recurrent/hybrid models don't support removing tokens from their
|
||||
// memory, so we can't just remove the last token from the memory and replay the last token which
|
||||
// is the reason for this logic.
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_tokens_before_last))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_tokens_before_last))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_tokens_before_last;
|
||||
|
||||
llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
|
||||
LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);
|
||||
llama_state_save_file(ctx, state_path.data(), all_tokens.data(), all_tokens.size());
|
||||
LOG_INF("saved session before last token to %s, n_new = %zu\n", state_path.data(), all_tokens.size());
|
||||
|
||||
llama_token last_token = tokens.back();
|
||||
llama_token last_token = all_tokens.back();
|
||||
llama_batch batch = llama_batch_get_one(&last_token, 1);
|
||||
int32_t pos = n_past;
|
||||
batch.pos = &pos;
|
||||
@@ -2003,11 +2023,11 @@ bool common_prompt_batch_decode(
|
||||
}
|
||||
n_past++;
|
||||
} else {
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(tokens.data()), n_eval))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token*>(all_tokens.data() + offset), n_new))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_eval;
|
||||
n_past += n_new;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
@@ -277,6 +277,7 @@ struct common_params_sampling {
|
||||
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
|
||||
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
|
||||
bool reasoning_control = false; // create the budget sampler on demand so reasoning can be ended at runtime
|
||||
|
||||
bool backend_sampling = false;
|
||||
|
||||
@@ -305,6 +306,8 @@ struct common_params_speculative_draft {
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.0f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
bool backend_sampling = true; // offload draft sampling to the backend (default: on)
|
||||
|
||||
common_params_model mparams;
|
||||
|
||||
llama_context * ctx_tgt = nullptr;
|
||||
@@ -429,6 +432,7 @@ struct common_params {
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
int32_t n_outputs_max = 0; // max outputs in a batch (0 = n_batch)
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
|
||||
@@ -477,7 +481,7 @@ struct common_params {
|
||||
|
||||
std::set<std::string> model_alias; // model aliases // NOLINT
|
||||
std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
std::string hf_token = ""; // HF token (aka bearer token) // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string system_prompt = ""; // NOLINT
|
||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||
@@ -505,6 +509,7 @@ struct common_params {
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
bool offline = false;
|
||||
bool skip_download = false; // skip model file downloading
|
||||
|
||||
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
@@ -585,14 +590,15 @@ struct common_params {
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
bool reuse_port = false; // allow multiple sockets to bind to the same port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_read = 3600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t sse_ping_interval = 30; // SSE ping interval in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
@@ -615,11 +621,7 @@ struct common_params {
|
||||
std::map<std::string, std::string> default_template_kwargs;
|
||||
|
||||
// UI configs
|
||||
#ifdef LLAMA_UI_DEFAULT_ENABLED
|
||||
bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
|
||||
#else
|
||||
bool ui = true; // default to enabled when not set
|
||||
#endif
|
||||
bool ui = true;
|
||||
|
||||
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
|
||||
bool webui = ui;
|
||||
@@ -733,6 +735,7 @@ std::string string_format(const char * fmt, ...);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string string_get_sortable_timestamp();
|
||||
std::string string_lcs(std::string_view a, std::string_view b);
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
|
||||
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
|
||||
@@ -927,7 +930,8 @@ void common_batch_add(
|
||||
// tokens from memory, so this approach works across all model architectures.
|
||||
bool common_prompt_batch_decode(
|
||||
struct llama_context * ctx,
|
||||
const std::vector<llama_token> & embd,
|
||||
const std::vector<llama_token> & all_tokens,
|
||||
int n_new,
|
||||
int & n_past,
|
||||
int n_batch,
|
||||
std::string_view state_path,
|
||||
|
||||
@@ -292,6 +292,10 @@ static int common_download_file_single_online(const std::string & url,
|
||||
|
||||
const bool file_exists = std::filesystem::exists(path);
|
||||
|
||||
if (!file_exists && opts.skip_download) {
|
||||
return -2; // file is missing and download is disabled
|
||||
}
|
||||
|
||||
if (file_exists && skip_etag) {
|
||||
LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
|
||||
return 304; // 304 Not Modified - fake cached response
|
||||
@@ -357,6 +361,10 @@ static int common_download_file_single_online(const std::string & url,
|
||||
LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
|
||||
return 304; // 304 Not Modified - fake cached response
|
||||
}
|
||||
// past this point, the file exists but is different from the server version, so we need to redownload it
|
||||
if (opts.skip_download) {
|
||||
return -2; // special code to indicate that the download was skipped due to etag mismatch
|
||||
}
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return -1;
|
||||
@@ -775,13 +783,13 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode
|
||||
}
|
||||
|
||||
common_download_model_result common_download_model(const common_params_model & model,
|
||||
const common_download_opts & opts,
|
||||
bool download_mmproj,
|
||||
bool download_mtp) {
|
||||
const common_download_opts & opts) {
|
||||
common_download_model_result result;
|
||||
std::vector<download_task> tasks;
|
||||
hf_plan hf;
|
||||
|
||||
bool download_mmproj = opts.download_mmproj;
|
||||
bool download_mtp = opts.download_mtp;
|
||||
bool is_hf = !model.hf_repo.empty();
|
||||
|
||||
if (is_hf) {
|
||||
@@ -806,18 +814,22 @@ common_download_model_result common_download_model(const common_params_model &
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<std::future<bool>> futures;
|
||||
std::vector<std::future<int>> futures;
|
||||
for (const auto & task : tasks) {
|
||||
futures.push_back(std::async(std::launch::async,
|
||||
[&task, &opts, is_hf]() {
|
||||
int status = common_download_file_single(task.url, task.path, opts, is_hf);
|
||||
return is_http_status_ok(status);
|
||||
return common_download_file_single(task.url, task.path, opts, is_hf);
|
||||
}
|
||||
));
|
||||
}
|
||||
|
||||
for (auto & f : futures) {
|
||||
if (!f.get()) {
|
||||
int status = f.get();
|
||||
if (status == -2 && opts.skip_download) {
|
||||
throw common_skip_download_exception();
|
||||
}
|
||||
bool is_ok = is_http_status_ok(status);
|
||||
if (!is_ok) {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,6 +52,9 @@ struct common_download_opts {
|
||||
std::string bearer_token;
|
||||
common_header_list headers;
|
||||
bool offline = false;
|
||||
bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid
|
||||
bool download_mmproj = false;
|
||||
bool download_mtp = false;
|
||||
common_download_callback * callback = nullptr;
|
||||
};
|
||||
|
||||
@@ -62,6 +65,11 @@ struct common_download_model_result {
|
||||
std::string mtp_path;
|
||||
};
|
||||
|
||||
// thrown if the file is missing or invalid (e.g. ETag check failed)
// Derives from std::runtime_error with the fixed message "skip download",
// so callers can catch this type specifically or fall back to catching
// std::runtime_error / std::exception.
|
||||
struct common_skip_download_exception : public std::runtime_error {
|
||||
common_skip_download_exception() : std::runtime_error("skip download") {}
|
||||
};
|
||||
|
||||
// Download model from HuggingFace repo or URL
|
||||
//
|
||||
// input (via model struct):
|
||||
@@ -89,9 +97,7 @@ struct common_download_model_result {
|
||||
// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
|
||||
common_download_model_result common_download_model(
|
||||
const common_params_model & model,
|
||||
const common_download_opts & opts = {},
|
||||
bool download_mmproj = false,
|
||||
bool download_mtp = false
|
||||
const common_download_opts & opts = {}
|
||||
);
|
||||
|
||||
// returns list of cached models
|
||||
@@ -99,6 +105,7 @@ std::vector<common_cached_model_info> common_list_cached_models();
|
||||
|
||||
// download single file from url to local path
|
||||
// returns status code or -1 on error
|
||||
// returns -2 if the download was skipped due to ETag mismatch (file outdated, skip_download=true)
|
||||
// skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
|
||||
int common_download_file_single(const std::string & url,
|
||||
const std::string & path,
|
||||
|
||||
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
|
||||
16
common/fit.h
16
common/fit.h
@@ -1,6 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
enum common_params_fit_status {
|
||||
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
@@ -30,3 +35,14 @@ void common_fit_print(
|
||||
struct llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
|
||||
// Load a model + context with no_alloc and return the per-device memory breakdown.
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const struct llama_model_params * mparams,
|
||||
const struct llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
enum ggml_log_level log_level);
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#include "ngram-mod.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
//
|
||||
// common_ngram_mod
|
||||
//
|
||||
|
||||
@@ -247,3 +247,24 @@ common_reasoning_budget_state common_reasoning_budget_get_state(const struct lla
|
||||
}
|
||||
return ((const common_reasoning_budget_ctx *)smpl->ctx)->state;
|
||||
}
|
||||
|
||||
// Manually move a reasoning-budget sampler into the FORCING state.
//
// Returns true only when the transition actually happened, i.e. `smpl` is
// non-null and its state was REASONING_BUDGET_COUNTING. Returns false (and
// changes nothing) for a null sampler or for any other state.
bool common_reasoning_budget_force(struct llama_sampler * smpl) {
|
||||
if (!smpl) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// NOTE(review): unchecked cast - assumes `smpl` was created as a
// reasoning-budget sampler so that smpl->ctx really is a
// common_reasoning_budget_ctx; confirm callers guarantee this.
auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
|
||||
|
||||
// only a sampler that is actively counting down the budget may be forced;
|
||||
// any other state (idle, already forcing/waiting, or done) is left untouched
|
||||
if (ctx->state != REASONING_BUDGET_COUNTING) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Switch state and reset the bookkeeping used while forcing.
// presumably force_pos indexes into the forced token sequence and
// end_matcher tracks a partially matched end tag - TODO confirm.
ctx->state = REASONING_BUDGET_FORCING;
|
||||
ctx->force_pos = 0;
|
||||
ctx->end_matcher.reset();
|
||||
LOG_INF("reasoning-budget: forced into forcing state (manual transition)\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -40,3 +40,7 @@ struct llama_sampler * common_reasoning_budget_init(
|
||||
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE);
|
||||
|
||||
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
|
||||
|
||||
// Manually transition the reasoning budget sampler into the FORCING state.
|
||||
// Returns true if the transition occurred.
|
||||
bool common_reasoning_budget_force(struct llama_sampler * smpl);
|
||||
|
||||
@@ -293,7 +293,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
}
|
||||
|
||||
// reasoning budget sampler (skip when budget is unlimited unless a lazy grammar is active, which needs rbudget for thinking-block suppression)
|
||||
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0)) {
|
||||
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty() && (params.grammar_lazy || params.reasoning_budget_tokens >= 0 || params.reasoning_control)) {
|
||||
rbudget = common_reasoning_budget_init(
|
||||
vocab,
|
||||
params.reasoning_budget_start,
|
||||
@@ -661,6 +661,14 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
return llama_sampler_get_seed(gsmpl->chain);
|
||||
}
|
||||
|
||||
// Convenience wrapper: forward a "force now" request to the reasoning-budget
// sampler held by `gsmpl` (its `rbudget` member).
//
// Returns false when `gsmpl` is null; otherwise returns whatever
// common_reasoning_budget_force() reports for gsmpl->rbudget.
bool common_sampler_reasoning_budget_force(struct common_sampler * gsmpl) {
|
||||
if (!gsmpl) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// rbudget is passed through as-is; a null rbudget is left for the callee to handle
return common_reasoning_budget_force(gsmpl->rbudget);
|
||||
}
|
||||
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
|
||||
|
||||
@@ -87,6 +87,9 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
|
||||
|
||||
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
||||
|
||||
// force the reasoning budget sampler (if any) to begin forcing its end sequence now.
|
||||
bool common_sampler_reasoning_budget_force(struct common_sampler * gsmpl);
|
||||
|
||||
// helpers
|
||||
|
||||
// access the internal list of current candidate tokens
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h" // staging API: llama_set_embeddings_pre_norm / llama_get_embeddings_pre_norm_ith (used by MTP)
|
||||
#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP)
|
||||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "ngram-map.h"
|
||||
@@ -33,16 +33,15 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
|
||||
};
|
||||
|
||||
static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) {
|
||||
if (devices.empty()) {
|
||||
return "default";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (i > 0) result += ", ";
|
||||
if (devices[i] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (!result.empty()) result += ", ";
|
||||
result += ggml_backend_dev_name(devices[i]);
|
||||
}
|
||||
return result;
|
||||
return result.empty() ? "default" : result;
|
||||
}
|
||||
|
||||
struct common_speculative_config {
|
||||
@@ -163,7 +162,7 @@ struct common_speculative_impl {
|
||||
virtual bool need_embd() const = 0;
|
||||
|
||||
// true if this implementation requires the target context to extract pre-norm embeddings
|
||||
virtual bool need_embd_pre_norm() const { return false; }
|
||||
virtual bool need_embd_nextn() const { return false; }
|
||||
};
|
||||
|
||||
struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
@@ -414,6 +413,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
|
||||
std::vector<common_sampler_ptr> smpls;
|
||||
|
||||
// backend sampler chain per seq, attached to ctx_dft
|
||||
std::vector<llama_sampler *> backend_chains;
|
||||
|
||||
int32_t n_embd = 0;
|
||||
|
||||
// Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
|
||||
@@ -445,7 +447,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
|
||||
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
|
||||
this->params.n_gpu_layers,
|
||||
ggml_type_name(this->params.cache_type_k),
|
||||
@@ -469,8 +471,24 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
|
||||
llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
|
||||
llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
|
||||
// offload draft sampling to the backend
|
||||
backend_chains.assign(n_seq, nullptr);
|
||||
if (this->params.backend_sampling) {
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||
llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
|
||||
|
||||
if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
|
||||
LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
|
||||
llama_sampler_free(chain);
|
||||
chain = nullptr;
|
||||
}
|
||||
backend_chains[seq_id] = chain;
|
||||
}
|
||||
}
|
||||
|
||||
llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false);
|
||||
llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
|
||||
|
||||
@@ -484,6 +502,18 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
}
|
||||
|
||||
~common_speculative_impl_draft_mtp() override {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
|
||||
if (backend_chains[seq_id] == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (ctx_dft) {
|
||||
llama_set_sampler(ctx_dft, seq_id, nullptr);
|
||||
}
|
||||
llama_sampler_free(backend_chains[seq_id]);
|
||||
}
|
||||
backend_chains.clear();
|
||||
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
@@ -553,7 +583,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
// ^--- this is a problem
|
||||
// TODO: this is generally true, but it would be nice to assert it
|
||||
{
|
||||
const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt);
|
||||
const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt);
|
||||
std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
|
||||
|
||||
//{
|
||||
@@ -595,7 +625,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
verify_h[seq_id].resize((size_t) n_rows * n_embd);
|
||||
|
||||
for (int32_t i = 0; i < n_rows; ++i) {
|
||||
const float * h = llama_get_embeddings_pre_norm_ith(ctx_tgt, i_batch_beg[seq_id] + i);
|
||||
const float * h = llama_get_embeddings_nextn_ith(ctx_tgt, i_batch_beg[seq_id] + i);
|
||||
std::memcpy(verify_h[seq_id].data() + (size_t) i * n_embd, h, row_bytes);
|
||||
}
|
||||
|
||||
@@ -656,7 +686,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
auto * smpl = smpls[seq_id].get();
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, i_batch, true);
|
||||
h_row = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch);
|
||||
h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
|
||||
++i_batch;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||
@@ -742,7 +772,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool need_embd_pre_norm() const override {
|
||||
bool need_embd_nextn() const override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
@@ -1287,6 +1317,40 @@ static uint32_t common_get_enabled_speculative_configs(const std::vector<common_
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the largest per-type draft limit over all speculative types listed
// in spec->types. The result starts at 0 and is never negative.
//
// NOTE(review): `spec` is dereferenced without a null check - callers must
// pass a valid pointer.
int32_t common_speculative_n_max(const common_params_speculative * spec) {
|
||||
int32_t n_max = 0;
|
||||
|
||||
for (const auto type : spec->types) {
|
||||
switch (type) {
|
||||
// all draft-model based types share the same limit: spec->draft.n_max
case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE:
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_MTP:
|
||||
// inner std::max(0, ...) clamps a negative n_max to 0
n_max = std::max(n_max, std::max(0, spec->draft.n_max));
|
||||
break;
|
||||
// the ngram simple/map variants contribute their own size_m field
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:
|
||||
n_max = std::max(n_max, (int32_t) spec->ngram_simple.size_m);
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
|
||||
n_max = std::max(n_max, (int32_t) spec->ngram_map_k.size_m);
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V:
|
||||
n_max = std::max(n_max, (int32_t) spec->ngram_map_k4v.size_m);
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD:
|
||||
n_max = std::max(n_max, std::max(0, spec->ngram_mod.n_max));
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE:
|
||||
// hard-coded limit of 8 for the ngram cache
// NOTE(review): presumably mirrors a constant defined by the ngram-cache implementation - confirm
n_max = std::max(n_max, (int32_t) 8);
|
||||
break;
|
||||
// NONE and the COUNT sentinel contribute nothing
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
case COMMON_SPECULATIVE_TYPE_COUNT:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return n_max;
|
||||
}
|
||||
|
||||
// initialization of the speculative decoding system
|
||||
//
|
||||
common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq) {
|
||||
@@ -1295,8 +1359,6 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
{
|
||||
uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
|
||||
|
||||
bool has_draft_model_path = !params.draft.mparams.path.empty();
|
||||
|
||||
bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
|
||||
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
|
||||
bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
|
||||
@@ -1329,16 +1391,6 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
if (has_ngram_cache) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
|
||||
}
|
||||
if (has_draft_simple) {
|
||||
if (!has_draft_model_path) {
|
||||
LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__);
|
||||
has_draft_simple = false;
|
||||
}
|
||||
} else if (has_draft_model_path && !has_mtp && !has_draft_eagle3) {
|
||||
LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__);
|
||||
has_draft_simple = true;
|
||||
}
|
||||
|
||||
if (has_draft_simple) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params));
|
||||
}
|
||||
@@ -1487,13 +1539,13 @@ bool common_speculative_need_embd(common_speculative * spec) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool common_speculative_need_embd_pre_norm(common_speculative * spec) {
|
||||
bool common_speculative_need_embd_nextn(common_speculative * spec) {
|
||||
if (spec == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (auto & impl : spec->impls) {
|
||||
if (impl->need_embd_pre_norm()) {
|
||||
if (impl->need_embd_nextn()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,9 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
||||
// convert type to string
|
||||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
// return the max number of draft tokens based on the speculative parameters
|
||||
int32_t common_speculative_n_max(const common_params_speculative * spec);
|
||||
|
||||
common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq);
|
||||
|
||||
void common_speculative_free(common_speculative * spec);
|
||||
@@ -56,8 +59,8 @@ bool common_speculative_process(common_speculative * spec, const llama_batch & b
|
||||
// true if any implementation requires target post-norm embeddings to be extracted
|
||||
bool common_speculative_need_embd(common_speculative * spec);
|
||||
|
||||
// true if any implementation requires target pre-norm embeddings to be extracted
|
||||
bool common_speculative_need_embd_pre_norm(common_speculative * spec);
|
||||
// true if any implementation requires target nextn embeddings to be extracted
|
||||
bool common_speculative_need_embd_nextn(common_speculative * spec);
|
||||
|
||||
// generate drafts for the sequences specified with `common_speculative_get_draft_params`
|
||||
void common_speculative_draft(common_speculative * spec);
|
||||
|
||||
@@ -47,6 +47,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"DeepseekForCausalLM": "deepseek",
|
||||
"DeepseekV2ForCausalLM": "deepseek",
|
||||
"DeepseekV3ForCausalLM": "deepseek",
|
||||
"DeepseekV32ForCausalLM": "deepseek",
|
||||
"DistilBertForMaskedLM": "bert",
|
||||
"DistilBertForSequenceClassification": "bert",
|
||||
"DistilBertModel": "bert",
|
||||
@@ -57,6 +58,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Ernie4_5_ForCausalLM": "ernie",
|
||||
"Ernie4_5_MoeForCausalLM": "ernie",
|
||||
"EuroBertModel": "bert",
|
||||
"Exaone4_5_ForConditionalGeneration": "exaone",
|
||||
"Exaone4ForCausalLM": "exaone",
|
||||
"ExaoneForCausalLM": "exaone",
|
||||
"ExaoneMoEForCausalLM": "exaone",
|
||||
@@ -74,6 +76,8 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Gemma3nForCausalLM": "gemma",
|
||||
"Gemma3nForConditionalGeneration": "gemma",
|
||||
"Gemma4ForConditionalGeneration": "gemma",
|
||||
"Gemma4ForCausalLM": "gemma",
|
||||
"Gemma4UnifiedForConditionalGeneration": "gemma",
|
||||
"GemmaForCausalLM": "gemma",
|
||||
"Glm4ForCausalLM": "glm",
|
||||
"Glm4MoeForCausalLM": "glm",
|
||||
@@ -132,6 +136,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Mamba2ForCausalLM": "mamba",
|
||||
"MambaForCausalLM": "mamba",
|
||||
"MambaLMHeadModel": "mamba",
|
||||
"MellumForCausalLM": "mellum",
|
||||
"MiMoV2FlashForCausalLM": "mimo",
|
||||
"MiMoV2ForCausalLM": "mimo",
|
||||
"MiniCPM3ForCausalLM": "minicpm",
|
||||
@@ -212,9 +217,11 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Starcoder2ForCausalLM": "starcoder",
|
||||
"Step3p5ForCausalLM": "step3",
|
||||
"StepVLForConditionalGeneration": "step3",
|
||||
"Step3p7ForConditionalGeneration": "step3",
|
||||
"T5EncoderModel": "t5",
|
||||
"T5ForConditionalGeneration": "t5",
|
||||
"T5WithLMHeadModel": "t5",
|
||||
"TalkieForCausalLM": "talkie",
|
||||
"UMT5ForConditionalGeneration": "t5",
|
||||
"UMT5Model": "t5",
|
||||
"UltravoxModel": "ultravox",
|
||||
@@ -234,11 +241,14 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"AudioFlamingo3ForConditionalGeneration": "ultravox",
|
||||
"CogVLMForCausalLM": "cogvlm",
|
||||
"DeepseekOCR2ForCausalLM": "deepseek",
|
||||
"DeepseekOCRForCausalLM": "deepseek",
|
||||
"DotsOCRForCausalLM": "dotsocr",
|
||||
"Exaone4_5_ForConditionalGeneration": "exaone",
|
||||
"Gemma3ForConditionalGeneration": "gemma",
|
||||
"Gemma3nForConditionalGeneration": "gemma",
|
||||
"Gemma4ForConditionalGeneration": "gemma",
|
||||
"Gemma4UnifiedForConditionalGeneration": "gemma",
|
||||
"Glm4vForConditionalGeneration": "qwen3vl",
|
||||
"Glm4vMoeForConditionalGeneration": "qwen3vl",
|
||||
"GlmOcrForConditionalGeneration": "qwen3vl",
|
||||
@@ -277,6 +287,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"Sarashina2VisionForCausalLM": "sarashina2",
|
||||
"SmolVLMForConditionalGeneration": "smolvlm",
|
||||
"StepVLForConditionalGeneration": "step3",
|
||||
"Step3p7ForConditionalGeneration": "step3",
|
||||
"UltravoxModel": "ultravox",
|
||||
"VoxtralForConditionalGeneration": "ultravox",
|
||||
"YoutuVLForConditionalGeneration": "youtuvl",
|
||||
|
||||
@@ -119,7 +119,8 @@ class ModelBase:
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
|
||||
disable_mistral_community_chat_template: bool = False,
|
||||
sentence_transformers_dense_modules: bool = False,
|
||||
fuse_gate_up_exps: bool = False):
|
||||
fuse_gate_up_exps: bool = False,
|
||||
fp8_as_q8: bool = False):
|
||||
if type(self) is ModelBase or \
|
||||
type(self) is TextModel or \
|
||||
type(self) is MmprojModel:
|
||||
@@ -148,6 +149,8 @@ class ModelBase:
|
||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
self._is_nvfp4 = False
|
||||
self._is_mxfp4 = False
|
||||
self._fp8_as_q8 = fp8_as_q8
|
||||
self._fp8_dequantized: set[str] = set()
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
@@ -429,6 +432,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith(".activation_scale"): # unused
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused
|
||||
@@ -440,6 +445,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith(".qscale_act"):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "gptq":
|
||||
@@ -467,7 +474,14 @@ class ModelBase:
|
||||
elif quant_method == "compressed-tensors":
|
||||
quant_format = quant_config["format"]
|
||||
groups = quant_config["config_groups"]
|
||||
if len(groups) > 1:
|
||||
nvfp4_compressed_tensors = (
|
||||
quant_format == "nvfp4-pack-quantized"
|
||||
or quant_format == "mixed-precision"
|
||||
and bool(groups)
|
||||
and all(g.get("format") == "nvfp4-pack-quantized" for g in groups.values() if isinstance(g, dict))
|
||||
)
|
||||
|
||||
if len(groups) > 1 and not nvfp4_compressed_tensors:
|
||||
raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
|
||||
weight_config = tuple(groups.values())[0]["weights"]
|
||||
|
||||
@@ -476,6 +490,11 @@ class ModelBase:
|
||||
strategy = weight_config.get("strategy")
|
||||
assert strategy == "channel" or strategy == "block"
|
||||
assert weight_config.get("group_size") is None # didn't find a model using this yet
|
||||
is_fp8 = (
|
||||
quant_format == "float-quantized"
|
||||
and weight_config.get("type") == "float"
|
||||
and weight_config.get("num_bits") == 8
|
||||
)
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
@@ -483,6 +502,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8 and is_fp8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
elif quant_format == "pack-quantized":
|
||||
assert weight_config.get("strategy") == "group"
|
||||
assert weight_config.get("type", "int") == "int"
|
||||
@@ -505,6 +526,9 @@ class ModelBase:
|
||||
tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
|
||||
if (base_name + "_zero_point") in self.model_tensors:
|
||||
tensors_to_remove.append(base_name + "_zero_point")
|
||||
elif nvfp4_compressed_tensors:
|
||||
# Don't error from compressed-tensors, we'll handle them in _generate_nvfp4_tensors
|
||||
pass
|
||||
else:
|
||||
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
|
||||
elif quant_method == "modelopt":
|
||||
@@ -514,10 +538,18 @@ class ModelBase:
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
if weight_name not in self.model_tensors:
|
||||
tensors_to_remove.append(name)
|
||||
continue
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
is_fp8_weight = False
|
||||
if self._fp8_as_q8:
|
||||
is_fp8_weight = w().dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
|
||||
tensors_to_remove.append(name)
|
||||
if is_fp8_weight:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method is not None:
|
||||
@@ -605,8 +637,10 @@ class ModelBase:
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
||||
del name, new_name, bid, n_dims # unused
|
||||
|
||||
del new_name, bid # unused
|
||||
# Force FP8-original tensors to Q8_0 when requested; Q8_0 is faster than F16/BF16.
|
||||
if self._fp8_as_q8 and name in self._fp8_dequantized and n_dims >= 2:
|
||||
return gguf.GGMLQuantizationType.Q8_0
|
||||
return False
|
||||
|
||||
# some models need extra generated tensors (like rope_freqs)
|
||||
@@ -746,10 +780,13 @@ class ModelBase:
|
||||
del experts, merged
|
||||
|
||||
def prepare_tensors(self):
|
||||
# detect NVFP4 quantization (ModelOpt format)
|
||||
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
|
||||
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
|
||||
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
|
||||
# detect NVFP4 quantization (ModelOpt and Compressed-tensors formats)
|
||||
quantization_config = self.hparams.get("quantization_config") or {}
|
||||
quant_algo = quantization_config.get("quant_algo")
|
||||
quant_method = quantization_config.get("quant_method")
|
||||
quant_format = quantization_config.get("format")
|
||||
quant_groups = quantization_config.get("config_groups") or {}
|
||||
quant_layers = quantization_config.get("quantized_layers") or {}
|
||||
quant_config_file = self.dir_model / "hf_quant_config.json"
|
||||
|
||||
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
|
||||
@@ -760,13 +797,25 @@ class ModelBase:
|
||||
producer_name = (producer.get("name") or "").lower()
|
||||
if quant_method is None:
|
||||
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
|
||||
quant_method = producer_name
|
||||
quant_algo = quant_config.get("quant_algo", quant_algo)
|
||||
quant_method = quant_config.get("quant_method", quant_method)
|
||||
quant_format = quant_config.get("format", quant_format)
|
||||
quant_groups = quant_config.get("config_groups", quant_groups) or {}
|
||||
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
|
||||
|
||||
# Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
|
||||
# per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
|
||||
nvfp4_compressed_tensors = quant_method == "compressed-tensors" and (
|
||||
quant_format == "nvfp4-pack-quantized"
|
||||
or quant_format == "mixed-precision"
|
||||
and bool(quant_groups)
|
||||
and all(g.get("format") == "nvfp4-pack-quantized" for g in quant_groups.values() if isinstance(g, dict))
|
||||
)
|
||||
if quant_algo != "NVFP4":
|
||||
if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
|
||||
if nvfp4_compressed_tensors:
|
||||
quant_algo = "NVFP4"
|
||||
elif any(str(v.get("quant_algo")).endswith("NVFP4") for v in quant_layers.values() if isinstance(v, dict)):
|
||||
quant_algo = "NVFP4"
|
||||
|
||||
self._is_nvfp4 = quant_algo == "NVFP4"
|
||||
@@ -776,6 +825,28 @@ class ModelBase:
|
||||
# This must run before dequant_model so NVFP4 tensors are removed
|
||||
# from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
|
||||
if self._is_nvfp4:
|
||||
if nvfp4_compressed_tensors:
|
||||
# Convert compressed-tensors 'global' scales into the reciprocal
|
||||
def inverse_scale(gen):
|
||||
def load():
|
||||
scale = LazyTorchTensor.to_eager(gen()).float()
|
||||
return 1.0 / scale
|
||||
return load
|
||||
|
||||
# Change the compressed-tensors names to the ModelOpt names for handling consistently later
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if name.endswith(".weight_packed"):
|
||||
weight_name = name.removesuffix("_packed")
|
||||
if weight_name not in self.model_tensors:
|
||||
self.model_tensors[weight_name] = self.model_tensors.pop(name)
|
||||
elif name.endswith(".weight_global_scale"):
|
||||
scale2_name = name.replace(".weight_global_scale", ".weight_scale_2")
|
||||
if scale2_name not in self.model_tensors:
|
||||
self.model_tensors[scale2_name] = inverse_scale(self.model_tensors.pop(name))
|
||||
elif name.endswith(".input_global_scale"):
|
||||
input_scale_name = name.replace(".input_global_scale", ".input_scale")
|
||||
if input_scale_name not in self.model_tensors:
|
||||
self.model_tensors[input_scale_name] = inverse_scale(self.model_tensors.pop(name))
|
||||
self._generate_nvfp4_tensors()
|
||||
|
||||
self.dequant_model()
|
||||
@@ -844,6 +915,8 @@ class ModelBase:
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D_Q,
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D_K,
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D_V,
|
||||
# DSA indexer weights should be F32
|
||||
gguf.MODEL_TENSOR.INDEXER_PROJ,
|
||||
)
|
||||
)
|
||||
or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
|
||||
@@ -1067,7 +1140,7 @@ class TextModel(ModelBase):
|
||||
# Skip multimodal tensors
|
||||
if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
|
||||
or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
|
||||
or "vision_" in name or "audio_" in name or "sam_model" in name \
|
||||
or "vision_" in name or "audio_" in name \
|
||||
or "token2wav." in name or "code2wav." in name \
|
||||
or "projector." in name or "pre_mm_projector_norm" in name \
|
||||
or "image_newline" in name or "view_seperator" in name \
|
||||
@@ -1374,6 +1447,9 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
|
||||
# ref: https://huggingface.co/evilfreelancer/ruGPT3XL
|
||||
res = "gpt-2"
|
||||
if chkhsh == "9e454714343b69b99b71795c1d27a68c2a1d15dab111f4d353109f966af29da7":
|
||||
# ref: https://huggingface.co/LiquidAI/LFM2.5-8B-A1B
|
||||
res = "lfm2"
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
@@ -1525,7 +1601,7 @@ class TextModel(ModelBase):
|
||||
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
|
||||
res = "midm-2.0"
|
||||
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
|
||||
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
|
||||
# ref: https://huggingface.co/LiquidAI/LFM2.5-350M
|
||||
res = "lfm2"
|
||||
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
|
||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
|
||||
@@ -1575,6 +1651,21 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
|
||||
# ref: https://huggingface.co/sarvamai/sarvam-30b
|
||||
res = "sarvam-moe"
|
||||
if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
|
||||
# ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
|
||||
res = "talkie"
|
||||
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
|
||||
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
|
||||
res = "minicpm5"
|
||||
if chkhsh == "f241072145675bf8322086f115aebad05e9f869557a238bf2150a2a417d1bf60":
|
||||
# ref: https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2
|
||||
res = "granite-embed-multi-97m"
|
||||
if chkhsh == "789696f5946cc0fc59371f39f6097cafed196b3acded6140432f26bbb1ae1669":
|
||||
# ref: https://huggingface.co/ibm-granite/granite-embedding-311m-multilingual-r2
|
||||
res = "granite-embed-multi-311m"
|
||||
if chkhsh == "9dcf830ee9990cdbf78cc523a5f7bd9ad8f3f9890c2d3581d2785ad10f07049d":
|
||||
# ref: https://huggingface.co/JetBrains/Mellum2-12B-A2.5B-Base
|
||||
res = "mellum2"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -1610,6 +1701,57 @@ class TextModel(ModelBase):
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_whitespace(self) -> None:
|
||||
tokens, toktypes, _ = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("whitespace")
|
||||
self.gguf_writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_hybriddna(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
|
||||
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
|
||||
# k-mer's own id (llama.cpp strips it on detokenization)
|
||||
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
|
||||
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token: str = reverse_vocab[i]
|
||||
if token in added_vocab:
|
||||
if added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(token)
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
self.gguf_writer.add_tokenizer_model("hybriddna")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_qwen(self):
|
||||
from .qwen import QwenModel
|
||||
|
||||
@@ -2323,10 +2465,9 @@ class MmprojModel(ModelBase):
|
||||
raise KeyError(f"could not find any of: {keys}")
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
@@ -2461,7 +2602,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
|
||||
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
|
||||
# For text conversion we route to a dedicated text-only class.
|
||||
# TODO: refactor this later to avoid adding exception here
|
||||
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
|
||||
if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM", "Exaone4_5_ForConditionalGeneration", "Step3p7ForConditionalGeneration"):
|
||||
return arch
|
||||
|
||||
# if "architectures" is found in the sub-config, use that instead
|
||||
|
||||
@@ -571,7 +571,16 @@ class JinaBertV2Model(BertModel):
|
||||
if tokenizer_class == 'BertTokenizer':
|
||||
super().set_vocab()
|
||||
elif tokenizer_class == 'RobertaTokenizer':
|
||||
self._set_vocab_gpt2()
|
||||
pre_tokenizer_type = None
|
||||
tokenizer_json_path = self.dir_model / "tokenizer.json"
|
||||
if tokenizer_json_path.is_file():
|
||||
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
|
||||
pre_tokenizer_type = json.load(f).get("pre_tokenizer", {}).get("type")
|
||||
|
||||
if pre_tokenizer_type == "Whitespace":
|
||||
self._set_vocab_whitespace()
|
||||
else:
|
||||
self._set_vocab_gpt2()
|
||||
self.gguf_writer.add_token_type_count(2)
|
||||
else:
|
||||
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
|
||||
@@ -594,6 +603,12 @@ class ModernBertModel(BertModel):
|
||||
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
# FFN activation: ModernBert uses a GLU pair (ffn_up output is 2*n_ff). The
|
||||
# original ModernBERT uses GELU (-> GeGLU); some derivatives such as IBM
|
||||
# Granite Embedding 97m R2 use SiLU (-> SwiGLU). Persist this so the
|
||||
# llama.cpp graph can pick the matching activation.
|
||||
if hidden_act := self.hparams.get("hidden_activation"):
|
||||
self.gguf_writer.add_hidden_act(hidden_act)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
|
||||
@@ -16,10 +16,14 @@ from .qwen import QwenModel
|
||||
|
||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||
class DeepseekOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
|
||||
self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
|
||||
# default values below are taken from HF tranformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
@@ -49,22 +53,27 @@ class DeepseekOCRVisionModel(MmprojModel):
|
||||
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
|
||||
|
||||
vision_config['sam'] = vision_config['width']['sam_vit_b']
|
||||
vision_config.update(vision_config['width']['clip-l-14-224'])
|
||||
vision_config['hidden_size'] = vision_config['width']
|
||||
vision_config['num_heads'] = vision_config['heads']
|
||||
vision_config['intermediate_size'] = vision_config['heads'] * 4
|
||||
if vision_config['width'].get('clip-l-14-224') is not None:
|
||||
vision_config.update(vision_config['width']['clip-l-14-224'])
|
||||
if isinstance(vision_config['width'], int):
|
||||
vision_config['hidden_size'] = vision_config['width']
|
||||
if vision_config.get('heads') is not None:
|
||||
vision_config['num_heads'] = vision_config['heads']
|
||||
vision_config['intermediate_size'] = vision_config['heads'] * 4
|
||||
|
||||
return vision_config
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if ".embeddings." in name or 'pos_embed' in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
if ".rel_pos_h" in name or '.rel_pos_w' in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
if ".neck." in name or ".net_" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
for nq_name in ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_'):
|
||||
if nq_name in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.endswith("view_seperator"):
|
||||
data_torch = data_torch.unsqueeze(0)
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
@@ -81,6 +90,33 @@ class DeepseekOCRVisionModel(MmprojModel):
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekOCR2ForCausalLM")
|
||||
class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# the vision tower's qwen2 encoder is built from fixed defaults,
|
||||
# see build_qwen2_decoder_as_encoder() in deepencoderv2.py
|
||||
if self.hparams.get("patch_size") is None:
|
||||
self.hparams["patch_size"] = 16
|
||||
if self.hparams.get("intermediate_size") is None:
|
||||
self.hparams["intermediate_size"] = 4864
|
||||
if self.hparams.get("num_attention_heads") is None:
|
||||
self.hparams["num_attention_heads"] = 14
|
||||
super().set_gguf_parameters()
|
||||
# qwen2 encoder is GQA: 14 Q heads, 2 KV heads
|
||||
self.gguf_writer.add_vision_head_count_kv(2)
|
||||
|
||||
def get_vision_config(self) -> dict[str, Any]:
|
||||
vision_config = super().get_vision_config()
|
||||
vision_config['hidden_size'] = vision_config['width']['qwen2-0-5b']['dim']
|
||||
if vision_config.get('layers') is None:
|
||||
vision_config['layers'] = 24
|
||||
return vision_config
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekForCausalLM")
|
||||
class DeepseekModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK
|
||||
@@ -188,13 +224,21 @@ class DeepseekV2Model(TextModel):
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# special handling for Deepseek OCR
|
||||
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
|
||||
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
|
||||
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
# default jinja template
|
||||
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, _ = item
|
||||
# DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower)
|
||||
if "sam_model" in name or "qwen2_model" in name:
|
||||
return None
|
||||
return super().filter_tensors(item)
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_gpt2()
|
||||
@@ -386,3 +430,32 @@ class DeepseekV2Model(TextModel):
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekV32ForCausalLM")
|
||||
class DeepseekV32Model(DeepseekV2Model):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK32
|
||||
skip_mtp = False
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def set_vocab(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
assert getattr(tokenizer, "add_bos_token", False), "Change value of add_bos_token to true in tokenizer_config.json file."
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# NextN/MTP prediction layers
|
||||
if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
|
||||
self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
|
||||
|
||||
# DSA indexer parameters
|
||||
self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
|
||||
self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
|
||||
self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
|
||||
|
||||
@@ -3,14 +3,15 @@ from __future__ import annotations
|
||||
import math
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
from typing import Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import ModelBase, TextModel, gguf
|
||||
from .base import MmprojModel, ModelBase, TextModel, gguf
|
||||
from .qwenvl import Qwen2VLVisionModel
|
||||
|
||||
|
||||
@ModelBase.register("ExaoneForCausalLM")
|
||||
@@ -208,3 +209,97 @@ class ExaoneMoEModel(Exaone4Model):
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("Exaone4_5_ForConditionalGeneration")
|
||||
class Exaone4_5_TextModel(Exaone4Model):
|
||||
"""Text tower of EXAONE 4.5; Tensors match EXAONE4"""
|
||||
|
||||
model_arch = gguf.MODEL_ARCH.EXAONE4
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
|
||||
if n_nextn > 0:
|
||||
self.block_count = self.hparams["num_hidden_layers"] + n_nextn
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
|
||||
if n_nextn > 0:
|
||||
self.gguf_writer.add_nextn_predict_layers(n_nextn)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("mtp."):
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0) or 0)
|
||||
if n_nextn <= 0:
|
||||
return
|
||||
nh = self.hparams["num_hidden_layers"]
|
||||
if ".layers." in name:
|
||||
share = self.hparams.get("mtp_share_layers", False)
|
||||
mtp_bid = bid if bid is not None else 0
|
||||
if share:
|
||||
for k in range(n_nextn):
|
||||
nn = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{nh + k}")
|
||||
yield from super().modify_tensors(data_torch, nn, nh + k)
|
||||
return
|
||||
name = name.replace(f"mtp.layers.{mtp_bid}", f"model.layers.{mtp_bid + nh}")
|
||||
else:
|
||||
remapper = {
|
||||
"mtp.fc": gguf.MODEL_TENSOR.NEXTN_EH_PROJ,
|
||||
"mtp.pre_fc_norm_embedding": gguf.MODEL_TENSOR.NEXTN_ENORM,
|
||||
"mtp.pre_fc_norm_hidden": gguf.MODEL_TENSOR.NEXTN_HNORM,
|
||||
"mtp.norm": gguf.MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
|
||||
}
|
||||
_n = Path(name)
|
||||
key = _n.stem
|
||||
if key not in remapper:
|
||||
return
|
||||
for bid_mtp in range(nh, self.block_count):
|
||||
mapped_name = self.format_tensor_name(remapper[key], bid_mtp, suffix=_n.suffix)
|
||||
yield from ModelBase.modify_tensors(self, data_torch, mapped_name, bid_mtp)
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Exaone4_5_ForConditionalGeneration")
|
||||
class Exaone4_5VisionModel(Qwen2VLVisionModel):
|
||||
"""Vision tower for EXAONE 4.5; Qwen2-VL-style ViT (GQA) + patch merger"""
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
name = name.replace("model.visual.", "visual.", 1)
|
||||
return super().filter_tensors((name, gen))
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
MmprojModel.set_gguf_parameters(self)
|
||||
assert self.hparams_vision is not None
|
||||
hparams = self.hparams_vision
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.EXAONE4_5)
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
num_kv_head = self.find_vparam(["num_key_value_heads"], optional=True)
|
||||
if num_kv_head is not None:
|
||||
self.gguf_writer.add_vision_head_count_kv(num_kv_head)
|
||||
eps = hparams.get("rms_norm_eps", self.global_config.get("rms_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(eps)
|
||||
if (window_size := hparams.get("window_size")) is not None:
|
||||
self.gguf_writer.add_vision_window_size(window_size)
|
||||
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
|
||||
if fullatt_block_indexes:
|
||||
n_wa_pattern = fullatt_block_indexes[0] + 1
|
||||
for i in range(1, len(fullatt_block_indexes)):
|
||||
if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
|
||||
raise ValueError(f"Invalid EXAONE4.5 fullatt_block_indexes: {fullatt_block_indexes}")
|
||||
self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if ".qkv." in name:
|
||||
yield from ModelBase.modify_tensors(self, data_torch, name, bid)
|
||||
return
|
||||
|
||||
yield from Qwen2VLVisionModel.modify_tensors(self, data_torch, name, bid)
|
||||
|
||||
@@ -3,7 +3,7 @@ from __future__ import annotations
|
||||
import json
|
||||
import re
|
||||
|
||||
from typing import Callable, Iterable, TYPE_CHECKING
|
||||
from typing import Callable, Iterable, TYPE_CHECKING, Sequence
|
||||
|
||||
import torch
|
||||
|
||||
@@ -614,7 +614,7 @@ class Gemma3NModel(Gemma3Model):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration", "Gemma4ForCausalLM")
|
||||
class Gemma4Model(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4
|
||||
|
||||
@@ -765,6 +765,26 @@ class Gemma4Model(Gemma3Model):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
|
||||
class Gemma4UnifiedModel(Gemma4Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4
|
||||
|
||||
def _get_suppress_tokens(self) -> Sequence[int] | None:
|
||||
gen_cfg_path = self.dir_model / "generation_config.json"
|
||||
if gen_cfg_path.is_file():
|
||||
with open(gen_cfg_path, encoding="utf-8") as f:
|
||||
gen_cfg = json.load(f)
|
||||
return gen_cfg.get("suppress_tokens")
|
||||
return None
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
suppress_tokens = self._get_suppress_tokens()
|
||||
if suppress_tokens is not None:
|
||||
self.gguf_writer.add_suppress_tokens(suppress_tokens)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
class Gemma4VisionAudioModel(MmprojModel):
|
||||
has_audio_encoder = True
|
||||
@@ -786,14 +806,15 @@ class Gemma4VisionAudioModel(MmprojModel):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# vision params
|
||||
assert self.hparams_vision is not None
|
||||
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
|
||||
|
||||
# audio params
|
||||
if self.hparams_audio:
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
|
||||
assert self.hparams_audio is not None
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-6))
|
||||
|
||||
def is_audio_tensor(self, name: str) -> bool:
|
||||
return "audio_tower" in name or "embed_audio" in name
|
||||
@@ -838,3 +859,61 @@ class Gemma4VisionAudioModel(MmprojModel):
|
||||
data_torch = data_torch.permute(0, 3, 1, 2).contiguous()
|
||||
mapped_name = self.map_tensor_name(name, (".weight", ".bias", ".input_max", ".input_min", ".output_max", ".output_min"))
|
||||
yield (mapped_name, data_torch)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4UnifiedForConditionalGeneration")
|
||||
class Gemma4UnifiedVisionAudioModel(Gemma4VisionAudioModel):
|
||||
has_audio_encoder = True
|
||||
has_vision_encoder = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
assert self.hparams_audio is not None
|
||||
text_embd_dim = self.hparams_vision["mm_embed_dim"]
|
||||
self.hparams_vision["hidden_size"] = text_embd_dim
|
||||
self.hparams_audio["hidden_size"] = text_embd_dim
|
||||
# this is a transformer-less vision tower, the params below are redundant but set to avoid error
|
||||
self.hparams_vision["intermediate_size"] = 0
|
||||
self.hparams_vision["num_layers"] = 0
|
||||
self.hparams_vision["num_attention_heads"] = 0
|
||||
self.hparams_audio["intermediate_size"] = 0
|
||||
self.hparams_audio["num_layers"] = 0
|
||||
self.hparams_audio["num_attention_heads"] = 0
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4UV)
|
||||
self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4UA)
|
||||
|
||||
def modify_tensors(self, data_torch, name, bid):
|
||||
if name.endswith("pos_embedding"):
|
||||
name += ".weight"
|
||||
data_torch = data_torch.permute(1, 0, 2)
|
||||
elif ".pos_norm." in name:
|
||||
# rename to patch_ln3 to reuse the tensor name scheme
|
||||
name = name.replace(".pos_norm.", ".patch_ln3.")
|
||||
elif "patch_dense.weight" in name:
|
||||
# ggml im2col outputs in RR..GG..BB.. (CHW) order, but weight expects RGBRGB.. (HWC).
|
||||
# Permute columns so column i aligns with CHW input position i.
|
||||
assert self.hparams_vision is not None
|
||||
p = self.hparams_vision["model_patch_size"]
|
||||
i = torch.arange(p * p * 3)
|
||||
ch = i // (p * p)
|
||||
row = (i % (p * p)) // p
|
||||
col = i % p
|
||||
# perm[i] = HWC column index for CHW position i
|
||||
perm = row * p * 3 + col * 3 + ch
|
||||
data_torch = data_torch[:, perm]
|
||||
elif "patch_ln1.weight" in name or "patch_ln1.bias" in name:
|
||||
# same permutation for patch_ln1 as patch_dense to align with CHW input order
|
||||
assert self.hparams_vision is not None
|
||||
p = self.hparams_vision["model_patch_size"]
|
||||
i = torch.arange(p * p * 3)
|
||||
ch = i // (p * p)
|
||||
row = (i % (p * p)) // p
|
||||
col = i % p
|
||||
# perm[i] = HWC index for CHW position i
|
||||
perm = row * p * 3 + col * 3 + ch
|
||||
data_torch = data_torch[perm]
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
@@ -189,7 +189,8 @@ class HunYuanModel(TextModel):
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
# HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
|
||||
# Some HunYuanVL variants (e.g. OCR-style configs) have pad_token_id=-1;
|
||||
# guard SpecialVocab so it doesn't try to emit an invalid pad id.
|
||||
token_types = None
|
||||
if (self.hparams.get("pad_token_id") or 0) < 0:
|
||||
token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
|
||||
@@ -250,7 +251,8 @@ class HunYuanModel(TextModel):
|
||||
self._fix_special_tokens()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
|
||||
# Some HunYuanVL variants set num_experts=1 (not real MoE);
|
||||
# prevent the parent class from emitting expert_count metadata in that case.
|
||||
saved_num_experts = self.hparams.pop("num_experts", None)
|
||||
super().set_gguf_parameters()
|
||||
if saved_num_experts is not None and saved_num_experts > 1:
|
||||
@@ -288,51 +290,21 @@ class HunYuanModel(TextModel):
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLVisionModel(MmprojModel):
|
||||
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
|
||||
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
|
||||
# Each variant maps to a different projector type in clip.cpp so image
|
||||
# preprocessing follows the correct code path.
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
# HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
|
||||
# HunyuanVL uses max_image_size instead of image_size
|
||||
if "image_size" not in self.hparams_vision:
|
||||
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
|
||||
|
||||
@staticmethod
|
||||
def is_ocr_variant(hparams: dict) -> bool:
|
||||
"""Return True for HunyuanOCR, False for HunyuanVL.
|
||||
|
||||
The projector's output dim must equal the text model's hidden_size by
|
||||
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
|
||||
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
|
||||
ViT -> LLM projection dim is a hard architectural signature, not a
|
||||
magic number.
|
||||
"""
|
||||
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
|
||||
return vision_out == 1024
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
assert self.hparams_vision is not None
|
||||
vcfg = self.hparams_vision
|
||||
|
||||
if self.is_ocr_variant(self.global_config):
|
||||
# --- HunyuanOCR ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
return
|
||||
|
||||
# --- HunyuanVL ---
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
|
||||
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
|
||||
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
|
||||
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
|
||||
|
||||
@@ -353,7 +325,7 @@ class HunyuanVLVisionModel(MmprojModel):
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
|
||||
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
# HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
|
||||
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
@@ -361,40 +333,18 @@ class HunyuanVLVisionModel(MmprojModel):
|
||||
|
||||
@ModelBase.register("HunYuanVLForConditionalGeneration")
|
||||
class HunyuanVLTextModel(HunYuanModel):
|
||||
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
|
||||
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
|
||||
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
|
||||
# the config and pick the matching GGUF architecture.
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
|
||||
@staticmethod
|
||||
def _is_ocr_config(hparams: dict) -> bool:
|
||||
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
|
||||
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
|
||||
# HunyuanVLVisionModel.is_ocr_variant.
|
||||
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
|
||||
|
||||
def __init__(self, dir_model: Path, *args, **kwargs):
|
||||
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
|
||||
if self._is_ocr_config(raw_hparams):
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
|
||||
else:
|
||||
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
|
||||
super().__init__(dir_model, *args, **kwargs)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
|
||||
# the HunYuan-Dense arch which already handles standard rope in super().
|
||||
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
|
||||
return
|
||||
|
||||
# XD-RoPE metadata for the HunyuanVL;
|
||||
if self.rope_parameters.get("rope_type") != "xdrope":
|
||||
return
|
||||
|
||||
# defaults for HunyuanVL. The C++ side later computes:
|
||||
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
|
||||
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
|
||||
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
@@ -51,6 +51,15 @@ class LlamaModel(TextModel):
|
||||
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
|
||||
self._set_vocab_mistral()
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if (add_prefix_space := tokenizer_config_json.get("add_prefix_space")) is not None:
|
||||
self.gguf_writer.add_add_space_prefix(add_prefix_space)
|
||||
if tokenizer_config_json.get("tokenizer_class") == "HybridDNATokenizer":
|
||||
return self._set_vocab_hybriddna()
|
||||
|
||||
try:
|
||||
self._set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
@@ -72,13 +81,6 @@ class LlamaModel(TextModel):
|
||||
special_vocab._set_special_token("eot", 32010)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
61
conversion/mellum.py
Normal file
61
conversion/mellum.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import ModelBase, TextModel, gguf, logger
|
||||
|
||||
|
||||
@ModelBase.register("MellumForCausalLM")
|
||||
class MellumModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.MELLUM
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
|
||||
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
|
||||
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
|
||||
|
||||
use_sliding_window = self.hparams.get("use_sliding_window")
|
||||
sliding_window = self.hparams.get("sliding_window")
|
||||
if (use_sliding_window is True or use_sliding_window is None) and sliding_window is not None:
|
||||
self.gguf_writer.add_sliding_window(sliding_window)
|
||||
logger.info(f"gguf: sliding window = {sliding_window}")
|
||||
self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in self.hparams["layer_types"]])
|
||||
logger.info(f"gguf: sliding window pattern length = {len(self.hparams['layer_types'])}")
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.find("experts") != -1:
|
||||
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, merged_name, bid)
|
||||
return
|
||||
else:
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
@@ -549,6 +548,7 @@ class _Qwen35MtpMixin:
|
||||
tensor_map: gguf.TensorNameMap
|
||||
no_mtp: bool
|
||||
mtp_only: bool
|
||||
_original_block_count: int | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -557,22 +557,44 @@ class _Qwen35MtpMixin:
|
||||
self.block_count += self.hparams.get("mtp_num_hidden_layers", 0)
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
hparams = {**self.hparams, **self.hparams.get("text_config", {})}
|
||||
key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
|
||||
type(self)._original_block_count = hparams.get(key)
|
||||
return super().index_tensors(remote_hf_model_id=remote_hf_model_id) # ty: ignore[unresolved-attribute]
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item):
|
||||
name, _ = item
|
||||
assert cls._original_block_count is not None
|
||||
# TODO: change TextModel to super()
|
||||
if (titem := TextModel.filter_tensors(item)) is None:
|
||||
return None
|
||||
name, gen = titem
|
||||
if name.startswith("model.mtp."):
|
||||
name = name.replace("model.", "", 1)
|
||||
if name.startswith("mtp."):
|
||||
if cls.no_mtp:
|
||||
return None
|
||||
return item
|
||||
if cls.mtp_only:
|
||||
canonical = name.replace("language_model.", "")
|
||||
keep = canonical in (
|
||||
remapper = {
|
||||
"fc": "eh_proj",
|
||||
"pre_fc_norm_embedding": "enorm",
|
||||
"pre_fc_norm_hidden": "hnorm",
|
||||
"norm": "shared_head.norm",
|
||||
}
|
||||
parts = name.split(".", 3)
|
||||
if len(parts) == 4 and parts[1] == "layers" and parts[2].isdecimal():
|
||||
mtp_idx = int(parts[2])
|
||||
name = f"model.layers.{cls._original_block_count + mtp_idx}.{parts[3]}"
|
||||
elif len(parts) == 3 and parts[1] in remapper:
|
||||
name = f"model.layers.{cls._original_block_count}.{remapper[parts[1]]}.{parts[2]}"
|
||||
elif cls.mtp_only:
|
||||
keep = name in (
|
||||
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
|
||||
"embed_tokens.weight", "norm.weight",
|
||||
)
|
||||
if not keep:
|
||||
return None
|
||||
return super().filter_tensors(item) # ty: ignore[unresolved-attribute]
|
||||
return name, gen
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters() # ty: ignore[unresolved-attribute]
|
||||
@@ -594,29 +616,6 @@ class _Qwen35MtpMixin:
|
||||
self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
|
||||
self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("mtp."):
|
||||
n_layer = self.hparams["num_hidden_layers"]
|
||||
if name.find("layers.") != -1:
|
||||
assert bid is not None
|
||||
name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
|
||||
bid = bid + n_layer
|
||||
else:
|
||||
remapper = {
|
||||
"mtp.fc": "model.layers.{bid}.eh_proj",
|
||||
"mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
|
||||
"mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm",
|
||||
"mtp.norm": "model.layers.{bid}.shared_head.norm",
|
||||
}
|
||||
stem = Path(name).stem
|
||||
suffix = Path(name).suffix
|
||||
tmpl = remapper[stem] + suffix
|
||||
for b in range(n_layer, self.block_count):
|
||||
yield from super().modify_tensors(data_torch, tmpl.format(bid=b), b) # ty: ignore[unresolved-attribute]
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid) # ty: ignore[unresolved-attribute]
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
|
||||
class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):
|
||||
|
||||
@@ -15,7 +15,7 @@ from .base import MmprojModel, ModelBase, TextModel, _MISTRAL_COMMON_DATASET_MEA
|
||||
from .qwen import Qwen3Model
|
||||
|
||||
|
||||
@ModelBase.register("StepVLForConditionalGeneration")
|
||||
@ModelBase.register("StepVLForConditionalGeneration", "Step3p7ForConditionalGeneration")
|
||||
class Step3VLVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -95,10 +95,38 @@ class Step3VLTextModel(Qwen3Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3
|
||||
|
||||
|
||||
@ModelBase.register("Step3p5ForCausalLM")
|
||||
@ModelBase.register("Step3p5ForCausalLM", "Step3p7ForConditionalGeneration")
|
||||
class Step35Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.STEP35
|
||||
|
||||
# The --mtp / --no-mtp toggles are ModelBase.mtp_only / no_mtp (set in
|
||||
# convert_hf_to_gguf.py main()). Unlike Qwen3.5, which stores MTP under a
|
||||
# `mtp.*` namespace, Step3.5 appends MTP layers at
|
||||
# `model.layers.{num_hidden_layers + i}`, so we filter them by layer index.
|
||||
# The trunk layer count is captured before indexing so the classmethod
|
||||
# filter_tensors can tell the appended MTP block(s) apart from the trunk.
|
||||
_n_main_layers: int | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# NextN/MTP layers are appended past num_hidden_layers; extend the
|
||||
# tensor map to cover them so the MTP block's tensors get correctly
|
||||
# indexed names. When --no-mtp drops the MTP blocks, fall back to the
|
||||
# base num_hidden_layers so we don't reserve unused slots.
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
|
||||
if n_nextn > 0 and not self.no_mtp:
|
||||
self.block_count += n_nextn
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None):
|
||||
# filter_tensors is a classmethod and can't reach self.hparams; stash
|
||||
# the trunk layer count here (before indexing runs) so it can detect
|
||||
# the appended MTP layers by index.
|
||||
hparams = {**self.hparams, **self.hparams.get("text_config", {})}
|
||||
key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
|
||||
type(self)._n_main_layers = hparams.get(key)
|
||||
return super().index_tensors(remote_hf_model_id=remote_hf_model_id)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
rope_theta = self.hparams.get("rope_theta")
|
||||
if isinstance(rope_theta, list):
|
||||
@@ -119,8 +147,25 @@ class Step35Model(TextModel):
|
||||
n_head_swa = attn_other.get("num_attention_heads", n_head_base)
|
||||
n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)
|
||||
|
||||
layer_types = layer_types[: self.block_count]
|
||||
partial_rotary_factors = partial_rotary_factors[: self.block_count]
|
||||
n_nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
|
||||
|
||||
# The Step3p5 HF checkpoint stores layer_types/partial_rotary_factors
|
||||
# entries for the MTP blocks past num_hidden_layers; preserve them so
|
||||
# the MTP layer's attention shape, SWA flag, and partial RoPE dim are
|
||||
# set correctly. Pad with full-attention defaults if the checkpoint
|
||||
# truncated them.
|
||||
def _pad(arr, n, default):
|
||||
arr = list(arr)
|
||||
if len(arr) < n:
|
||||
arr = arr + [default] * (n - len(arr))
|
||||
return arr[:n]
|
||||
|
||||
layer_types = _pad(layer_types, self.block_count, "full_attention")
|
||||
partial_rotary_factors = _pad(
|
||||
partial_rotary_factors,
|
||||
self.block_count,
|
||||
0.5, # full_attention default for Step3p5
|
||||
)
|
||||
assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors
|
||||
head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
|
||||
kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
|
||||
@@ -157,31 +202,61 @@ class Step35Model(TextModel):
|
||||
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))
|
||||
|
||||
# Optional per-layer SwiGLU clamps.
|
||||
# Optional per-layer SwiGLU clamps. MTP layers default to no clamping (0.0).
|
||||
if (limits := self.hparams.get("swiglu_limits")) is not None:
|
||||
limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
|
||||
limits_f = _pad(
|
||||
[0.0 if v is None else float(v) for v in limits],
|
||||
self.block_count,
|
||||
0.0,
|
||||
)
|
||||
self.gguf_writer.add_swiglu_clamp_exp(limits_f)
|
||||
if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
|
||||
limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
|
||||
limits_shared_f = _pad(
|
||||
[0.0 if v is None else float(v) for v in limits_shared],
|
||||
self.block_count,
|
||||
0.0,
|
||||
)
|
||||
self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)
|
||||
|
||||
if n_nextn > 0 and not self.no_mtp:
|
||||
self.gguf_writer.add_nextn_predict_layers(n_nextn)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
if (titem := super().filter_tensors(item)) is None:
|
||||
return None
|
||||
name, gen = titem
|
||||
|
||||
# Map router bias (expert selection bias) to a GGUF bias tensor
|
||||
if name.endswith(".moe.router_bias"):
|
||||
name += ".bias"
|
||||
|
||||
return super().filter_tensors((name, gen))
|
||||
# Step3.5 appends the MTP block(s) past num_hidden_layers.
|
||||
assert cls._n_main_layers is not None
|
||||
is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers
|
||||
|
||||
# --no-mtp: drop the appended MTP block(s) entirely.
|
||||
if is_mtp and cls.no_mtp:
|
||||
return None
|
||||
# --mtp: keep ONLY MTP-block tensors plus the shared embeddings/norm/
|
||||
# lm_head (so the resulting GGUF carries just the draft head).
|
||||
if cls.mtp_only and not is_mtp and name not in (
|
||||
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
|
||||
):
|
||||
return None
|
||||
|
||||
# The checkpoint nests the per-MTP-layer shared head under
|
||||
# `model.layers.{N+i}.transformer.shared_head.{norm,output}.weight`;
|
||||
# strip the `transformer.` infix and rename `output` → `head` so the
|
||||
# existing NEXTN_SHARED_HEAD_{NORM,HEAD} tensor mapping picks them up.
|
||||
# Mirrors vllm's `_rewrite_spec_layer_name` (step3p5_mtp.py).
|
||||
if is_mtp:
|
||||
name = name.replace(".transformer.", ".")
|
||||
name = name.replace("shared_head.output", "shared_head.head")
|
||||
|
||||
return name, gen
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
# remove mtp layers
|
||||
if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
|
||||
il = int(m.group(1))
|
||||
n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
|
||||
if il >= n_main:
|
||||
return
|
||||
if name.endswith("norm.weight"):
|
||||
data_torch += 1.0
|
||||
|
||||
@@ -190,6 +265,21 @@ class Step35Model(TextModel):
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def prepare_metadata(self, vocab_only: bool):
|
||||
from_dir = self.fname_out.is_dir()
|
||||
super().prepare_metadata(vocab_only=vocab_only)
|
||||
|
||||
# Mirror Qwen3.5's behavior: when emitting a draft-only file into a
|
||||
# directory, prefix with "mtp-" so it doesn't collide with the trunk.
|
||||
if not self.mtp_only or not from_dir:
|
||||
return
|
||||
|
||||
output_type: str = self.ftype.name.partition("_")[2]
|
||||
fname_default: str = gguf.naming_convention(
|
||||
self.metadata.name, self.metadata.basename, self.metadata.finetune,
|
||||
self.metadata.version, size_label=None, output_type=output_type, model_type=None)
|
||||
self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
# Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3").
|
||||
# llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS).
|
||||
@@ -203,11 +293,23 @@ class Step35Model(TextModel):
|
||||
if isinstance(rope_theta, list):
|
||||
rope_theta = rope_theta[0]
|
||||
base = float(rope_theta)
|
||||
if (dim := self.hparams.get("head_dim")) is None:
|
||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
dim = int(dim)
|
||||
|
||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||
if (storage_dim := self.hparams.get("head_dim")) is None:
|
||||
storage_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
storage_dim = int(storage_dim)
|
||||
|
||||
# Llama 3 factors apply only to the rotary dims used by full_attention layers
|
||||
# (partial_rotary_factor * head_dim). Remaining slots are padded with 1.0 so
|
||||
# sliding_attention layers remain unaffected. set_gguf_parameters already
|
||||
# guarantees at least one full_attention layer.
|
||||
layer_types = (self.hparams.get("layer_types") or [])[: self.block_count]
|
||||
partial_rotary_factors = (self.hparams.get("partial_rotary_factors") or [])[: self.block_count]
|
||||
full_attention_factor = next(
|
||||
float(f) for lt, f in zip(layer_types, partial_rotary_factors) if lt == "full_attention"
|
||||
)
|
||||
rotary_dim = int(storage_dim * full_attention_factor)
|
||||
|
||||
freqs = 1.0 / (base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
|
||||
|
||||
factor = float(rope_params.get("factor", 8.0))
|
||||
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
|
||||
@@ -228,4 +330,8 @@ class Step35Model(TextModel):
|
||||
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
||||
rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
|
||||
|
||||
# Pad to head_dim/2 with 1.0 so non-scaled layers remain neutral.
|
||||
if len(rope_factors) < storage_dim // 2:
|
||||
rope_factors.extend([1.0] * (storage_dim // 2 - len(rope_factors)))
|
||||
|
||||
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
|
||||
|
||||
53
conversion/talkie.py
Normal file
53
conversion/talkie.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import LazyTorchTensor, ModelBase, TextModel, gguf
|
||||
|
||||
|
||||
@ModelBase.register("TalkieForCausalLM")
|
||||
class TalkieModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.TALKIE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
# Talkie used F.rms_norm without an explicit eps
|
||||
self.gguf_writer.add_layer_norm_rms_eps(torch.finfo(torch.float32).eps)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
prefix = f"model.blocks.{bid}." if bid is not None else ""
|
||||
suffix = name.removeprefix(prefix)
|
||||
|
||||
if suffix == "attn_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "mlp_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "lm_head_gain.w_g":
|
||||
self.gguf_writer.add_logit_scale(LazyTorchTensor.to_eager(data_torch).item())
|
||||
return
|
||||
elif suffix in ("attn.attn_query.weight", "attn.attn_key.weight"):
|
||||
# absorb inverse rope
|
||||
head_dim = self.hparams["head_dim"]
|
||||
shape = data_torch.shape
|
||||
data_torch = torch.reshape(data_torch, (-1, head_dim, shape[-1]))
|
||||
signs = torch.ones((1, head_dim, 1), dtype=data_torch.dtype)
|
||||
signs[:, head_dim // 2 :, :] = -1
|
||||
if self.lazy:
|
||||
signs = LazyTorchTensor.from_eager(signs)
|
||||
# (n_head, head_dim, n_in) -> (n_out, n_in)
|
||||
data_torch = torch.reshape(data_torch * signs, shape)
|
||||
elif suffix == "attn.head_gain.head_g":
|
||||
# allow head gain to broadcast
|
||||
data_torch = data_torch.unsqueeze(-1)
|
||||
|
||||
if not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user