mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-05-19 16:05:58 +02:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d14ce3dab4 | ||
|
|
6db130445d | ||
|
|
4b262ab662 | ||
|
|
00c461ce1a | ||
|
|
ccee426426 | ||
|
|
3c81c8deea | ||
|
|
cd963fee6a | ||
|
|
d2e179a477 | ||
|
|
c85a242ed0 | ||
|
|
aabee047d8 | ||
|
|
f1c1c5c057 | ||
|
|
439f1b193d | ||
|
|
c3e9ade6dd | ||
|
|
9a532ae4ba | ||
|
|
b7340443d4 | ||
|
|
5cbaa5e69e |
@@ -5,6 +5,9 @@
|
||||
# Define the CANN base image for easier version updates later
|
||||
ARG CHIP_TYPE=910b
|
||||
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
# ==============================================================================
|
||||
# BUILD STAGE
|
||||
@@ -67,6 +70,19 @@ RUN mkdir -p /app/full && \
|
||||
# ==============================================================================
|
||||
FROM ${CANN_BASE_IMAGE} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
# -- Install runtime dependencies --
|
||||
RUN yum install -y libgomp curl && \
|
||||
yum clean all && \
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
@@ -35,6 +38,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
||||
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# CUDA architecture to build for (defaults to all supported archs)
|
||||
@@ -40,6 +44,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
## Build Image
|
||||
|
||||
@@ -40,6 +43,19 @@ RUN mkdir -p /app/full \
|
||||
|
||||
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
ARG IGC_VERSION=v2.20.5
|
||||
ARG IGC_VERSION_FULL=2_2.20.5+19972
|
||||
ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
@@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \
|
||||
|
||||
# TODO: use image with NNRT
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS runtime
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
@@ -6,6 +6,10 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V
|
||||
|
||||
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
# MUSA architecture to build for (defaults to all supported archs)
|
||||
@@ -45,6 +49,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ${BASE_MUSA_RUN_CONTAINER} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
|
||||
ARG http_proxy=
|
||||
ARG https_proxy=
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
## Build Image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS build
|
||||
|
||||
@@ -88,6 +92,18 @@ FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||
# Pass proxy args to runtime stage
|
||||
ARG http_proxy
|
||||
ARG https_proxy
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
|
||||
|
||||
@@ -7,6 +7,10 @@ ARG AMDGPU_VERSION=7.2.1
|
||||
# Target the ROCm build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
### Build image
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
@@ -57,6 +61,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl \
|
||||
&& apt autoremove -y \
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
ARG GCC_VERSION=15.2.0
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
### Build Llama.cpp stage
|
||||
FROM gcc:${GCC_VERSION} AS build
|
||||
@@ -52,6 +55,19 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
|
||||
### Base image
|
||||
FROM ubuntu:${UBUNTU_VERSION} AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
|
||||
apt update -y && \
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
ARG UBUNTU_VERSION=26.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
@@ -31,6 +34,19 @@ RUN mkdir -p /app/full \
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
|
||||
libglvnd0 libgl1 libglx0 libegl1 libgles2 \
|
||||
|
||||
83
.github/workflows/docker.yml
vendored
83
.github/workflows/docker.yml
vendored
@@ -11,6 +11,11 @@ name: Publish Docker image
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
skip_s390x:
|
||||
description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)"
|
||||
type: boolean
|
||||
default: false
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because it is expensive
|
||||
- cron: '12 4 * * *'
|
||||
@@ -64,6 +69,8 @@ jobs:
|
||||
- name: Generate build and merge matrices
|
||||
id: matrices
|
||||
shell: bash
|
||||
env:
|
||||
SKIP_S390X: ${{ inputs.skip_s390x || 'false' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
@@ -86,6 +93,11 @@ jobs:
|
||||
]
|
||||
JSON
|
||||
|
||||
if [ "${SKIP_S390X}" = "true" ]; then
|
||||
jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp
|
||||
mv build-matrix.json.tmp build-matrix.json
|
||||
fi
|
||||
|
||||
BUILD_MATRIX="$(jq -c . build-matrix.json)"
|
||||
MERGE_MATRIX="$(jq -c '
|
||||
reduce .[] as $entry ({}; .[$entry.tag] |= (
|
||||
@@ -132,6 +144,7 @@ jobs:
|
||||
config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
@@ -187,6 +200,10 @@ jobs:
|
||||
env:
|
||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||
|
||||
- name: Get build date
|
||||
id: build_date
|
||||
run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Free Disk Space (Ubuntu)
|
||||
if: ${{ matrix.config.free_disk_space == true }}
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
@@ -211,13 +228,26 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: full
|
||||
provenance: false
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.build_date.outputs.date }}
|
||||
APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
|
||||
APP_REVISION=${{ steps.checkout.outputs.commit }}
|
||||
IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
|
||||
IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
annotations: |
|
||||
manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
|
||||
manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
|
||||
manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
|
||||
manifest:org.opencontainers.image.title=llama.cpp
|
||||
manifest:org.opencontainers.image.description=LLM inference in C/C++
|
||||
manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
|
||||
manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -235,13 +265,26 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: light
|
||||
provenance: false
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.build_date.outputs.date }}
|
||||
APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
|
||||
APP_REVISION=${{ steps.checkout.outputs.commit }}
|
||||
IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
|
||||
IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
annotations: |
|
||||
manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
|
||||
manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
|
||||
manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
|
||||
manifest:org.opencontainers.image.title=llama.cpp
|
||||
manifest:org.opencontainers.image.description=LLM inference in C/C++
|
||||
manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
|
||||
manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -259,13 +302,26 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
|
||||
outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
target: server
|
||||
provenance: false
|
||||
build-args: |
|
||||
BUILD_DATE=${{ steps.build_date.outputs.date }}
|
||||
APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
|
||||
APP_REVISION=${{ steps.checkout.outputs.commit }}
|
||||
IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
|
||||
IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
|
||||
${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
|
||||
${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
|
||||
annotations: |
|
||||
manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
|
||||
manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
|
||||
manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
|
||||
manifest:org.opencontainers.image.title=llama.cpp
|
||||
manifest:org.opencontainers.image.description=LLM inference in C/C++
|
||||
manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
|
||||
manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
|
||||
# using github experimental cache
|
||||
#cache-from: type=gha
|
||||
#cache-to: type=gha,mode=max
|
||||
@@ -330,10 +386,15 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get build date
|
||||
id: build_date
|
||||
run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Download digest metadata
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
|
||||
with:
|
||||
@@ -361,6 +422,8 @@ jobs:
|
||||
IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
|
||||
PREFIX="${IMAGE_REPO}:"
|
||||
SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
|
||||
BUILD_DATE="${{ steps.build_date.outputs.date }}"
|
||||
COMMIT_SHA="${{ steps.checkout.outputs.commit }}"
|
||||
TAGS="${{ matrix.config.tag }}"
|
||||
ARCHES="${{ matrix.config.arches }}"
|
||||
DIGEST_GLOB="/tmp/digests/*.tsv"
|
||||
@@ -412,11 +475,21 @@ jobs:
|
||||
refs+=("${IMAGE_REPO}@${digest}")
|
||||
done
|
||||
|
||||
local annotations=(
|
||||
--annotation "index:org.opencontainers.image.created=${BUILD_DATE}"
|
||||
--annotation "index:org.opencontainers.image.version=${SRC_TAG}"
|
||||
--annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}"
|
||||
--annotation "index:org.opencontainers.image.title=llama.cpp"
|
||||
--annotation "index:org.opencontainers.image.description=LLM inference in C/C++"
|
||||
--annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}"
|
||||
--annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}"
|
||||
)
|
||||
|
||||
echo "Creating ${merged_tag} from ${refs[*]}"
|
||||
docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
|
||||
docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}"
|
||||
|
||||
echo "Creating ${merged_versioned_tag} from ${refs[*]}"
|
||||
docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
|
||||
docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}"
|
||||
}
|
||||
|
||||
for tag in $TAGS; do
|
||||
|
||||
27
.github/workflows/server-self-hosted.yml
vendored
27
.github/workflows/server-self-hosted.yml
vendored
@@ -152,6 +152,33 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
libssl-dev \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
|
||||
4
.github/workflows/ui-ci.yml
vendored
4
.github/workflows/ui-ci.yml
vendored
@@ -41,7 +41,7 @@ jobs:
|
||||
ui-checks:
|
||||
name: UI Checks
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -93,7 +93,7 @@ jobs:
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
|
||||
@@ -22,6 +22,8 @@ Pull requests (PRs):
|
||||
Commits:
|
||||
- On every commit that you make, include an "Assisted-by: llama.cpp:local pi" tag
|
||||
- Do not explicitly set the git author in commits - rely on the default git config
|
||||
- Always use `--no-gpg-sign` when committing
|
||||
- Never `git push` without explicit confirmation from the user
|
||||
|
||||
Resources (read on demand):
|
||||
- [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
|
||||
@@ -280,7 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [Metal](docs/build.md#metal-build) | Apple Silicon |
|
||||
| [BLAS](docs/build.md#blas-build) | All |
|
||||
| [BLIS](docs/backend/BLIS.md) | All |
|
||||
| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
|
||||
| [SYCL](docs/backend/SYCL.md) | Intel GPU |
|
||||
| [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
|
||||
| [MUSA](docs/build.md#musa) | Moore Threads GPU |
|
||||
| [CUDA](docs/build.md#cuda) | Nvidia GPU |
|
||||
|
||||
@@ -536,7 +536,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
const bool skip = (arg == "--spec-type");
|
||||
|
||||
if (!skip) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
}
|
||||
auto & tmp = arg_to_options[arg];
|
||||
auto opt = *tmp.first;
|
||||
@@ -893,7 +897,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||
}
|
||||
if (!seen_args.insert(arg).second) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
const bool skip = (arg == "--spec-type");
|
||||
|
||||
if (!skip) {
|
||||
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
||||
}
|
||||
}
|
||||
auto opt = *arg_to_options[arg];
|
||||
std::string val;
|
||||
@@ -4117,6 +4125,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.speculative.ngram_mod.n_match = 24;
|
||||
params.speculative.ngram_mod.n_min = 48;
|
||||
params.speculative.ngram_mod.n_max = 64;
|
||||
|
||||
// TODO: not sure if this is a good config - explore more settings and potentially enable it
|
||||
//params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
|
||||
//params.speculative.ngram_map_k4v.size_n = 8;
|
||||
//params.speculative.ngram_map_k4v.size_m = 24;
|
||||
//params.speculative.ngram_map_k4v.min_hits = 2;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
|
||||
|
||||
@@ -1160,7 +1160,7 @@ struct common_init_result::impl {
|
||||
std::vector<llama_sampler_seq_config> samplers_seq_config;
|
||||
};
|
||||
|
||||
common_init_result::common_init_result(common_params & params) :
|
||||
common_init_result::common_init_result(common_params & params, bool model_only) :
|
||||
pimpl(new impl{}) {
|
||||
auto mparams = common_model_params_to_llama(params);
|
||||
auto cparams = common_context_params_to_llama(params);
|
||||
@@ -1183,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) :
|
||||
|
||||
pimpl->model.reset(model);
|
||||
|
||||
if (model_only) {
|
||||
return;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
// load and optionally apply lora adapters
|
||||
@@ -1252,29 +1256,6 @@ common_init_result::common_init_result(common_params & params) :
|
||||
cparams.n_samplers = pimpl->samplers_seq_config.size();
|
||||
}
|
||||
|
||||
// [TAG_RS_STATE_ROLLBACK_SUPPORT]
|
||||
// TODO: ngram speculative methods require checkpointing in addition to partial RS rollback
|
||||
// currently this is not supported. so we disable the partial rollback
|
||||
if (cparams.n_rs_seq > 0 && (llama_model_is_recurrent(model) || llama_model_is_hybrid(model))) {
|
||||
auto & types = params.speculative.types;
|
||||
|
||||
for (int i = 0; i < (int) types.size(); i++) {
|
||||
if (types[i] == COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
continue;
|
||||
}
|
||||
if (types[i] == COMMON_SPECULATIVE_TYPE_DRAFT_MTP) {
|
||||
continue;
|
||||
}
|
||||
|
||||
cparams.n_rs_seq = 0;
|
||||
|
||||
LOG_WRN("%s: recurrent state rollback is not compatible with '%s' - disabling rollback support\n", __func__,
|
||||
common_speculative_type_to_str(types[i]).c_str());
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||
@@ -1309,8 +1290,8 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
|
||||
return pimpl->lora;
|
||||
}
|
||||
|
||||
common_init_result_ptr common_init_from_params(common_params & params) {
|
||||
common_init_result_ptr res(new common_init_result(params));
|
||||
common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
|
||||
common_init_result_ptr res(new common_init_result(params, model_only));
|
||||
|
||||
llama_model * model = res->model();
|
||||
if (model == NULL) {
|
||||
@@ -1318,6 +1299,10 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
||||
return res;
|
||||
}
|
||||
|
||||
if (model_only) {
|
||||
return res;
|
||||
}
|
||||
|
||||
llama_context * lctx = res->context();
|
||||
if (lctx == NULL) {
|
||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
||||
|
||||
@@ -299,11 +299,11 @@ struct common_params_model {
|
||||
|
||||
// draft-model-based speculative decoding parameters
|
||||
struct common_params_speculative_draft {
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.0f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
common_params_model mparams;
|
||||
|
||||
@@ -857,7 +857,7 @@ struct common_sampler;
|
||||
|
||||
// note: defines the model, context, samplers, etc. lifetimes
|
||||
struct common_init_result {
|
||||
common_init_result(common_params & params);
|
||||
common_init_result(common_params & params, bool model_only = false);
|
||||
~common_init_result();
|
||||
|
||||
llama_model * model();
|
||||
@@ -875,7 +875,7 @@ private:
|
||||
|
||||
using common_init_result_ptr = std::unique_ptr<common_init_result>;
|
||||
|
||||
common_init_result_ptr common_init_from_params(common_params & params);
|
||||
common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
|
||||
@@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map,
|
||||
draft.push_back(inp[match_pos + n + i]);
|
||||
}
|
||||
|
||||
LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
|
||||
LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
|
||||
key_offset, slot_max,
|
||||
curr_key.key_num, draft.size());
|
||||
|
||||
|
||||
@@ -32,6 +32,19 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
|
||||
{"ngram-cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
|
||||
};
|
||||
|
||||
// Build a human-readable, comma-separated list of backend device names.
//
// devices: the backend devices to describe.
// returns: the literal "default" when the list is empty; otherwise each
//          device's ggml_backend_dev_name() joined with ", " in list order.
static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) {
|
||||
if (devices.empty()) {
|
||||
return "default";
|
||||
}
|
||||
|
||||
std::string result;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (i > 0) result += ", ";
|
||||
result += ggml_backend_dev_name(devices[i]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
struct common_speculative_config {
|
||||
common_speculative_type type;
|
||||
common_params_speculative params;
|
||||
@@ -144,7 +157,7 @@ struct common_speculative_impl {
|
||||
|
||||
virtual void draft(common_speculative_draft_params_vec & dparams) = 0;
|
||||
|
||||
virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0;
|
||||
virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
|
||||
|
||||
// true if this implementation requires the target context to extract post-norm embeddings
|
||||
virtual bool need_embd() const = 0;
|
||||
@@ -167,6 +180,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
auto * ctx_tgt = this->params.ctx_tgt;
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min);
|
||||
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
|
||||
this->params.n_gpu_layers,
|
||||
ggml_type_name(this->params.cache_type_k),
|
||||
ggml_type_name(this->params.cache_type_v),
|
||||
ctx_tgt ? "yes" : "no",
|
||||
ctx_dft ? "yes" : "no",
|
||||
common_speculative_get_devices_str(this->params.devices).c_str());
|
||||
|
||||
batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||
|
||||
// TODO: optimize or pass from outside?
|
||||
@@ -343,7 +366,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -355,8 +378,12 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
//common_params_speculative_eagle3 params;
|
||||
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {}
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
@@ -371,7 +398,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
// TODO: implement
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -380,7 +407,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft)
|
||||
|
||||
llama_batch batch;
|
||||
@@ -407,7 +434,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
// pre-advancement before process() mirrored the verify batch.
|
||||
std::vector<uint16_t> last_n_drafted;
|
||||
|
||||
common_speculative_state_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
|
||||
common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq)
|
||||
, params(params.draft)
|
||||
{
|
||||
@@ -417,6 +444,16 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
|
||||
n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd);
|
||||
LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
|
||||
this->params.n_gpu_layers,
|
||||
ggml_type_name(this->params.cache_type_k),
|
||||
ggml_type_name(this->params.cache_type_v),
|
||||
ctx_tgt ? "yes" : "no",
|
||||
ctx_dft ? "yes" : "no",
|
||||
common_speculative_get_devices_str(this->params.devices).c_str());
|
||||
|
||||
const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
|
||||
batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1);
|
||||
// llama_batch_init allocates only one of token/embd; MTP needs both.
|
||||
@@ -427,7 +464,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
for (auto & s : smpls) {
|
||||
common_params_sampling sparams;
|
||||
sparams.no_perf = false;
|
||||
sparams.top_k = 1; // TODO: re-enable top_k == 10 and utilize `p_min` spec param
|
||||
sparams.top_k = 10;
|
||||
sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
|
||||
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
|
||||
}
|
||||
@@ -446,7 +483,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
last_n_drafted.assign(n_seq, 0);
|
||||
}
|
||||
|
||||
~common_speculative_state_draft_mtp() override {
|
||||
~common_speculative_impl_draft_mtp() override {
|
||||
if (batch.token != nullptr) {
|
||||
free(batch.token);
|
||||
batch.token = nullptr;
|
||||
@@ -462,7 +499,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
|
||||
if (pos_max < N - 1) {
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d — "
|
||||
LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - "
|
||||
"process() hook may not have run on every prefill ubatch "
|
||||
"(need_embd / logits=1 on every prompt position?). "
|
||||
"Drafts may degrade.\n",
|
||||
@@ -633,6 +670,14 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
// add drafted token for each sequence
|
||||
const llama_token id = cur_p->data[0].id;
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
if (cur_p->data[0].p < params.p_min) {
|
||||
drafting[seq_id] = false;
|
||||
n_drafting--;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
auto & dp = dparams.at(seq_id);
|
||||
@@ -678,7 +723,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
return;
|
||||
}
|
||||
@@ -714,7 +759,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
|
||||
common_ngram_simple_config config)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq)
|
||||
, params(params.ngram_simple)
|
||||
, config(config) {}
|
||||
, config(config)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__);
|
||||
LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__,
|
||||
this->params.size_n, this->params.size_m, this->params.min_hits);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
@@ -738,7 +788,7 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -748,20 +798,21 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
|
||||
};
|
||||
|
||||
struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
|
||||
common_params_speculative_ngram_map params;
|
||||
|
||||
// n_seq configs
|
||||
std::vector<common_ngram_map> config;
|
||||
|
||||
common_speculative_impl_ngram_map_k(
|
||||
const common_params_speculative & params,
|
||||
const common_ngram_map & config,
|
||||
uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq)
|
||||
, params(params.ngram_map_k) {
|
||||
{
|
||||
for (uint32_t i = 0; i < n_seq; i++) {
|
||||
this->config.push_back(config);
|
||||
}
|
||||
|
||||
LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str());
|
||||
LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__,
|
||||
config.size_key, config.size_value, config.key_only, config.min_hits);
|
||||
}
|
||||
|
||||
void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
|
||||
@@ -788,9 +839,13 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override {
|
||||
GGML_ASSERT((seq_id < (llama_seq_id) config.size()));
|
||||
|
||||
if (is_other) {
|
||||
return;
|
||||
}
|
||||
|
||||
common_ngram_map_accept(config[seq_id], n_accepted);
|
||||
}
|
||||
|
||||
@@ -812,7 +867,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
// the last position in the prompt that was added to the ngram container
|
||||
size_t i_last = 0;
|
||||
|
||||
// length of the last drafted n‑gram (number of tokens returned by draft)
|
||||
// length of the last drafted n-gram (number of tokens returned by draft)
|
||||
size_t n_draft_last = 0;
|
||||
|
||||
// consecutive accept rounds with low acceptance fraction (< 0.5)
|
||||
@@ -830,8 +885,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
, verbose(std::getenv("LLAMA_TRACE") != nullptr) {
|
||||
static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
|
||||
|
||||
LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__,
|
||||
this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024);
|
||||
LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__);
|
||||
LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__,
|
||||
this->params.n_match, this->params.n_max, this->params.n_min);
|
||||
LOG_INF("%s: - mod size=%zu (%.3f MB)\n", __func__,
|
||||
mod.size(), (float)(mod.size_bytes())/1024/1024);
|
||||
|
||||
if (this->params.n_match < 16) {
|
||||
LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
|
||||
@@ -921,7 +979,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
}
|
||||
result.resize(result.size() - n);
|
||||
|
||||
// store length of drafted n‑gram for later acceptance analysis
|
||||
// store length of drafted n-gram for later acceptance analysis
|
||||
sinfo.n_draft_last = result.size();
|
||||
}
|
||||
|
||||
@@ -943,17 +1001,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
|
||||
void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override {
|
||||
if (is_other) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto & sinfo = sinfos[seq_id];
|
||||
|
||||
// compute acceptance fraction if we have a recorded draft length
|
||||
if (sinfo.n_draft_last > 0) {
|
||||
const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last;
|
||||
if (f_acc < 0.5) {
|
||||
if (f_acc < 0.25) {
|
||||
sinfo.n_low++;
|
||||
if (sinfo.n_low >= 3) {
|
||||
if (sinfo.n_low >= 5) {
|
||||
if (verbose) {
|
||||
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, sinfo.n_low);
|
||||
LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low);
|
||||
}
|
||||
|
||||
mod.reset();
|
||||
@@ -1003,6 +1065,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
|
||||
, save_dynamic(save_dynamic)
|
||||
, save_static(save_static)
|
||||
{
|
||||
LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__);
|
||||
LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__,
|
||||
n_draft,
|
||||
path_static.empty() ? "none" : path_static.c_str(),
|
||||
path_dynamic.empty() ? "none" : path_dynamic.c_str());
|
||||
|
||||
sinfos.resize(n_seq);
|
||||
|
||||
if (!path_static.empty()) {
|
||||
@@ -1099,7 +1167,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
|
||||
void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
|
||||
// noop
|
||||
}
|
||||
|
||||
@@ -1285,7 +1353,6 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
std::vector<std::unique_ptr<common_speculative_impl>> impls = {};
|
||||
|
||||
for (const common_speculative_config & config : configs) {
|
||||
LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str());
|
||||
switch (config.type) {
|
||||
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
break;
|
||||
@@ -1298,7 +1365,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: {
|
||||
impls.push_back(std::make_unique<common_speculative_state_draft_mtp>(config.params, n_seq));
|
||||
impls.push_back(std::make_unique<common_speculative_impl_draft_mtp>(config.params, n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
|
||||
@@ -1319,11 +1386,16 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
impls.push_back(std::move(state));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: {
|
||||
impls.push_back(
|
||||
std::make_unique<common_speculative_impl_ngram_map_k>(
|
||||
get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
|
||||
impls.push_back(
|
||||
std::make_unique<common_speculative_impl_ngram_map_k>(
|
||||
config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
|
||||
get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
|
||||
@@ -1515,11 +1587,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
|
||||
|
||||
GGML_ASSERT(impl);
|
||||
|
||||
// TODO: currently only the implementation that generated the draft is used to accept it
|
||||
// however, some implementations (such as MTP) need to also "see" the accepted tokens
|
||||
// extend `common_speculative_impl::accept()` with an extra argument `bool is_other` to
|
||||
// inform the implementation if the accepted tokens are from another implementation and
|
||||
// pass the accepted tokens to all remaining implementations using `is_other == true`
|
||||
{
|
||||
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
|
||||
if (n_accepted > 0) {
|
||||
@@ -1527,9 +1594,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
|
||||
impl->n_acc_tokens += n_accepted;
|
||||
}
|
||||
|
||||
impl->accept(seq_id, n_accepted);
|
||||
impl->accept(seq_id, n_accepted, false);
|
||||
impl->n_call_accept++;
|
||||
}
|
||||
|
||||
// accept with the rest of the implementations, using is_other == true
|
||||
for (auto & impl_other : spec->impls) {
|
||||
if (impl_other.get() != impl) {
|
||||
impl_other->accept(seq_id, n_accepted, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void common_speculative_print_stats(const common_speculative * spec) {
|
||||
@@ -1549,7 +1623,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
|
||||
str_perf = "";
|
||||
}
|
||||
|
||||
LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
|
||||
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
|
||||
common_speculative_type_to_str(impl->type).c_str(),
|
||||
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
|
||||
impl->n_gen_drafts,
|
||||
|
||||
@@ -445,6 +445,11 @@ if __name__ == '__main__':
|
||||
if self.lazy:
|
||||
tensor = LazyTorchTensor.from_eager(tensor)
|
||||
base_name = get_base_tensor_name(name)
|
||||
# filter base name, ignore tensor transformations for now
|
||||
data_gen = lambda g=tensor: g # noqa: E731
|
||||
if (titem := self.filter_tensors((base_name, data_gen))) is None:
|
||||
continue
|
||||
base_name, _ = titem
|
||||
# note: mergekit-extract-lora also adds token embeddings to the adapter
|
||||
is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name
|
||||
is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
- [News](#news)
|
||||
- [OS](#os)
|
||||
- [Hardware](#hardware)
|
||||
- [Performance Reference](#performance-reference)
|
||||
- [Docker](#docker)
|
||||
- [Linux](#linux)
|
||||
- [Windows](#windows)
|
||||
@@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on
|
||||
|
||||
## News
|
||||
|
||||
- 2026.04
|
||||
|
||||
- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
|
||||
- 2026.04-05
|
||||
- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q6_K, Q8_0.
|
||||
- Fused MoE.
|
||||
- Upgrade CI and the built package to oneAPI 2025.3.3; add a built package for Ubuntu 24.04.
|
||||
|
||||
@@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|
||||
|
||||
NA
|
||||
|
||||
## Performance Reference
|
||||
|
||||
|
||||
To get the supported LLMs, GPUs, and performance reference, please check [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313).
|
||||
|
||||
You can add your own test results there directly.
|
||||
|
||||
## Docker
|
||||
|
||||
The docker build option is currently limited to *Intel GPU* targets.
|
||||
|
||||
@@ -108,11 +108,12 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
### General Speculative Parameters
|
||||
|
||||
```
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
type of speculative decoding to use when no draft model is provided
|
||||
--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
comma-separated list of types of speculative decoding to use
|
||||
(default: none)
|
||||
(env: LLAMA_ARG_SPEC_TYPE)
|
||||
--spec-default use default speculative decoding
|
||||
--spec-default use default speculative decoding config
|
||||
(enables ngram-mod)
|
||||
```
|
||||
|
||||
### Draft Model Parameters
|
||||
@@ -123,8 +124,9 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_MODEL)
|
||||
--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]
|
||||
HuggingFace repository for the draft model
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO)
|
||||
--spec-draft-n-max N
|
||||
number of tokens to draft for speculative decoding (default: 16)
|
||||
number of tokens to draft for speculative decoding (default: 3)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
|
||||
--spec-draft-n-min N
|
||||
minimum number of draft tokens to use for speculative decoding (default: 0)
|
||||
@@ -133,18 +135,64 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
speculative decoding split probability (default: 0.10)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
|
||||
--spec-draft-p-min, --draft-p-min P
|
||||
minimum speculative decoding probability (greedy) (default: 0.75)
|
||||
minimum speculative decoding probability (greedy) (default: 0.00)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
|
||||
--spec-draft-ctx-size, -cd, --ctx-size-draft N
|
||||
size of the prompt context for the draft model (default: 0, 0 = loaded from model)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
|
||||
--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N
|
||||
max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
|
||||
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
|
||||
--spec-draft-device, -devd, --device-draft <dev1,dev2,..>
|
||||
comma-separated list of devices to use for offloading the draft model
|
||||
--spec-draft-replace, --spec-replace TARGET DRAFT
|
||||
translate the string in TARGET into DRAFT if the draft model and main model are not compatible
|
||||
(use --list-devices to see available devices)
|
||||
```
|
||||
|
||||
### Draft Model CPU Scheduling Parameters
|
||||
|
||||
```
|
||||
--spec-draft-threads, -td, --threads-draft N
|
||||
number of CPU threads to use during generation
|
||||
--spec-draft-threads-batch, -tbd, --threads-batch-draft N
|
||||
number of threads to use during batch and prompt processing (default: same as --threads-draft)
|
||||
--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M
|
||||
Draft model CPU affinity mask. Complements cpu-range-draft
|
||||
--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi
|
||||
Ranges of CPUs for affinity. Complements --cpu-mask-draft
|
||||
--spec-draft-cpu-strict, --cpu-strict-draft <0|1>
|
||||
Use strict CPU placement for draft model (default: same as --cpu-strict)
|
||||
--spec-draft-prio, --prio-draft N
|
||||
set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime
|
||||
--spec-draft-poll, --poll-draft <0|1>
|
||||
Use polling to wait for draft model work (default: same as --poll)
|
||||
--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M
|
||||
Draft model CPU affinity mask for batch. Complements cpu-range-batch-draft
|
||||
--spec-draft-cpu-range-batch, -Crbd, --cpu-range-batch-draft lo-hi
|
||||
Ranges of CPUs for affinity for batch. Complements --cpu-mask-batch-draft
|
||||
--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0|1>
|
||||
Use strict CPU placement for draft model batch (default: --cpu-strict-draft)
|
||||
--spec-draft-prio-batch, --prio-batch-draft N
|
||||
set draft process/thread priority for batch : 0-normal, 1-medium, 2-high, 3-realtime
|
||||
--spec-draft-poll-batch, --poll-batch-draft <0|1>
|
||||
Use polling to wait for draft model work for batch (default: --poll-draft)
|
||||
```
|
||||
|
||||
### Draft Model KV Cache and Tensor Override Parameters
|
||||
|
||||
```
|
||||
--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE
|
||||
KV cache data type for K for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K)
|
||||
--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE
|
||||
KV cache data type for V for the draft model
|
||||
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V)
|
||||
--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...
|
||||
override tensor buffer type for draft model
|
||||
--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft
|
||||
keep all Mixture of Experts (MoE) weights in the CPU for the draft model
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE)
|
||||
--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N
|
||||
keep the MoE weights of the first N layers in the CPU for the draft model
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE)
|
||||
```
|
||||
|
||||
### n-gram Mod Parameters
|
||||
@@ -193,11 +241,13 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
|
||||
|
||||
### `--spec-type TYPE`
|
||||
|
||||
Specifies a type of speculative decoding without draft model.
|
||||
Specifies a comma-separated list of speculative decoding types to use.
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `draft-simple` | Use a simple draft model for speculation |
|
||||
| `draft-mtp` | Use Multi-Token Prediction (MTP) heads from the main model |
|
||||
| `ngram-cache` | Use n-gram cache lookup |
|
||||
| `ngram-simple` | Use simple n-gram pattern matching |
|
||||
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
|
||||
@@ -209,6 +259,11 @@ Specifies a type of speculative decoding without draft model.
|
||||
./llama-server [...] --spec-type ngram-simple
|
||||
```
|
||||
|
||||
**Example:** Multiple speculative implementations.
|
||||
```bash
|
||||
./llama-server [...] --spec-type ngram-mod,ngram-map-k4v
|
||||
```
|
||||
|
||||
### `--spec-ngram-*-size-n N`
|
||||
|
||||
Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
|
||||
|
||||
@@ -149,6 +149,8 @@ class TaskState:
|
||||
t_gen_ms: Optional[float] = None
|
||||
reasoning_content: Optional[str] = None
|
||||
server_name: Optional[str] = None
|
||||
chunk_idx: int = 0
|
||||
problem_idx: int = 0
|
||||
|
||||
|
||||
class EvalState:
|
||||
@@ -233,7 +235,9 @@ class EvalState:
|
||||
tps_gen: Optional[float] = None,
|
||||
t_gen_ms: Optional[float] = None,
|
||||
reasoning_content: Optional[str] = None,
|
||||
server_name: Optional[str] = None
|
||||
server_name: Optional[str] = None,
|
||||
chunk_idx: int = 0,
|
||||
problem_idx: int = 0,
|
||||
):
|
||||
with self._lock:
|
||||
if "cases" not in self.task_states:
|
||||
@@ -252,7 +256,9 @@ class EvalState:
|
||||
"tps_gen": tps_gen,
|
||||
"t_gen_ms": t_gen_ms,
|
||||
"reasoning_content": reasoning_content,
|
||||
"server_name": server_name
|
||||
"server_name": server_name,
|
||||
"chunk_idx": chunk_idx,
|
||||
"problem_idx": problem_idx,
|
||||
}
|
||||
|
||||
self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False))
|
||||
@@ -289,6 +295,9 @@ class EvalState:
|
||||
all_cases = {}
|
||||
for i, task_id in tasks_to_save:
|
||||
question_text, prompt, expected = self.get_case(i)
|
||||
# Extract chunk_idx from task_id for pending cases
|
||||
_parts = task_id.rsplit("_", 2)
|
||||
_chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
|
||||
if task_id in self.task_states.get("cases", {}):
|
||||
all_cases[task_id] = self.task_states["cases"][task_id]
|
||||
else:
|
||||
@@ -306,7 +315,9 @@ class EvalState:
|
||||
"tps_gen": None,
|
||||
"t_gen_ms": None,
|
||||
"reasoning_content": None,
|
||||
"server_name": None
|
||||
"server_name": None,
|
||||
"chunk_idx": _chunk_idx,
|
||||
"problem_idx": i,
|
||||
}
|
||||
|
||||
ci_lower, ci_upper = self.accuracy_ci()
|
||||
@@ -382,11 +393,12 @@ class EvalState:
|
||||
grader_log_str = self._escape_html(json.dumps(grader_log, indent=2))
|
||||
escaped_server = self._escape_html(server_name)
|
||||
|
||||
answer_class = status_class if status == "ok" else ""
|
||||
rows.append(f"""<tr class="task-row" onclick="toggleDetails('{task_id}')">
|
||||
<td>{task_id}</td>
|
||||
<td class="{status_class}">{status_text}</td>
|
||||
<td>{self._escape_html(expected)}</td>
|
||||
<td>{self._escape_html(answer)}</td>
|
||||
<td class="{answer_class}">{self._escape_html(answer)}</td>
|
||||
<td>{tokens_str}</td>
|
||||
<td>{tps_str}</td>
|
||||
<td>{t_gen_str}</td>
|
||||
@@ -405,6 +417,53 @@ class EvalState:
|
||||
|
||||
rows_html = "\n".join(rows)
|
||||
|
||||
# ---- per-problem summary table ----
|
||||
problem_groups: Dict[int, List[Dict[str, Any]]] = {}
|
||||
for _tid, _case in cases.items():
|
||||
if _case.get("status") != "ok":
|
||||
continue
|
||||
_pidx = _case.get("problem_idx")
|
||||
if _pidx is None:
|
||||
_p_parts = _tid.rsplit("_", 2)
|
||||
_pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0
|
||||
problem_groups.setdefault(_pidx, []).append(_case)
|
||||
|
||||
summary_rows_html = ""
|
||||
if problem_groups:
|
||||
def _stat(v, fmt=".1f", avg_fmt=None):
|
||||
if not v:
|
||||
return ("–", "–", "–")
|
||||
af = fmt if avg_fmt is None else avg_fmt
|
||||
return (f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}")
|
||||
|
||||
summary_data = []
|
||||
for pidx, g in problem_groups.items():
|
||||
runs = len(g)
|
||||
n_ok = sum(1 for c in g if c.get("correct", False))
|
||||
toks = [c["tokens"] for c in g if c.get("tokens") is not None]
|
||||
tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None]
|
||||
tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None]
|
||||
summary_data.append((
|
||||
pidx, runs, n_ok,
|
||||
_stat(toks, "d", ".0f"),
|
||||
_stat(tps),
|
||||
_stat(tg),
|
||||
))
|
||||
|
||||
summary_data.sort(key=lambda r: r[0]) # sort by problem index ascending
|
||||
|
||||
summary_rows_html = "\n".join(
|
||||
f"""<tr class="summary-row">
|
||||
<td>{p:03d}</td>
|
||||
<td>{r}</td>
|
||||
<td>{n}/{r}</td>
|
||||
<td>{tk[0]}</td><td>{tk[1]}</td><td>{tk[2]}</td>
|
||||
<td>{tp[0]}</td><td>{tp[1]}</td><td>{tp[2]}</td>
|
||||
<td>{tg[0]}</td><td>{tg[1]}</td><td>{tg[2]}</td>
|
||||
</tr>"""
|
||||
for p, r, n, tk, tp, tg in summary_data
|
||||
)
|
||||
|
||||
html_content = f"""<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
@@ -412,10 +471,10 @@ class EvalState:
|
||||
<title>{self.dataset_type.upper()} Eval</title>
|
||||
<style>
|
||||
body {{ font-family: system-ui, sans-serif; margin: 0; padding: 16px; background: #fff; color: #222; }}
|
||||
.bar {{ padding: 8px 0; font-size: 14px; color: #555; }}
|
||||
.bar span {{ margin-right: 20px; }}
|
||||
.bar b {{ color: #222; }}
|
||||
table {{ width: 100%; border-collapse: collapse; font-size: 13px; }}
|
||||
.bar {{ padding: 8px 0; font-size: 13px; color: #555; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; display: grid; grid-template-columns: auto 1fr auto 1fr; gap: 2px 12px; align-items: baseline; }}
|
||||
.bar .label {{ color: #888; }}
|
||||
.bar .value {{ color: #222; }}
|
||||
table {{ width: 100%; border-collapse: collapse; font-size: 13px; font-family: 'SF Mono', 'Menlo', 'Consolas', monospace; }}
|
||||
th {{ text-align: left; padding: 6px 8px; border-bottom: 2px solid #ccc; font-weight: 600; }}
|
||||
td {{ padding: 4px 8px; border-bottom: 1px solid #eee; vertical-align: top; }}
|
||||
.task-row {{ cursor: pointer; }}
|
||||
@@ -429,37 +488,88 @@ class EvalState:
|
||||
.details-content {{ padding: 8px 16px; background: #f6f8fa; font-size: 12px; }}
|
||||
.details-content b {{ color: #555; }}
|
||||
.details-content pre {{ background: #fff; border: 1px solid #e1e4e8; padding: 8px; overflow-x: auto; white-space: pre-wrap; word-wrap: break-word; margin: 4px 0 8px; }}
|
||||
.summary-table {{ margin-bottom: 16px; font-size: 13px; width: 100%; }}
|
||||
.summary-row {{ background: #fafbfc; }}
|
||||
.summary-row:hover {{ background: #f5f5f5; }}
|
||||
.summary-table th {{ text-align: right; font-weight: 600; }}
|
||||
.summary-table th:first-child {{ text-align: left; }}
|
||||
.summary-table th[colspan] {{ text-align: center; }}
|
||||
.summary-table td {{ text-align: right; }}
|
||||
.summary-table td:first-child {{ text-align: left; }}
|
||||
.tabs {{ display: flex; border-bottom: 2px solid #ddd; margin: 12px 0 0; }}
|
||||
.tab-btn {{ padding: 6px 16px; border: none; background: none; font-size: 13px; cursor: pointer; color: #555; border-bottom: 2px solid transparent; margin-bottom: -2px; font-weight: 500; }}
|
||||
.tab-btn:hover {{ color: #222; }}
|
||||
.tab-btn.active {{ color: #222; border-bottom-color: #222; font-weight: 600; }}
|
||||
.tab-content {{ display: none; }}
|
||||
.tab-content.active {{ display: block; }}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="bar">
|
||||
<span><b>{self.dataset_type.upper()}</b></span>
|
||||
<span>Model: {self.model_name or 'N/A'}</span>
|
||||
<span>Accuracy: <b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</span>
|
||||
<span>Correct: <span class="correct">{n_correct}</span> / {len(completed)}</span>
|
||||
<span>Pending: {n_pending}</span>
|
||||
<span>Time: {self.total_time:.1f}s</span>
|
||||
<span>Sampling: {sampling_str}</span>
|
||||
<div class="label">Dataset</div><div class="value"><b>{self.dataset_type.upper()}</b></div>
|
||||
<div class="label">Model</div><div class="value"><b>{self.model_name or 'N/A'}</b></div>
|
||||
<div class="label">Accuracy</div><div class="value"><b>{accuracy:.1f}%</b> [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]</div>
|
||||
<div class="label">Correct</div><div class="value"><span class="correct">{n_correct}</span> / {len(completed)}</div>
|
||||
<div class="label">Pending</div><div class="value">{n_pending}</div>
|
||||
<div class="label">Time</div><div class="value">{self.total_time:.1f}s</div>
|
||||
<div class="label">Sampling</div><div class="value">{sampling_str}</div>
|
||||
</div>
|
||||
<div class="tabs">
|
||||
<button class="tab-btn active" data-tab="detailed" onclick="switchTab(this)">Detailed</button>
|
||||
<button class="tab-btn" data-tab="summary" onclick="switchTab(this)">Summary</button>
|
||||
</div>
|
||||
<div id="tab-detailed" class="tab-content active">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th></th>
|
||||
<th>Gold</th>
|
||||
<th>Answer</th>
|
||||
<th>Tokens</th>
|
||||
<th>T/s</th>
|
||||
<th>Gen s</th>
|
||||
<th>Server</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{rows_html}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<div id="tab-summary" class="tab-content">
|
||||
<table class="summary-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Problem</th>
|
||||
<th>Runs</th>
|
||||
<th>Correct</th>
|
||||
<th colspan="3">Tokens</th>
|
||||
<th colspan="3">T/s</th>
|
||||
<th colspan="3">Gen s</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th></th>
|
||||
<th></th>
|
||||
<th></th>
|
||||
<th>min</th><th>avg</th><th>max</th>
|
||||
<th>min</th><th>avg</th><th>max</th>
|
||||
<th>min</th><th>avg</th><th>max</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{summary_rows_html}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th></th>
|
||||
<th>Gold</th>
|
||||
<th>Answer</th>
|
||||
<th>Tokens</th>
|
||||
<th>T/s</th>
|
||||
<th>Gen s</th>
|
||||
<th>Server</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{rows_html}
|
||||
</tbody>
|
||||
</table>
|
||||
<script>
|
||||
function toggleDetails(id) {{ document.getElementById('details-'+id).classList.toggle('open'); }}
|
||||
function switchTab(btn) {{
|
||||
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
|
||||
document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
|
||||
btn.classList.add('active');
|
||||
document.getElementById('tab-'+btn.dataset.tab).classList.add('active');
|
||||
}}
|
||||
</script>
|
||||
</body>
|
||||
</html>"""
|
||||
@@ -1062,12 +1172,19 @@ class Processor:
|
||||
) -> TaskState:
|
||||
question_text, prompt, expected = eval_state.get_case(i)
|
||||
|
||||
# Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}"
|
||||
_parts = task_id.rsplit("_", 2)
|
||||
chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0
|
||||
problem_idx = i
|
||||
|
||||
task_state = TaskState(
|
||||
task_id=task_id,
|
||||
prompt=prompt,
|
||||
expected=expected,
|
||||
question_text=question_text,
|
||||
server_name=server_config.name
|
||||
server_name=server_config.name,
|
||||
chunk_idx=chunk_idx,
|
||||
problem_idx=problem_idx,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -1085,7 +1202,8 @@ class Processor:
|
||||
eval_state.add_result(
|
||||
task_id, prompt, expected, result, None,
|
||||
{"finish_reason": finish_reason}, False, task_state.status,
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
|
||||
chunk_idx, problem_idx,
|
||||
)
|
||||
eval_state.dump()
|
||||
return task_state
|
||||
@@ -1108,7 +1226,8 @@ class Processor:
|
||||
eval_state.add_result(
|
||||
task_id, prompt, expected, result, answer,
|
||||
grader_log, is_correct, "ok",
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name
|
||||
tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name,
|
||||
chunk_idx, problem_idx,
|
||||
)
|
||||
|
||||
eval_state.dump()
|
||||
|
||||
@@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]:
|
||||
return int(match.group(0))
|
||||
|
||||
class AimeDataset:
|
||||
def __init__(self, split: str = "train"):
|
||||
def __init__(self, split: str = "train", dataset_type: str = "aime"):
|
||||
self.split = split
|
||||
self.dataset_type = dataset_type
|
||||
self.questions: List[Dict] = []
|
||||
self._load_dataset()
|
||||
|
||||
def _load_dataset(self):
|
||||
print(f"Loading AIME dataset (split: {self.split})...")
|
||||
def _get_question_text(self, question: Dict) -> str:
|
||||
"""Get question text, handling different dataset field names."""
|
||||
return question.get("problem", question.get("question", ""))
|
||||
|
||||
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
|
||||
if cache_path.exists():
|
||||
print(f"Using cached dataset from {cache_path}")
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
|
||||
def _load_dataset(self):
|
||||
if self.dataset_type == "aime":
|
||||
print(f"Loading AIME dataset (split: {self.split})...")
|
||||
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
|
||||
if cache_path.exists():
|
||||
print(f"Using cached dataset from {cache_path}")
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
|
||||
else:
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
|
||||
elif self.dataset_type == "aime2025":
|
||||
print(f"Loading AIME2025 dataset...")
|
||||
ds_list = []
|
||||
for config_name in ["AIME2025-I", "AIME2025-II"]:
|
||||
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0"
|
||||
if cache_path.exists():
|
||||
print(f"Using cached dataset from {cache_path}")
|
||||
ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path))
|
||||
else:
|
||||
ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test")
|
||||
ds_list.extend(ds)
|
||||
ds = ds_list
|
||||
else:
|
||||
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
|
||||
raise ValueError(f"Unknown dataset type: {self.dataset_type}")
|
||||
|
||||
self.questions = list(ds)
|
||||
print(f"AIME dataset loaded: {len(self.questions)} questions")
|
||||
print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions")
|
||||
|
||||
def find_question(self, request_text: str) -> Optional[Dict]:
|
||||
# Strip common template prefixes to get the actual question text
|
||||
# Templates include things like "Solve the following math problem step by step..."
|
||||
# The actual question usually follows a blank line or after the template instruction
|
||||
cleaned = request_text
|
||||
# Split on double newline and take the part that looks like the problem
|
||||
parts = cleaned.split('\n\n')
|
||||
if len(parts) > 1:
|
||||
# Find the part that's longest (likely the actual problem text)
|
||||
problem_parts = [p for p in parts if len(p.strip()) > 100]
|
||||
if problem_parts:
|
||||
cleaned = max(problem_parts, key=lambda x: len(x))
|
||||
|
||||
best_match = None
|
||||
best_distance = -1
|
||||
best_index = -1
|
||||
|
||||
for i, question in enumerate(self.questions):
|
||||
question_text = question["problem"]
|
||||
request_lower = request_text.lower()
|
||||
question_text = self._get_question_text(question)
|
||||
request_lower = cleaned.lower()
|
||||
question_lower = question_text.lower()
|
||||
|
||||
# Check if question text is contained in the cleaned request
|
||||
if question_lower in request_lower or request_lower in question_lower:
|
||||
debug_log(f"DEBUG: Found substring match at index {i}")
|
||||
return question
|
||||
|
||||
# Exact match
|
||||
if question_lower == request_lower:
|
||||
debug_log(f"DEBUG: Found exact match at index {i}")
|
||||
@@ -118,7 +154,7 @@ class AimeDataset:
|
||||
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
|
||||
return best_match
|
||||
|
||||
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
|
||||
debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...")
|
||||
return None
|
||||
|
||||
def get_answer(self, question: Dict) -> str:
|
||||
@@ -134,15 +170,16 @@ class Simulator:
|
||||
port: int = 8033,
|
||||
host: str = "localhost",
|
||||
success_rate: float = 0.8,
|
||||
dataset_split: str = "train"
|
||||
dataset_split: str = "train",
|
||||
dataset_type: str = "aime"
|
||||
):
|
||||
self.port = port
|
||||
self.host = host
|
||||
self.success_rate = success_rate
|
||||
self.dataset = AimeDataset(dataset_split)
|
||||
self.dataset = AimeDataset(dataset_split, dataset_type)
|
||||
self.eval_state = EvalState(
|
||||
id="aime-2025",
|
||||
tasks=["aime"],
|
||||
id=dataset_type,
|
||||
tasks=[dataset_type],
|
||||
task_states={},
|
||||
sampling_config={"temperature": 0, "max_tokens": 2048}
|
||||
)
|
||||
@@ -159,6 +196,10 @@ class Simulator:
|
||||
else:
|
||||
response_text = self._generate_wrong_answer(question)
|
||||
|
||||
comp_tokens = random.randint(10000, 60000)
|
||||
tps_gen = random.uniform(90.0, 110.0)
|
||||
t_gen_ms = comp_tokens / tps_gen * 1000
|
||||
|
||||
return {
|
||||
"id": f"chatcmpl-{int(time.time())}",
|
||||
"object": "chat.completion",
|
||||
@@ -176,8 +217,12 @@ class Simulator:
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 100,
|
||||
"completion_tokens": 50,
|
||||
"total_tokens": 150
|
||||
"completion_tokens": comp_tokens,
|
||||
"total_tokens": 100 + comp_tokens
|
||||
},
|
||||
"timings": {
|
||||
"predicted_ms": t_gen_ms,
|
||||
"predicted_per_second": tps_gen
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,6 +263,12 @@ class Simulator:
|
||||
return response
|
||||
|
||||
class RequestHandler(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
    """Answer GET requests: list the single model on /v1/models, 404 otherwise."""
    known = self.path == "/v1/models"
    if known:
        payload = {"data": [{"id": "llama", "object": "model"}]}
        status = 200
    else:
        payload = {"error": "Not found"}
        status = 404
    self._send_json(payload, status)
|
||||
|
||||
def do_POST(self):
|
||||
if self.path != "/v1/chat/completions":
|
||||
self._send_json({"error": "Not found"}, 404)
|
||||
@@ -280,6 +331,13 @@ def main():
|
||||
default=0.8,
|
||||
help="Success rate 0-1 (default: 0.8)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default="aime",
|
||||
choices=["aime", "aime2025"],
|
||||
help="Dataset type (default: aime)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-split",
|
||||
type=str,
|
||||
@@ -294,7 +352,8 @@ def main():
|
||||
port=args.port,
|
||||
host=args.host,
|
||||
success_rate=args.success_rate,
|
||||
dataset_split=args.dataset_split
|
||||
dataset_split=args.dataset_split,
|
||||
dataset_type=args.dataset
|
||||
)
|
||||
|
||||
server = HTTPServer((args.host, args.port), RequestHandler)
|
||||
@@ -304,7 +363,7 @@ def main():
|
||||
print("\n=== llama-server-simulator ===")
|
||||
print(f"Server running on http://{args.host}:{args.port}")
|
||||
print(f"Success rate: {args.success_rate}")
|
||||
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
|
||||
print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions")
|
||||
print("\nPress Ctrl+C to stop\n")
|
||||
|
||||
try:
|
||||
|
||||
@@ -1,22 +1,296 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <clocale>
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
|
||||
struct llama_batch_ptr {
|
||||
llama_batch batch;
|
||||
|
||||
llama_batch_ptr(int32_t n_tokens, int32_t embd, int32_t n_seq_max)
|
||||
: batch{llama_batch_init(n_tokens, embd, n_seq_max)} {}
|
||||
|
||||
~llama_batch_ptr() { llama_batch_free(batch); }
|
||||
|
||||
llama_batch_ptr(const llama_batch_ptr &) = delete;
|
||||
llama_batch_ptr & operator=(const llama_batch_ptr &) = delete;
|
||||
llama_batch_ptr(llama_batch_ptr &&) = default;
|
||||
llama_batch_ptr & operator=(llama_batch_ptr &&) = default;
|
||||
|
||||
llama_batch & get() { return batch; }
|
||||
const llama_batch & get() const { return batch; }
|
||||
};
|
||||
|
||||
// Samples up to n_predict tokens from ctx with smpl, feeding each sampled token
// back through llama_decode() on sequence seq_id at position n_past (which is
// advanced by one per decoded token). Every piece is logged and appended to the
// returned text. Returns an empty string if a decode call fails.
static std::string generate_tokens(llama_context * ctx, llama_sampler * smpl, int & n_past, int32_t n_predict, llama_seq_id seq_id) {
    llama_batch_ptr step(1, 0, 1);
    std::string     generated;

    for (int32_t remaining = n_predict; remaining > 0; --remaining) {
        const auto        token = llama_sampler_sample(smpl, ctx, -1);
        const std::string piece = common_token_to_piece(ctx, token);

        LOG("%s", piece.c_str());
        generated.append(piece);

        // single-token batch for the freshly sampled token, requesting logits
        llama_batch & single = step.get();
        common_batch_clear(single);
        common_batch_add(single, token, n_past, {seq_id}, true);

        if (llama_decode(ctx, single) != 0) {
            LOG_ERR("\n%s: failed to evaluate\n", __func__);
            return {};
        }

        ++n_past;
    }

    return generated;
}
|
||||
|
||||
// Test 1: baseline
|
||||
// - tokenize the prompt
|
||||
// - decode all but the last token
|
||||
// - save state to disk
|
||||
// - decode the last token
|
||||
// - generate n_predict tokens
|
||||
static std::string test_baseline(struct llama_model * model, const struct common_params & params) {
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
auto n_past = 0;
|
||||
if (!common_prompt_batch_decode(ctx.get(), tokens, n_past, params.n_batch, params.out_file, true)) {
|
||||
LOG_ERR("%s: failed to decode prompt\n", __func__);
|
||||
return {};
|
||||
}
|
||||
|
||||
LOG("\n=== Test 1: baseline ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0);
|
||||
if (result.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
LOG("\n");
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Test 2: state load
|
||||
// - create a new context
|
||||
// - load state from file
|
||||
// - replay the last prompt token
|
||||
// - generate n_predict tokens and compare against expected result
|
||||
static bool test_state_load(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
LOG("\n=== Test 2: state load ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
// Load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size());
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
LOG_ERR("\n%s: failed to load state\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// Replay last token
|
||||
int n_past = (int) n_token_count_out;
|
||||
if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
|
||||
return false;
|
||||
}
|
||||
n_past++;
|
||||
|
||||
// Generate tokens
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0);
|
||||
if (result.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (result != expected_result) {
|
||||
LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG("\nPASS\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Test 3: seq copy (host)
|
||||
// - create a multi-seq context
|
||||
// - load state from file
|
||||
// - replay the last prompt token
|
||||
// - migrate KV cache from seq 0 to seq 1 via the CPU path
|
||||
// - generate n_predict tokens on seq 1 and compare against expected result
|
||||
static bool test_seq_cp_host(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
|
||||
auto params_ctx = common_context_params_to_llama(params);
|
||||
params_ctx.n_seq_max = 2;
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
LOG("\n=== Test 3: seq copy (host) ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
// Load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size());
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
LOG_ERR("\n%s: failed to load state\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// Replay last token
|
||||
int n_past = (int) n_token_count_out;
|
||||
if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
|
||||
return false;
|
||||
}
|
||||
n_past++;
|
||||
|
||||
// Migrate KV cache from seq 0 to seq 1 (CPU path)
|
||||
{
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx.get(), 0));
|
||||
const size_t ncopy = llama_state_seq_get_data(ctx.get(), seq_store.data(), seq_store.size(), 0);
|
||||
if (ncopy != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
llama_memory_clear(llama_get_memory(ctx.get()), true);
|
||||
LOG_TRC("%s: kv cache cleared\n", __func__);
|
||||
|
||||
const size_t nset = llama_state_seq_set_data(ctx.get(), seq_store.data(), seq_store.size(), 1);
|
||||
if (nset != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// Generate tokens on seq 1
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1);
|
||||
if (result.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (result != expected_result) {
|
||||
LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG("\nPASS\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Test 4: seq copy (device)
|
||||
// - create a multi-seq context
|
||||
// - load state from file
|
||||
// - replay the last prompt token
|
||||
// - migrate KV cache from seq 0 to seq 1 via the on-device path
|
||||
// - generate n_predict tokens on seq 1 and compare against expected result
|
||||
static bool test_seq_cp_device(struct llama_model * model, const struct common_params & params, const std::string & expected_result) {
|
||||
auto params_ctx = common_context_params_to_llama(params);
|
||||
params_ctx.n_seq_max = 2;
|
||||
auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)};
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)};
|
||||
llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
auto tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
|
||||
LOG("\n=== Test 4: seq copy (device) ===\n");
|
||||
LOG("%s", params.prompt.c_str());
|
||||
|
||||
// Load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size());
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
LOG_ERR("\n%s: failed to load state\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// Replay last token
|
||||
int n_past = (int) n_token_count_out;
|
||||
if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) {
|
||||
return false;
|
||||
}
|
||||
n_past++;
|
||||
|
||||
// Migrate KV cache from seq 0 to seq 1 (on-device path)
|
||||
{
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx.get(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
|
||||
const size_t ncopy = llama_state_seq_get_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (ncopy != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
llama_memory_clear(llama_get_memory(ctx.get()), true);
|
||||
LOG_TRC("%s: kv cache cleared\n", __func__);
|
||||
|
||||
const size_t nset = llama_state_seq_set_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (nset != seq_store.size()) {
|
||||
LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return false;
|
||||
}
|
||||
LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// Generate tokens on seq 1
|
||||
auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1);
|
||||
if (result.empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (result != expected_result) {
|
||||
LOG_ERR("\n%s: error: generation differs from expected\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG("\nPASS\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
|
||||
params.prompt = "The quick brown fox";
|
||||
params.out_file = "dump_state.bin";
|
||||
params.sampling.seed = 1234;
|
||||
|
||||
const std::string_view state_file = "dump_state.bin";
|
||||
|
||||
common_init();
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
@@ -24,8 +298,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.n_parallel == 1) {
|
||||
// the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
|
||||
printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
|
||||
LOG_TRC("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
|
||||
params.kv_unified = true;
|
||||
}
|
||||
|
||||
@@ -33,288 +306,40 @@ int main(int argc, char ** argv) {
|
||||
params.n_predict = 16;
|
||||
}
|
||||
|
||||
auto n_past = 0;
|
||||
|
||||
std::string result0;
|
||||
std::string result1;
|
||||
std::string result2;
|
||||
std::string result3;
|
||||
|
||||
// init
|
||||
|
||||
ggml_backend_load_all();
|
||||
|
||||
auto llama_init = common_init_from_params(params);
|
||||
|
||||
auto llama_init = common_init_from_params(params, true);
|
||||
auto * model = llama_init->model();
|
||||
auto * ctx = llama_init->context();
|
||||
|
||||
if (model == nullptr || ctx == nullptr) {
|
||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||
if (model == nullptr) {
|
||||
LOG_ERR("%s: failed to init\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto sparams = llama_sampler_chain_default_params();
|
||||
GGML_ASSERT(llama_init->context() == nullptr);
|
||||
|
||||
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
// tokenize prompt
|
||||
auto tokens = common_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const bool save_state = true;
|
||||
if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) {
|
||||
// Test 1: baseline (saves state to disk)
|
||||
auto result_baseline = test_baseline(model, params);
|
||||
if (result_baseline.empty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// first run
|
||||
printf("\nfirst run: %s", params.prompt.c_str());
|
||||
|
||||
llama_batch batch = llama_batch_init(1, 0, 1);
|
||||
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result0 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {0}, true);
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
// make new context
|
||||
llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params));
|
||||
|
||||
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsecond run: %s", params.prompt.c_str());
|
||||
|
||||
// load state from file
|
||||
std::vector<llama_token> unused_sts(tokens.size()); // unused session tokens.
|
||||
size_t n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
fprintf(stderr, "\n%s : failed to load state\n", __func__);
|
||||
// Test 2: state load
|
||||
if (!test_state_load(model, params, result_baseline)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// restore state (last tokens)
|
||||
n_past = n_token_count_out;
|
||||
if (!common_replay_last_token(ctx2, tokens.back(), n_past)) {
|
||||
return 1;
|
||||
}
|
||||
++n_past;
|
||||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx2, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result1 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {0}, true);
|
||||
|
||||
if (llama_decode(ctx2, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n\n");
|
||||
|
||||
if (result0 != result1) {
|
||||
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
|
||||
// Test 3: seq copy (host)
|
||||
if (!test_seq_cp_host(model, params, result_baseline)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// make new context
|
||||
auto params_ctx3 = common_context_params_to_llama(params);
|
||||
params_ctx3.n_seq_max = 2;
|
||||
llama_context * ctx3 = llama_init_from_model(model, params_ctx3);
|
||||
|
||||
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
fprintf(stderr, "\n%s : failed to load state\n", __func__);
|
||||
// Test 4: seq copy (device)
|
||||
if (!test_seq_cp_device(model, params, result_baseline)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// restore state (last tokens)
|
||||
n_past = n_token_count_out;
|
||||
if (!common_replay_last_token(ctx3, tokens.back(), n_past)) {
|
||||
return 1;
|
||||
}
|
||||
++n_past;
|
||||
|
||||
// save seq 0 and load into seq 1
|
||||
{
|
||||
// save kv of seq 0
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
|
||||
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
|
||||
if (ncopy != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
// erase whole kv
|
||||
llama_memory_clear(llama_get_memory(ctx3), true);
|
||||
fprintf(stderr, "%s : kv cache cleared\n", __func__);
|
||||
|
||||
// restore kv into seq 1
|
||||
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
|
||||
if (nset != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// third run with seq 1 instead of 0
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx3, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result2 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {1}, true);
|
||||
|
||||
if (llama_decode(ctx3, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
// test on-device state save/load
|
||||
auto params_ctx4 = common_context_params_to_llama(params);
|
||||
params_ctx4.n_seq_max = 2;
|
||||
llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
|
||||
|
||||
llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
fprintf(stderr, "\n%s : failed to load state\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// restore state (last tokens)
|
||||
n_past = n_token_count_out;
|
||||
if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
|
||||
return 1;
|
||||
}
|
||||
++n_past;
|
||||
|
||||
// save seq 0 and load into seq 1
|
||||
{
|
||||
// save kv of seq 0
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
|
||||
const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (ncopy != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
// erase whole kv
|
||||
llama_memory_clear(llama_get_memory(ctx4), true);
|
||||
fprintf(stderr, "%s : kv cache cleared\n", __func__);
|
||||
|
||||
// restore kv into seq 1
|
||||
const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (nset != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// fourth run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl4, ctx4, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx4, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result3 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {1}, true);
|
||||
|
||||
if (llama_decode(ctx4, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_sampler_free(smpl2);
|
||||
llama_sampler_free(smpl3);
|
||||
llama_sampler_free(smpl4);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
// this one is managed by common_init_result
|
||||
//llama_free(ctx);
|
||||
|
||||
llama_free(ctx2);
|
||||
llama_free(ctx3);
|
||||
llama_free(ctx4);
|
||||
|
||||
if (result0 != result2) {
|
||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (result0 != result3) {
|
||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n%s : success\n", __func__);
|
||||
LOG("\nAll tests passed.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -2744,6 +2744,18 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
@@ -2816,6 +2828,21 @@ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session *
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32) { return false; }
|
||||
if (dst->type != GGML_TYPE_F32) { return false; }
|
||||
if (!ggml_are_same_shape(src0, dst)) { return false; }
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; }
|
||||
|
||||
return true;
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
||||
return sess->c_name();
|
||||
@@ -2857,6 +2884,9 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
||||
case GGML_OP_TRI: return HTP_OP_TRI;
|
||||
case GGML_OP_PAD: return HTP_OP_PAD;
|
||||
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
||||
@@ -3416,6 +3446,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_solve_tri(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_TRI:
|
||||
supp = ggml_hexagon_supported_tri(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_PAD:
|
||||
supp = ggml_hexagon_supported_pad(sess, op);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -38,6 +38,7 @@ add_library(${HTP_LIB} SHARED
|
||||
diag-ops.c
|
||||
solve-tri-ops.c
|
||||
gated-delta-net-ops.c
|
||||
pad-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
@@ -107,5 +107,7 @@ int op_fill(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
int op_solve_tri(struct htp_ops_context * octx);
|
||||
int op_gated_delta_net(struct htp_ops_context * octx);
|
||||
int op_tri(struct htp_ops_context * octx);
|
||||
int op_pad(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
||||
@@ -86,6 +86,8 @@ enum htp_op_code {
|
||||
HTP_OP_SOLVE_TRI,
|
||||
HTP_OP_L2_NORM,
|
||||
HTP_OP_GATED_DELTA_NET,
|
||||
HTP_OP_TRI,
|
||||
HTP_OP_PAD,
|
||||
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
@@ -595,9 +595,15 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_SOLVE_TRI:
|
||||
return op_solve_tri(octx);
|
||||
|
||||
case HTP_OP_PAD:
|
||||
return op_pad(octx);
|
||||
|
||||
case HTP_OP_GATED_DELTA_NET:
|
||||
return op_gated_delta_net(octx);
|
||||
|
||||
case HTP_OP_TRI:
|
||||
return op_tri(octx);
|
||||
|
||||
case HTP_OP_INVALID:
|
||||
break;
|
||||
|
||||
|
||||
545
ggml/src/ggml-hexagon/htp/pad-ops.c
Normal file
545
ggml/src/ggml-hexagon/htp/pad-ops.c
Normal file
@@ -0,0 +1,545 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "hex-dma.h"
|
||||
#include "hvx-utils.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
/* Circular wrap: maps any integer x into [0, n) */
|
||||
/* Circular wrap: fold any signed index x into the range [0, n). */
static inline uint32_t wrap_around(int32_t x, uint32_t n) {
    const int32_t period = (int32_t) n;

    /* C's % keeps the sign of the dividend, so shift into the positive range
     * and reduce once more. */
    int32_t rem = x % period;
    rem = (rem + period) % period;

    return (uint32_t) rem;
}
|
||||
|
||||
/* Decompose a flat dst row index into (i1, i2, i3) */
|
||||
/* Split a flat dst row index `ir` into its (i1, i2, i3) coordinates, where
 * i1 is the fastest-varying of the three and ne1/ne2 are the dst extents. */
static inline void pad_decompose_row(uint32_t ir, uint32_t ne1, uint32_t ne2,
                                     uint32_t *i1, uint32_t *i2, uint32_t *i3) {
    const uint32_t plane_row = ir / ne1;        /* index with dim 1 stripped off */

    *i1 = ir - plane_row * ne1;                 /* == ir % ne1 */
    *i2 = plane_row % ne2;
    *i3 = ir / (ne1 * ne2);
}
|
||||
|
||||
/* Return non-zero if row (i1,i2,i3) falls in the non-padded interior */
|
||||
/* Return 1 when dst row (i1,i2,i3) lies inside the non-padded region of
 * dims 1..3, i.e. lpN <= iN < neN - rpN for every N; otherwise return 0. */
static inline int pad_is_interior(uint32_t i1, uint32_t i2, uint32_t i3,
                                  int32_t lp1, int32_t rp1, uint32_t ne1,
                                  int32_t lp2, int32_t rp2, uint32_t ne2,
                                  int32_t lp3, int32_t rp3, uint32_t ne3) {
    const int32_t c1 = (int32_t) i1;
    const int32_t c2 = (int32_t) i2;
    const int32_t c3 = (int32_t) i3;

    if (c1 < lp1 || c1 >= (int32_t) ne1 - rp1) {
        return 0;
    }
    if (c2 < lp2 || c2 >= (int32_t) ne2 - rp2) {
        return 0;
    }
    if (c3 < lp3 || c3 >= (int32_t) ne3 - rp3) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
/* Compute the DDR src row pointer for a zero-pad interior row */
|
||||
static inline const uint8_t * pad_src_row_ptr(const struct htp_tensor * src,
|
||||
uint32_t i1, uint32_t i2, uint32_t i3,
|
||||
int32_t lp1, int32_t lp2, int32_t lp3) {
|
||||
return (const uint8_t *) src->data
|
||||
+ (i1 - (uint32_t)lp1) * src->nb[1]
|
||||
+ (i2 - (uint32_t)lp2) * src->nb[2]
|
||||
+ (i3 - (uint32_t)lp3) * src->nb[3];
|
||||
}
|
||||
|
||||
/* Compute the DDR src row pointer for a circular row (wrap-around indexing) */
|
||||
static inline const uint8_t * pad_circ_src_row_ptr(const struct htp_tensor * src,
|
||||
uint32_t i1, uint32_t i2, uint32_t i3,
|
||||
int32_t lp1, int32_t lp2, int32_t lp3) {
|
||||
return (const uint8_t *) src->data
|
||||
+ wrap_around((int32_t)i1 - lp1, src->ne[1]) * src->nb[1]
|
||||
+ wrap_around((int32_t)i2 - lp2, src->ne[2]) * src->nb[2]
|
||||
+ wrap_around((int32_t)i3 - lp3, src->ne[3]) * src->nb[3];
|
||||
}
|
||||
|
||||
// Shared, read-only job description handed to every PAD worker thread.
struct htp_pad_context {
    struct htp_ops_context * octx;  // owning op context (tensors, scratchpads, DMA queues)

    // Left/right padding per dimension, as read from op_params[0..7]
    int32_t lp0;
    int32_t rp0;
    int32_t lp1;
    int32_t rp1;
    int32_t lp2;
    int32_t rp2;
    int32_t lp3;
    int32_t rp3;

    uint32_t nrows_per_thread;  // dst rows assigned to each worker (ceil division)
    uint32_t total_dst_rows;    // dst->ne[1] * dst->ne[2] * dst->ne[3]

    size_t type_size;           // bytes per element

    // Row sizes in bytes, exact and rounded up to VLEN; consumed by the DMA kernels
    size_t src_row_size;
    size_t src_row_size_aligned;
    size_t dst_row_size;
    size_t dst_row_size_aligned;
};
|
||||
|
||||
// Common prologue for every PAD worker. Expects `octx`, `pctx` and `ith` to be
// in scope and declares: the src/dst tensors, the dst extents and byte strides,
// the src row length and element stride, all eight pad amounts, the element
// size, and this thread's [row_start, row_end) slice of dst rows.
#define htp_pad_preamble                                                    \
    const struct htp_tensor * src = octx->src[0];                           \
    const struct htp_tensor * dst = octx->dst;                              \
                                                                            \
    const int32_t lp0 = pctx->lp0, rp0 = pctx->rp0;                         \
    const int32_t lp1 = pctx->lp1, rp1 = pctx->rp1;                         \
    const int32_t lp2 = pctx->lp2, rp2 = pctx->rp2;                         \
    const int32_t lp3 = pctx->lp3, rp3 = pctx->rp3;                         \
                                                                            \
    const size_t type_size = pctx->type_size;                               \
                                                                            \
    const uint32_t ne00 = src->ne[0];                                       \
    const uint32_t nb00 = src->nb[0];                                       \
                                                                            \
    const uint32_t ne0 = dst->ne[0];                                        \
    const uint32_t ne1 = dst->ne[1];                                        \
    const uint32_t ne2 = dst->ne[2];                                        \
    const uint32_t ne3 = dst->ne[3];                                        \
                                                                            \
    const uint32_t nb1 = dst->nb[1];                                        \
    const uint32_t nb2 = dst->nb[2];                                        \
    const uint32_t nb3 = dst->nb[3];                                        \
                                                                            \
    const uint32_t row_start = pctx->nrows_per_thread * ith;                \
    const uint32_t row_end   = MIN(row_start + pctx->nrows_per_thread, pctx->total_dst_rows);
|
||||
|
||||
|
||||
// Extra prologue for the DMA kernels. Expects `octx`, `pctx` and `ith` to be in
// scope and declares: this thread's DMA queue, the exact and aligned row sizes,
// and the base of this thread's slice of the src and dst scratchpads.
#define htp_pad_dma_preamble                                                                  \
    dma_queue * dma = octx->ctx->dma[ith];                                                    \
                                                                                              \
    const size_t src_row_size         = pctx->src_row_size;                                   \
    const size_t src_row_size_aligned = pctx->src_row_size_aligned;                           \
    const size_t dst_row_size         = pctx->dst_row_size;                                   \
    const size_t dst_row_size_aligned = pctx->dst_row_size_aligned;                           \
                                                                                              \
    uint8_t * src_spad_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread;   \
    uint8_t * dst_spad_base = octx->dst_spad.data + ith * octx->dst_spad.size_per_thread;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX vectorized PAD kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) {
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3);
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
|
||||
const int interior = pad_is_interior(i1, i2, i3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
|
||||
if (!interior) {
|
||||
hvx_splat_f32_u(dst_ptr, 0.0f, ne0);
|
||||
} else {
|
||||
const uint8_t * src_ptr = pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3);
|
||||
|
||||
if (lp0 > 0) {
|
||||
hvx_splat_f32_u(dst_ptr, 0.0f, (uint32_t)lp0);
|
||||
}
|
||||
|
||||
uint8_t * dst_row_start = dst_ptr + (size_t)lp0 * type_size;
|
||||
if (nb00 == type_size) {
|
||||
hvx_copy_f32_uu(dst_row_start, src_ptr, ne00);
|
||||
} else {
|
||||
for (uint32_t i = 0; i < ne00; i++) {
|
||||
memcpy(dst_row_start + i * type_size,
|
||||
src_ptr + (size_t)i * nb00,
|
||||
type_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (rp0 > 0) {
|
||||
hvx_splat_f32_u(dst_ptr + ((size_t)lp0 + ne00) * type_size, 0.0f, (uint32_t)rp0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX + DMA PAD kernel — aligned, double-buffered
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx_dma(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
htp_pad_dma_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the
|
||||
// double-buffer pipeline before the main loop begins.
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) {
|
||||
uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned;
|
||||
uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr((uint8_t *)dst->data, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 0);
|
||||
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
|
||||
const int interior = pad_is_interior(i1, i2, i3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
|
||||
const uint8_t * src_ptr = interior
|
||||
? pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3) : NULL;
|
||||
|
||||
// Interior row: real DMA (1 row) from DDR to VTCM.
|
||||
// Border row: null DMA (nrows=0)
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur,
|
||||
src_ptr ? src_ptr : (const uint8_t *)src_spad_cur),
|
||||
src_row_size_aligned, src_row_size, src_ptr ? 1 : 0);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Main loop: pop completed DMAs, compute in VTCM with aligned HVX ops,
|
||||
// push dst DMA and prefetch src for the next+1 row.
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start; ir < row_end; ir++) {
|
||||
uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src;
|
||||
uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst;
|
||||
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
|
||||
const int interior = pad_is_interior(i1, i2, i3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
|
||||
if (!interior) {
|
||||
hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0);
|
||||
} else {
|
||||
hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0);
|
||||
|
||||
uint8_t * dst_interior = dst_spad_cur + (size_t)lp0 * type_size;
|
||||
|
||||
if ((uintptr_t)dst_interior % VLEN == 0) {
|
||||
hvx_copy_f32_aa(dst_interior, src_spad_cur, ne00);
|
||||
} else {
|
||||
hvx_copy_f32_ua(dst_interior, src_spad_cur, ne00);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr(dst_ptr, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 1);
|
||||
|
||||
const uint32_t next_row = ir + 2;
|
||||
if (next_row < row_end) {
|
||||
uint32_t ni1, ni2, ni3;
|
||||
pad_decompose_row(next_row, ne1, ne2, &ni1, &ni2, &ni3);
|
||||
const int next_interior = pad_is_interior(ni1, ni2, ni3,
|
||||
lp1, rp1, ne1,
|
||||
lp2, rp2, ne2,
|
||||
lp3, rp3, ne3);
|
||||
const uint8_t * next_src_ptr = next_interior
|
||||
? pad_src_row_ptr(src, ni1, ni2, ni3, lp1, lp2, lp3) : NULL;
|
||||
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur,
|
||||
next_src_ptr ? next_src_ptr : (const uint8_t *)src_spad_cur),
|
||||
src_row_size_aligned, src_row_size, next_src_ptr ? 1 : 0);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_flush(dma);
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX circular PAD kernel
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx_circular(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) {
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3);
|
||||
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
const uint8_t * src_row = pad_circ_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3);
|
||||
|
||||
if (nb00 == type_size) {
|
||||
|
||||
if (lp0 > 0) {
|
||||
if ((uint32_t)lp0 < 32) {
|
||||
memcpy(dst_ptr,
|
||||
src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size,
|
||||
(size_t)lp0 * type_size);
|
||||
} else {
|
||||
hvx_copy_f32_uu(dst_ptr,
|
||||
src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size,
|
||||
(uint32_t)lp0);
|
||||
}
|
||||
}
|
||||
hvx_copy_f32_uu(dst_ptr + (size_t)lp0 * type_size, src_row, ne00);
|
||||
if (rp0 > 0) {
|
||||
if ((uint32_t)rp0 < 32) {
|
||||
memcpy(dst_ptr + ((size_t)lp0 + ne00) * type_size,
|
||||
src_row,
|
||||
(size_t)rp0 * type_size);
|
||||
} else {
|
||||
hvx_copy_f32_uu(dst_ptr + ((size_t)lp0 + ne00) * type_size,
|
||||
src_row,
|
||||
(uint32_t)rp0);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (uint32_t i = 0; i < (uint32_t)lp0; i++) {
|
||||
*(float *)(dst_ptr + i * type_size) =
|
||||
*(const float *)(src_row + (size_t)(ne00 - (uint32_t)lp0 + i) * nb00);
|
||||
}
|
||||
for (uint32_t i = 0; i < ne00; i++) {
|
||||
*(float *)(dst_ptr + ((size_t)lp0 + i) * type_size) =
|
||||
*(const float *)(src_row + (size_t)i * nb00);
|
||||
}
|
||||
for (uint32_t i = 0; i < (uint32_t)rp0; i++) {
|
||||
*(float *)(dst_ptr + ((size_t)lp0 + ne00 + i) * type_size) =
|
||||
*(const float *)(src_row + (size_t)i * nb00);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx-circ %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX + DMA circular PAD kernel — aligned, double-buffered
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void pad_job_per_thread_hvx_circular_dma(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_pad_context * pctx = (const struct htp_pad_context *) data;
|
||||
struct htp_ops_context * octx = pctx->octx;
|
||||
htp_pad_preamble;
|
||||
htp_pad_dma_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the
|
||||
// double-buffer pipeline. Every row is a real src DMA (no null DMAs).
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) {
|
||||
uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned;
|
||||
uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned;
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr((uint8_t *)dst->data, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 0);
|
||||
|
||||
uint32_t pi1, pi2, pi3;
|
||||
pad_decompose_row(ir, ne1, ne2, &pi1, &pi2, &pi3);
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur, pad_circ_src_row_ptr(src, pi1, pi2, pi3, lp1, lp2, lp3)),
|
||||
src_row_size_aligned, src_row_size, 1);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Main loop: pop completed DMAs, assemble circular row in VTCM with
|
||||
// aligned HVX ops, push dst DMA and prefetch src for the next+1 row.
|
||||
// -----------------------------------------------------------------------
|
||||
for (uint32_t ir = row_start; ir < row_end; ir++) {
|
||||
uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src;
|
||||
uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst;
|
||||
|
||||
uint32_t i1, i2, i3;
|
||||
pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3);
|
||||
uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3;
|
||||
|
||||
|
||||
if (lp0 > 0) {
|
||||
uint8_t * dst_left = dst_spad_cur;
|
||||
const uint8_t * src_left = src_spad_cur + (size_t)(ne00 - (uint32_t)lp0) * type_size;
|
||||
if ((uint32_t)lp0 < 32) {
|
||||
memcpy(dst_left, src_left, (size_t)lp0 * type_size);
|
||||
} else {
|
||||
hvx_copy_f32_uu(dst_left, src_left, (uint32_t)lp0);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
uint8_t * dst_mid = dst_spad_cur + (size_t)lp0 * type_size;
|
||||
if ((uintptr_t)dst_mid % VLEN == 0) {
|
||||
hvx_copy_f32_aa(dst_mid, src_spad_cur, ne00);
|
||||
} else {
|
||||
hvx_copy_f32_ua(dst_mid, src_spad_cur, ne00);
|
||||
}
|
||||
}
|
||||
|
||||
if (rp0 > 0) {
|
||||
uint8_t * dst_right = dst_spad_cur + ((size_t)lp0 + ne00) * type_size;
|
||||
if ((uint32_t)rp0 < 32) {
|
||||
memcpy(dst_right, src_spad_cur, (size_t)rp0 * type_size);
|
||||
} else {
|
||||
if ((uintptr_t)dst_right % VLEN == 0) {
|
||||
hvx_copy_f32_aa(dst_right, src_spad_cur, (uint32_t)rp0);
|
||||
} else {
|
||||
hvx_copy_f32_ua(dst_right, src_spad_cur, (uint32_t)rp0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma,
|
||||
dma_make_ptr(dst_ptr, dst_spad_cur),
|
||||
dst_row_size, dst_row_size_aligned, 1);
|
||||
|
||||
const uint32_t next_row = ir + 2;
|
||||
if (next_row < row_end) {
|
||||
uint32_t nri1, nri2, nri3;
|
||||
pad_decompose_row(next_row, ne1, ne2, &nri1, &nri2, &nri3);
|
||||
dma_queue_push_ddr_to_vtcm(dma,
|
||||
dma_make_ptr(src_spad_cur,
|
||||
pad_circ_src_row_ptr(src, nri1, nri2, nri3, lp1, lp2, lp3)),
|
||||
src_row_size_aligned, src_row_size, 1);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_flush(dma);
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "pad-hvx-circ-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n",
|
||||
ith, nth,
|
||||
src->ne[0], src->ne[1], src->ne[2], src->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
row_start, row_end,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_pad(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = octx->src[0];
|
||||
const struct htp_tensor * dst = octx->dst;
|
||||
|
||||
// Only F32 supported
|
||||
size_t type_size;
|
||||
switch (src0->type) {
|
||||
case HTP_TYPE_F32: type_size = 4; break;
|
||||
default:
|
||||
FARF(ERROR, "pad-hvx: unsupported type %u\n", src0->type);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const int32_t lp0 = octx->op_params[0];
|
||||
const int32_t rp0 = octx->op_params[1];
|
||||
const int32_t lp1 = octx->op_params[2];
|
||||
const int32_t rp1 = octx->op_params[3];
|
||||
const int32_t lp2 = octx->op_params[4];
|
||||
const int32_t rp2 = octx->op_params[5];
|
||||
const int32_t lp3 = octx->op_params[6];
|
||||
const int32_t rp3 = octx->op_params[7];
|
||||
const int32_t circular = octx->op_params[8];
|
||||
|
||||
const uint32_t ne0 = dst->ne[0];
|
||||
const uint32_t ne00 = src0->ne[0];
|
||||
|
||||
const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * dst->ne[3];
|
||||
const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows > 0 ? total_dst_rows : 1);
|
||||
|
||||
const size_t src_row_size = (size_t)ne00 * type_size;
|
||||
const size_t dst_row_size = (size_t)ne0 * type_size;
|
||||
const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
|
||||
// Total VTCM needed: 2 buffers (ping+pong) for src and dst, per thread
|
||||
const size_t vtcm_needed = (size_t)n_threads * 2 * (src_row_size_aligned + dst_row_size_aligned);
|
||||
|
||||
const int use_dma = (src0->nb[0] == (uint32_t)type_size) &&
|
||||
(ne00 >= 512) &&
|
||||
(octx->ctx->vtcm_base != NULL) &&
|
||||
(octx->ctx->vtcm_size >= vtcm_needed);
|
||||
|
||||
if (use_dma) {
|
||||
octx->src0_spad.size_per_thread = 2 * src_row_size_aligned;
|
||||
octx->dst_spad.size_per_thread = 2 * dst_row_size_aligned;
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
}
|
||||
|
||||
struct htp_pad_context pctx = {
|
||||
.octx = octx,
|
||||
.lp0 = lp0, .rp0 = rp0,
|
||||
.lp1 = lp1, .rp1 = rp1,
|
||||
.lp2 = lp2, .rp2 = rp2,
|
||||
.lp3 = lp3, .rp3 = rp3,
|
||||
.nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads,
|
||||
.total_dst_rows = total_dst_rows,
|
||||
.type_size = type_size,
|
||||
.src_row_size = src_row_size,
|
||||
.src_row_size_aligned = src_row_size_aligned,
|
||||
.dst_row_size = dst_row_size,
|
||||
.dst_row_size_aligned = dst_row_size_aligned,
|
||||
};
|
||||
|
||||
FARF(HIGH, "pad-hvx%s%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) pads=(%d,%d,%d,%d,%d,%d,%d,%d)\n",
|
||||
circular ? "-circ" : "",
|
||||
use_dma ? "-dma" : "",
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
|
||||
|
||||
if (circular && use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular_dma, &pctx, n_threads); }
|
||||
else if (circular) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular, &pctx, n_threads); }
|
||||
else if (use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_dma, &pctx, n_threads); }
|
||||
else { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx, &pctx, n_threads); }
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "htp-ops.h"
|
||||
|
||||
struct htp_unary_context {
|
||||
struct htp_ops_context * octx;
|
||||
@@ -277,6 +276,95 @@ static void sigmoid_f32(const float * restrict src,
|
||||
}
|
||||
}
|
||||
|
||||
static void tri_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
const uint32_t num_rows,
|
||||
const uint32_t row_elems,
|
||||
const size_t row_size,
|
||||
int32_t * op_params,
|
||||
const uint32_t ir,
|
||||
const struct htp_unary_context * uctx) {
|
||||
|
||||
const int32_t ttype = op_params[0];
|
||||
const HVX_Vector zero = hvx_vec_splat_f32(0.0f);
|
||||
const uint32_t nvec = row_elems / VLEN_FP32;
|
||||
const uint32_t nloe = row_elems % VLEN_FP32;
|
||||
|
||||
const uint32_t ne01 = uctx->octx->src[0]->ne[1];
|
||||
|
||||
for (uint32_t b = 0; b < num_rows; b++) {
|
||||
const uint32_t abs_row = ir + b;
|
||||
const uint32_t i01 = abs_row % ne01;
|
||||
|
||||
const HVX_Vector * restrict v_src = (const HVX_Vector *) ((const uint8_t *) src + b * row_size);
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) ((uint8_t *) dst + b * row_size);
|
||||
|
||||
uint32_t boundary;
|
||||
int keep_left;
|
||||
switch (ttype) {
|
||||
case 0: boundary = i01; keep_left = 0; break; // keep col >= row
|
||||
case 1: boundary = i01 + 1; keep_left = 0; break; // keep col > row
|
||||
case 2: boundary = i01 + 1; keep_left = 1; break; // keep col <= row
|
||||
case 3: boundary = i01; keep_left = 1; break; // keep col < row
|
||||
default: boundary = 0; keep_left = 0; break;
|
||||
}
|
||||
if (boundary > row_elems) boundary = row_elems;
|
||||
|
||||
// Full HVX vectors — each starts at a 128-byte aligned offset
|
||||
for (uint32_t i = 0; i < nvec; i++) {
|
||||
const uint32_t vec_start = i * VLEN_FP32;
|
||||
const uint32_t vec_end = vec_start + VLEN_FP32;
|
||||
if (keep_left) {
|
||||
if (vec_end <= boundary) {
|
||||
v_dst[i] = v_src[i];
|
||||
} else if (vec_start >= boundary) {
|
||||
v_dst[i] = zero;
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
v_dst[i] = Q6_V_vmux_QVV(mask, v_src[i], zero);
|
||||
}
|
||||
} else {
|
||||
if (vec_end <= boundary) {
|
||||
v_dst[i] = zero;
|
||||
} else if (vec_start >= boundary) {
|
||||
v_dst[i] = v_src[i];
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
v_dst[i] = Q6_V_vmux_QVV(mask, zero, v_src[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tail elements (row_elems not a multiple of VLEN_FP32)
|
||||
if (nloe > 0) {
|
||||
const uint32_t vec_start = nvec * VLEN_FP32;
|
||||
const uint32_t vec_end = vec_start + nloe;
|
||||
HVX_Vector tail_val;
|
||||
if (keep_left) {
|
||||
if (vec_end <= boundary) {
|
||||
tail_val = v_src[nvec];
|
||||
} else if (vec_start >= boundary) {
|
||||
tail_val = zero;
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
tail_val = Q6_V_vmux_QVV(mask, v_src[nvec], zero);
|
||||
}
|
||||
} else {
|
||||
if (vec_end <= boundary) {
|
||||
tail_val = zero;
|
||||
} else if (vec_start >= boundary) {
|
||||
tail_val = v_src[nvec];
|
||||
} else {
|
||||
HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float));
|
||||
tail_val = Q6_V_vmux_QVV(mask, zero, v_src[nvec]);
|
||||
}
|
||||
}
|
||||
hvx_vec_store_a(&v_dst[nvec], nloe * sizeof(float), tail_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void softplus_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
@@ -498,6 +586,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
case HTP_OP_L2_NORM:
|
||||
l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_TRI:
|
||||
tri_f32(src0_spad, dst_spad, NULL, block_size, ne00, src0_row_size_aligned, op_params, ir, uctx);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -571,6 +662,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
case HTP_OP_L2_NORM:
|
||||
op_type = "l2norm-f32";
|
||||
break;
|
||||
case HTP_OP_TRI:
|
||||
op_type = "tri-f32";
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
@@ -640,6 +735,22 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
return err;
|
||||
}
|
||||
|
||||
int op_tri(struct htp_ops_context * octx) {
|
||||
int err = HTP_STATUS_OK;
|
||||
|
||||
switch (octx->src[0]->type) {
|
||||
case HTP_TYPE_F32:
|
||||
err = execute_op_unary_f32(octx);
|
||||
break;
|
||||
|
||||
default:
|
||||
err = HTP_STATUS_NO_SUPPORT;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int op_unary(struct htp_ops_context * octx) {
|
||||
int err = HTP_STATUS_OK;
|
||||
|
||||
|
||||
@@ -199,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() {
|
||||
return &guid;
|
||||
}
|
||||
|
||||
// Per-device state for an RPC backend device.
// Members are aggregate-initialised positionally, so their order must not change.
struct ggml_backend_rpc_device_context {
    std::string endpoint;        // server endpoint this device was registered with
    uint32_t    device;          // device index on that server
    std::string name;            // device name, built as "RPC" + id when the server is added
    std::string description;     // device description, set to the endpoint string when the server is added
    uint64_t    last_graph_uid;  // uid of the last graph sent in full; compared with cgraph->uid to choose recompute
};
|
||||
|
||||
struct ggml_backend_rpc_buffer_type_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
@@ -211,7 +219,6 @@ struct ggml_backend_rpc_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
std::string name;
|
||||
uint64_t last_graph_uid;
|
||||
};
|
||||
|
||||
struct ggml_backend_rpc_buffer_context {
|
||||
@@ -691,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
|
||||
|
||||
static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend);
|
||||
ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context;
|
||||
|
||||
GGML_ASSERT(cgraph->n_nodes > 0);
|
||||
bool reuse = cgraph->uid != 0 && rpc_ctx->last_graph_uid == cgraph->uid;
|
||||
bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid;
|
||||
if (reuse) {
|
||||
rpc_msg_graph_recompute_req request;
|
||||
request.device = rpc_ctx->device;
|
||||
@@ -701,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
} else {
|
||||
rpc_ctx->last_graph_uid = cgraph->uid;
|
||||
rpc_dev_ctx->last_graph_uid = cgraph->uid;
|
||||
std::vector<uint8_t> input;
|
||||
serialize_graph(rpc_ctx->device, cgraph, input);
|
||||
auto sock = get_socket(rpc_ctx->endpoint);
|
||||
@@ -770,7 +779,6 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ device,
|
||||
/* .name = */ dev_name,
|
||||
/* .last_graph_uid = */ 0,
|
||||
};
|
||||
auto reg = ggml_backend_rpc_add_server(endpoint);
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
@@ -1757,15 +1765,6 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir
|
||||
}
|
||||
}
|
||||
|
||||
// device interface
|
||||
|
||||
struct ggml_backend_rpc_device_context {
|
||||
std::string endpoint;
|
||||
uint32_t device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
|
||||
@@ -1947,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) {
|
||||
std::string dev_name = "RPC" + std::to_string(dev_id);
|
||||
std::string dev_desc = std::string(endpoint);
|
||||
ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ ind,
|
||||
/* .name = */ dev_name,
|
||||
/* .description = */ dev_desc
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .device = */ ind,
|
||||
/* .name = */ dev_name,
|
||||
/* .description = */ dev_desc,
|
||||
/* .last_graph_uid = */ 0,
|
||||
};
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
|
||||
@@ -72,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0;
|
||||
int g_ggml_sycl_disable_dnn = 0;
|
||||
int g_ggml_sycl_prioritize_dmmv = 0;
|
||||
int g_ggml_sycl_use_async_mem_op = 0;
|
||||
int g_ggml_sycl_use_async_mem_op_requested = 1;
|
||||
int g_ggml_sycl_enable_level_zero = 0;
|
||||
int g_ggml_sycl_enable_flash_attention = 1;
|
||||
|
||||
@@ -304,6 +305,8 @@ static void ggml_check_sycl() try {
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
|
||||
#endif
|
||||
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
|
||||
g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1);
|
||||
GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested);
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention);
|
||||
@@ -319,11 +322,11 @@ static void ggml_check_sycl() try {
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
||||
#endif
|
||||
*/
|
||||
// Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be
|
||||
// properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in
|
||||
// other places.
|
||||
// Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder
|
||||
// staging path while preserving queue ordering semantics. Graph support still depends on the extension being
|
||||
// available, but it no longer needs to control the non-graph fast path.
|
||||
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
||||
g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
|
||||
g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph;
|
||||
if (g_ggml_sycl_use_async_mem_op) {
|
||||
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
|
||||
if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
|
||||
|
||||
@@ -1234,6 +1234,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
|
||||
const uint32_t h = (uint32_t) src2->ne[1];
|
||||
const uint32_t n_tokens = (uint32_t) src2->ne[2];
|
||||
const uint32_t n_seqs = (uint32_t) src2->ne[3];
|
||||
const uint32_t K = (uint32_t) src5->ne[1];
|
||||
const float scale = 1.0f / sqrtf((float) s_v);
|
||||
uint32_t scale_u32;
|
||||
memcpy(&scale_u32, &scale, sizeof(scale_u32));
|
||||
@@ -1258,6 +1259,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx,
|
||||
|
||||
(uint32_t) src0->ne[1],
|
||||
(uint32_t) (src2->ne[3] / src0->ne[3]),
|
||||
K,
|
||||
scale_u32,
|
||||
};
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ struct Params {
|
||||
|
||||
neq1: u32,
|
||||
rq3: u32,
|
||||
K: u32,
|
||||
scale: f32,
|
||||
};
|
||||
|
||||
@@ -62,11 +63,14 @@ fn main(
|
||||
let iq3 = seq_id / params.rq3;
|
||||
|
||||
let state_size = S_V * S_V;
|
||||
let state_base = (seq_id * params.h + head_id) * state_size;
|
||||
let state_in_base = (seq_id * params.K * params.h + head_id) * state_size;
|
||||
let state_out_base = (seq_id * params.h + head_id) * state_size;
|
||||
let state_size_per_snap = state_size * params.h * params.n_seqs;
|
||||
let shift = i32(params.n_tokens) - i32(params.K);
|
||||
|
||||
var state: array<f32, S_V>;
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
state[i] = src_state[state_base + col * S_V + i];
|
||||
state[i] = src_state[state_in_base + col * S_V + i];
|
||||
}
|
||||
|
||||
var attn_off = (seq_id * params.n_tokens * params.h + head_id) * S_V;
|
||||
@@ -123,10 +127,22 @@ fn main(
|
||||
dst[attn_off + col] = attn_col * params.scale;
|
||||
attn_off += S_V * params.h;
|
||||
|
||||
if (params.K > 1u) {
|
||||
let target_slot = i32(t) - shift;
|
||||
if (target_slot >= 0 && target_slot < i32(params.K)) {
|
||||
let slot_base = params.s_off + u32(target_slot) * state_size_per_snap + state_out_base;
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
dst[slot_base + col * S_V + i] = state[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
}
|
||||
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
dst[params.s_off + state_base + col * S_V + i] = state[i];
|
||||
if (params.K == 1u) {
|
||||
for (var i = 0u; i < S_V; i++) {
|
||||
dst[params.s_off + state_out_base + col * S_V + i] = state[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -581,7 +581,8 @@ struct llm_graph_params {
|
||||
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
|
||||
(
|
||||
(!ubatch.token && !other.ubatch.token) ||
|
||||
(!ubatch.embd && !other.ubatch.embd)
|
||||
(!ubatch.embd && !other.ubatch.embd) ||
|
||||
(ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd)
|
||||
);
|
||||
|
||||
// when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
|
||||
|
||||
@@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
if (mem_recr->n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
if (mem_recr->n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -416,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
if (n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -72,6 +72,7 @@ public:
|
||||
|
||||
// number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups
|
||||
uint32_t n_rs_seq = 0;
|
||||
|
||||
// per-seq rollback index
|
||||
std::vector<uint32_t> rs_idx;
|
||||
|
||||
|
||||
@@ -447,13 +447,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
||||
return build_delta_net_chunking(q, k, v, g, b, s, il);
|
||||
}
|
||||
|
||||
bool llm_build_delta_net_base::keep_rs() const {
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
return cparams.n_rs_seq > 0
|
||||
&& n_seq_tokens > 1
|
||||
&& (uint32_t) n_seq_tokens <= 1 + cparams.n_rs_seq;
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * conv_states_all,
|
||||
@@ -461,12 +454,12 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
int64_t conv_kernel_size,
|
||||
int64_t conv_channels,
|
||||
int il) {
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
const auto kv_head = mctx_cur->get_head();
|
||||
const uint32_t mem_size = mctx_cur->get_size();
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
const bool keep = keep_rs();
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
|
||||
const auto kv_head = mctx_cur->get_head();
|
||||
const auto mem_size = mctx_cur->get_size();
|
||||
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
|
||||
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
||||
cb(conv_states, "conv_states", il);
|
||||
@@ -480,32 +473,52 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
|
||||
cb(conv_input, "conv_input", il);
|
||||
|
||||
if (!keep) {
|
||||
ggml_tensor * last_conv_states =
|
||||
ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1],
|
||||
conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input));
|
||||
cb(last_conv_states, "last_conv_states", il);
|
||||
const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
|
||||
|
||||
ggml_tensor * state_update_target =
|
||||
ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1],
|
||||
kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all));
|
||||
cb(state_update_target, "state_update_target", il);
|
||||
const size_t row_size = ggml_row_size(conv_states_all->type, row_count);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
|
||||
if (cparams.n_rs_seq == 0) {
|
||||
const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0];
|
||||
const int64_t s_slot = 0;
|
||||
|
||||
ggml_tensor * conv_state_last =
|
||||
ggml_view_3d(ctx0, conv_input,
|
||||
conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
ggml_row_size(conv_input->type, s_idx));
|
||||
cb(conv_state_last, "conv_state_last", il);
|
||||
|
||||
ggml_tensor * conv_state_update =
|
||||
ggml_view_2d(ctx0, conv_states_all,
|
||||
row_count, n_seqs, conv_states_all->nb[1],
|
||||
(s_slot * mem_size + kv_head) * row_size);
|
||||
cb(conv_state_update, "conv_state_update", il);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
|
||||
} else {
|
||||
const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
|
||||
const size_t row_size = row_count * ggml_element_size(conv_states_all);
|
||||
for (int64_t t = 1; t <= n_seq_tokens; ++t) {
|
||||
const uint32_t slot = (uint32_t)(n_seq_tokens - t);
|
||||
ggml_tensor * src =
|
||||
ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
t * ggml_element_size(conv_input));
|
||||
ggml_tensor * dst =
|
||||
ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs,
|
||||
conv_states_all->nb[1],
|
||||
((size_t) slot * mem_size + kv_head) * row_size);
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are
|
||||
// inside the same ubatch. currently with `split_equal()` this is not correct
|
||||
|
||||
const int64_t K = (int64_t) cparams.n_rs_seq + 1;
|
||||
|
||||
for (int64_t t = 1; t <= K; ++t) {
|
||||
const int64_t s_idx = std::max<int64_t>(0, conv_input->ne[0] - conv_states->ne[0] - K + t);
|
||||
const int64_t s_slot = K - t;
|
||||
|
||||
ggml_tensor * conv_state_last =
|
||||
ggml_view_3d(ctx0, conv_input,
|
||||
conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
ggml_row_size(conv_input->type, s_idx));
|
||||
|
||||
ggml_tensor * conv_state_update =
|
||||
ggml_view_2d(ctx0,
|
||||
conv_states_all, row_count, n_seqs,
|
||||
conv_states_all->nb[1],
|
||||
(s_slot * mem_size + kv_head) * row_size);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -531,7 +544,9 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
const int64_t n_seqs = s->ne[3];
|
||||
const int64_t n_seq_tokens = q->ne[2];
|
||||
|
||||
if (!keep_rs()) {
|
||||
const bool keep = cparams.n_rs_seq > 0;
|
||||
|
||||
if (!keep) {
|
||||
auto attn_out = build_delta_net(q, k, v, g, b, s, il);
|
||||
ggml_tensor * output = attn_out.first;
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
@@ -554,7 +569,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0);
|
||||
|
||||
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d);
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
if (n_seq_tokens > 1) {
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
} else {
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il);
|
||||
}
|
||||
|
||||
const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs;
|
||||
const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs;
|
||||
@@ -576,9 +595,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
ggml_row_size(gdn_out->type, S_v * S_v),
|
||||
ggml_row_size(gdn_out->type, S_v * S_v * H_v),
|
||||
ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap));
|
||||
|
||||
ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all,
|
||||
hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
|
||||
((size_t) cache_slot * mem_size + kv_head) * row_size);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
|
||||
}
|
||||
|
||||
|
||||
@@ -66,9 +66,6 @@ struct llm_build_delta_net_base : public llm_graph_context {
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
|
||||
// true when speculative rollback is enabled and the batch fits in the rs cache
|
||||
bool keep_rs() const;
|
||||
|
||||
// read conv state from cache, concat with qkv_mixed, write back (single slot or per-token)
|
||||
// qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs)
|
||||
ggml_tensor * build_conv_state(
|
||||
|
||||
@@ -191,10 +191,10 @@
|
||||
| `--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||
| `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) |
|
||||
| `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
|
||||
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
|
||||
@@ -183,6 +183,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||
| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
|
||||
| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) |
|
||||
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
|
||||
@@ -244,10 +245,10 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--spec-draft-override-tensor, -otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||
| `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) |
|
||||
| `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) |
|
||||
| `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) |
|
||||
| `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)<br/>(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) |
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
|
||||
@@ -467,20 +467,26 @@ struct server_slot {
|
||||
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
|
||||
|
||||
SLT_INF(*this,
|
||||
"\n"
|
||||
"prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
|
||||
" eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
|
||||
"prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second);
|
||||
|
||||
SLT_INF(*this,
|
||||
" eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
t_token_generation, n_decoded, t_gen, n_gen_second);
|
||||
|
||||
SLT_INF(*this,
|
||||
" total time = %10.2f ms / %5d tokens\n",
|
||||
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
|
||||
t_token_generation, n_decoded, t_gen, n_gen_second,
|
||||
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
||||
|
||||
SLT_INF(*this,
|
||||
" graphs reused = %10d\n",
|
||||
llama_perf_context(ctx_tgt).n_reused);
|
||||
|
||||
if (n_draft_total > 0) {
|
||||
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
||||
SLT_CNT(*this,
|
||||
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total
|
||||
);
|
||||
SLT_INF(*this,
|
||||
"draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total);
|
||||
}
|
||||
|
||||
common_speculative_print_stats(spec);
|
||||
@@ -2583,9 +2589,9 @@ private:
|
||||
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
|
||||
|
||||
// the largest pos_min required for a checkpoint to be useful
|
||||
const auto pos_min_thold = std::max(0, pos_next - n_swa);
|
||||
const auto pos_min_thold = std::max(0, pos_next - n_swa - 1);
|
||||
|
||||
if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
|
||||
if (n_past > 0 && n_past <= slot.prompt.n_tokens()) {
|
||||
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
|
||||
if (pos_min == -1) {
|
||||
SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
|
||||
|
||||
2
tools/ui/.gitignore
vendored
2
tools/ui/.gitignore
vendored
@@ -25,4 +25,4 @@ vite.config.ts.timestamp-*
|
||||
|
||||
*storybook.log
|
||||
storybook-static
|
||||
*.code-workspace
|
||||
*.code-workspace
|
||||
|
||||
@@ -20,9 +20,7 @@ export default ts.config(
|
||||
prettier,
|
||||
...svelte.configs.prettier,
|
||||
{
|
||||
languageOptions: {
|
||||
globals: { ...globals.browser, ...globals.node }
|
||||
},
|
||||
languageOptions: { globals: { ...globals.browser, ...globals.node } },
|
||||
rules: {
|
||||
// typescript-eslint strongly recommend that you do not use the no-undef lint rule on TypeScript projects.
|
||||
// see: https://typescript-eslint.io/troubleshooting/faqs/eslint/#i-get-errors-from-the-no-undef-rule-about-global-variables-not-being-defined-even-though-there-are-no-typescript-errors
|
||||
@@ -30,6 +28,7 @@ export default ts.config(
|
||||
'svelte/no-at-html-tags': 'off',
|
||||
// This app uses hash-based routing (#/) where resolve() from $app/paths does not apply
|
||||
'svelte/no-navigation-without-resolve': 'off',
|
||||
|
||||
// Enforce empty line at end of file
|
||||
'eol-last': 'error'
|
||||
}
|
||||
|
||||
54
tools/ui/package-lock.json
generated
54
tools/ui/package-lock.json
generated
@@ -2307,9 +2307,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@sveltejs/kit": {
|
||||
"version": "2.59.1",
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz",
|
||||
"integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==",
|
||||
"version": "2.60.1",
|
||||
"resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz",
|
||||
"integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
@@ -2318,7 +2318,7 @@
|
||||
"@types/cookie": "^0.6.0",
|
||||
"acorn": "^8.14.1",
|
||||
"cookie": "^0.6.0",
|
||||
"devalue": "^5.6.4",
|
||||
"devalue": "^5.8.1",
|
||||
"esm-env": "^1.2.2",
|
||||
"kleur": "^4.1.5",
|
||||
"magic-string": "^0.30.5",
|
||||
@@ -4296,9 +4296,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/devalue": {
|
||||
"version": "5.6.4",
|
||||
"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.4.tgz",
|
||||
"integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==",
|
||||
"version": "5.8.1",
|
||||
"resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz",
|
||||
"integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/devlop": {
|
||||
@@ -4856,12 +4856,12 @@
|
||||
}
|
||||
},
|
||||
"node_modules/express-rate-limit": {
|
||||
"version": "8.5.0",
|
||||
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz",
|
||||
"integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==",
|
||||
"version": "8.5.2",
|
||||
"resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz",
|
||||
"integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ip-address": "10.1.0"
|
||||
"ip-address": "^10.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 16"
|
||||
@@ -4909,9 +4909,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/fast-uri": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz",
|
||||
"integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==",
|
||||
"version": "3.1.2",
|
||||
"resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz",
|
||||
"integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
@@ -5541,9 +5541,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/hono": {
|
||||
"version": "4.12.14",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.14.tgz",
|
||||
"integrity": "sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==",
|
||||
"version": "4.12.19",
|
||||
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz",
|
||||
"integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=16.9.0"
|
||||
@@ -5722,9 +5722,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/ip-address": {
|
||||
"version": "10.1.0",
|
||||
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
|
||||
"integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
|
||||
"version": "10.2.0",
|
||||
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz",
|
||||
"integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 12"
|
||||
@@ -9245,9 +9245,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/svelte": {
|
||||
"version": "5.55.1",
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.1.tgz",
|
||||
"integrity": "sha512-QjvU7EFemf6mRzdMGlAFttMWtAAVXrax61SZYHdkD6yoVGQ89VeyKfZD4H1JrV1WLmJBxWhFch9H6ig/87VGjw==",
|
||||
"version": "5.55.7",
|
||||
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz",
|
||||
"integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@jridgewell/remapping": "^2.3.4",
|
||||
@@ -9259,7 +9259,7 @@
|
||||
"aria-query": "5.3.1",
|
||||
"axobject-query": "^4.1.0",
|
||||
"clsx": "^2.1.1",
|
||||
"devalue": "^5.6.4",
|
||||
"devalue": "^5.8.1",
|
||||
"esm-env": "^1.2.1",
|
||||
"esrap": "^2.2.4",
|
||||
"is-reference": "^3.0.3",
|
||||
@@ -10606,9 +10606,9 @@
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.18.3",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
|
||||
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
|
||||
"version": "8.20.1",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz",
|
||||
"integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
@import 'tailwindcss';
|
||||
@source ".";
|
||||
|
||||
@source '.';
|
||||
@plugin '@tailwindcss/forms';
|
||||
@plugin '@tailwindcss/typography';
|
||||
@import 'tw-animate-css';
|
||||
|
||||
@custom-variant dark (&:is(.dark *));
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
{#if videoSrc}
|
||||
<video controls class="mb-4 w-full" src={videoSrc}>
|
||||
<track kind="captions" src="" />
|
||||
Your browser does not support the video element.
|
||||
</video>
|
||||
{:else}
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
SETTINGS_KEYS
|
||||
} from '$lib/constants';
|
||||
import { ColorMode, UrlProtocol } from '$lib/enums';
|
||||
import { FileTypeText } from '$lib/enums/files';
|
||||
import { FileTypeText } from '$lib/enums/files.enums';
|
||||
import { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } from '$lib/utils';
|
||||
import '$styles/katex-custom.scss';
|
||||
import githubDarkCss from 'highlight.js/styles/github-dark.css?inline';
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
} from '$lib/constants';
|
||||
import { RouterService } from '$lib/services/router.service';
|
||||
import { setMode } from 'mode-watcher';
|
||||
import { ColorMode } from '$lib/enums/ui';
|
||||
import { ColorMode } from '$lib/enums/ui.enums';
|
||||
import { fade } from 'svelte/transition';
|
||||
import { goto } from '$app/navigation';
|
||||
import { page } from '$app/state';
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
import * as Select from '$lib/components/ui/select';
|
||||
import { Textarea } from '$lib/components/ui/textarea';
|
||||
import { SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants';
|
||||
import { SettingsFieldType } from '$lib/enums/settings';
|
||||
import { SettingsFieldType } from '$lib/enums/settings.enums';
|
||||
import { settingsStore } from '$lib/stores/settings.svelte';
|
||||
import { serverStore } from '$lib/stores/server.svelte';
|
||||
import { modelsStore, selectedModelName, propsCacheVersion } from '$lib/stores/models.svelte';
|
||||
|
||||
@@ -2,7 +2,7 @@ import { Zap, Globe, Radio } from '@lucide/svelte';
|
||||
import { MCPTransportType } from '$lib/enums';
|
||||
import type { ClientCapabilities, Implementation } from '$lib/types';
|
||||
import type { Component } from 'svelte';
|
||||
import { MimeTypeImage } from '$lib/enums/files';
|
||||
import { MimeTypeImage } from '$lib/enums/files.enums';
|
||||
|
||||
export const DEFAULT_CLIENT_VERSION = '1.0.0';
|
||||
export const MCP_CLIENT_NAME = 'llama-ui-mcp';
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { ColorMode } from '$lib/enums/ui';
|
||||
import { SettingsFieldType } from '$lib/enums/settings';
|
||||
import { ColorMode } from '$lib/enums/ui.enums';
|
||||
import { SettingsFieldType } from '$lib/enums/settings.enums';
|
||||
import { SyncableParameterType } from '$lib/enums';
|
||||
import {
|
||||
Funnel,
|
||||
|
||||
@@ -18,7 +18,7 @@ import {
|
||||
MimeTypeApplication,
|
||||
MimeTypeText
|
||||
} from '$lib/enums';
|
||||
import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files';
|
||||
import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files.enums';
|
||||
|
||||
// File type configuration using enums
|
||||
export const AUDIO_FILE_TYPES = {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { ToolSource } from '$lib/enums/tools';
|
||||
import { ToolSource } from '$lib/enums/tools.enums';
|
||||
|
||||
export const TOOL_GROUP_LABELS = {
|
||||
[ToolSource.BUILTIN]: 'Built-in',
|
||||
|
||||
@@ -4,9 +4,9 @@ export {
|
||||
AttachmentItemEnabledWhen,
|
||||
AttachmentAction,
|
||||
AttachmentItemVisibleWhen
|
||||
} from './attachment';
|
||||
} from './attachment.enums';
|
||||
|
||||
export { AgenticSectionType, ToolCallType } from './agentic';
|
||||
export { AgenticSectionType, ToolCallType } from './agentic.enums';
|
||||
|
||||
export {
|
||||
ChatMessageStatsView,
|
||||
@@ -17,7 +17,7 @@ export {
|
||||
MessageType,
|
||||
PdfViewMode,
|
||||
ReasoningFormat
|
||||
} from './chat';
|
||||
} from './chat.enums';
|
||||
|
||||
export {
|
||||
FileTypeCategory,
|
||||
@@ -38,7 +38,7 @@ export {
|
||||
MimeTypeImage,
|
||||
MimeTypeText,
|
||||
SpecialFileType
|
||||
} from './files';
|
||||
} from './files.enums';
|
||||
|
||||
export {
|
||||
MCPConnectionPhase,
|
||||
@@ -48,16 +48,16 @@ export {
|
||||
MCPContentType,
|
||||
MCPRefType,
|
||||
JsonSchemaType
|
||||
} from './mcp';
|
||||
} from './mcp.enums';
|
||||
|
||||
export { ModelModality } from './model';
|
||||
export { ModelModality } from './model.enums';
|
||||
|
||||
export { ServerRole, ServerModelStatus } from './server';
|
||||
export { ServerRole, ServerModelStatus } from './server.enums';
|
||||
|
||||
export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings';
|
||||
export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings.enums';
|
||||
|
||||
export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui';
|
||||
export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui.enums';
|
||||
|
||||
export { KeyboardKey } from './keyboard';
|
||||
export { KeyboardKey } from './keyboard.enums';
|
||||
|
||||
export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools';
|
||||
export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools.enums';
|
||||
|
||||
4
tools/ui/src/lib/types/mcp.d.ts
vendored
4
tools/ui/src/lib/types/mcp.d.ts
vendored
@@ -1,5 +1,5 @@
|
||||
import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp';
|
||||
import type { ToolSource } from '$lib/enums/tools';
|
||||
import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp.enums';
|
||||
import type { ToolSource } from '$lib/enums/tools.enums';
|
||||
import type {
|
||||
Client,
|
||||
ClientCapabilities as SDKClientCapabilities,
|
||||
|
||||
@@ -7,11 +7,13 @@
|
||||
import { untrack } from 'svelte';
|
||||
import { onMount } from 'svelte';
|
||||
import { fade } from 'svelte/transition';
|
||||
|
||||
import {
|
||||
DesktopIconStrip,
|
||||
DialogConversationTitleUpdate,
|
||||
SidebarNavigation
|
||||
} from '$lib/components/app';
|
||||
|
||||
import { conversationsStore } from '$lib/stores/conversations.svelte';
|
||||
import * as Sidebar from '$lib/components/ui/sidebar/index.js';
|
||||
import * as Tooltip from '$lib/components/ui/tooltip';
|
||||
@@ -30,26 +32,29 @@
|
||||
import { conversations } from '$lib/stores/conversations.svelte';
|
||||
|
||||
let { children } = $props();
|
||||
|
||||
let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop);
|
||||
let isMobile = new IsMobile();
|
||||
let isDesktop = $derived(!isMobile.current);
|
||||
let sidebarOpen = $state(false);
|
||||
let mounted = $state(false);
|
||||
let innerHeight = $state<number | undefined>();
|
||||
|
||||
let chatSidebar:
|
||||
| { activateSearchMode?: () => void; editActiveConversation?: () => void }
|
||||
| {
|
||||
activateSearchMode?: () => void;
|
||||
editActiveConversation?: () => void;
|
||||
}
|
||||
| undefined = $state();
|
||||
|
||||
let titleUpdateDialogOpen = $state(false);
|
||||
let titleUpdateCurrentTitle = $state('');
|
||||
let titleUpdateNewTitle = $state('');
|
||||
let titleUpdateResolve: ((value: boolean) => void) | null = null;
|
||||
|
||||
const panelNav = useSettingsNavigation();
|
||||
|
||||
function navigateToConversation(direction: -1 | 1) {
|
||||
const allConvs = conversations();
|
||||
|
||||
if (allConvs.length === 0) return;
|
||||
|
||||
const currentId = page.params.id;
|
||||
@@ -61,6 +66,7 @@
|
||||
}
|
||||
|
||||
const idx = allConvs.findIndex((c) => c.id === currentId);
|
||||
|
||||
if (idx === -1) return;
|
||||
|
||||
const targetIdx = idx + direction;
|
||||
@@ -75,9 +81,7 @@
|
||||
// Global keyboard shortcuts
|
||||
const { handleKeydown } = useKeyboardShortcuts({
|
||||
editActiveConversation: () => chatSidebar?.editActiveConversation?.(),
|
||||
|
||||
navigateToPrevConversation: () => navigateToConversation(-1),
|
||||
|
||||
navigateToNextConversation: () => navigateToConversation(1)
|
||||
});
|
||||
|
||||
@@ -139,6 +143,7 @@
|
||||
$effect(() => {
|
||||
if (alwaysShowSidebarOnDesktop && isDesktop) {
|
||||
sidebarOpen = true;
|
||||
|
||||
return;
|
||||
}
|
||||
});
|
||||
@@ -175,6 +180,7 @@
|
||||
// Only fetch router models once when we have models loaded and in router mode
|
||||
if (isRouter && modelsCount > 0 && !routerModelsFetched) {
|
||||
routerModelsFetched = true;
|
||||
|
||||
untrack(() => {
|
||||
modelsStore.fetchRouterModels();
|
||||
});
|
||||
@@ -223,7 +229,6 @@
|
||||
|
||||
<Tooltip.Provider delayDuration={TOOLTIP_DELAY_DURATION}>
|
||||
<ModeWatcher />
|
||||
|
||||
<Toaster richColors />
|
||||
|
||||
<DialogConversationTitleUpdate
|
||||
@@ -236,9 +241,9 @@
|
||||
|
||||
<Sidebar.Provider bind:open={sidebarOpen}>
|
||||
<div class="flex h-screen w-full" style:height="{innerHeight}px">
|
||||
<Sidebar.Root variant="floating" class="h-full">
|
||||
<SidebarNavigation bind:this={chatSidebar} />
|
||||
</Sidebar.Root>
|
||||
<Sidebar.Root variant="floating" class="h-full"
|
||||
><SidebarNavigation bind:this={chatSidebar} /></Sidebar.Root
|
||||
>
|
||||
|
||||
{#if !(alwaysShowSidebarOnDesktop && isDesktop) && !(panelNav.isSettingsRoute && !isDesktop)}
|
||||
{#if mounted}
|
||||
@@ -266,9 +271,9 @@
|
||||
/>
|
||||
{/if}
|
||||
|
||||
<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden">
|
||||
{@render children?.()}
|
||||
</Sidebar.Inset>
|
||||
<Sidebar.Inset class="flex flex-1 flex-col overflow-hidden"
|
||||
>{@render children?.()}</Sidebar.Inset
|
||||
>
|
||||
</div>
|
||||
</Sidebar.Provider>
|
||||
</Tooltip.Provider>
|
||||
|
||||
@@ -9,70 +9,72 @@ import { beforeEach, vi } from 'vitest';
|
||||
beforeEach(() => {
|
||||
const originalFetch = globalThis.fetch;
|
||||
|
||||
vi.spyOn(globalThis, 'fetch').mockImplementation(async (input: RequestInfo | URL, init?: RequestInit) => {
|
||||
const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url;
|
||||
vi.spyOn(globalThis, 'fetch').mockImplementation(
|
||||
async (input: RequestInfo | URL, init?: RequestInit) => {
|
||||
const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url;
|
||||
|
||||
// Mock server props endpoint
|
||||
if (url.includes('/server')) {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
mode: 'router',
|
||||
version: 'test',
|
||||
git_commit: 'test',
|
||||
git_branch: 'test'
|
||||
}),
|
||||
{ status: 200, headers: { 'Content-Type': 'application/json' } }
|
||||
);
|
||||
// Mock server props endpoint
|
||||
if (url.includes('/server')) {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
mode: 'router',
|
||||
version: 'test',
|
||||
git_commit: 'test',
|
||||
git_branch: 'test'
|
||||
}),
|
||||
{ status: 200, headers: { 'Content-Type': 'application/json' } }
|
||||
);
|
||||
}
|
||||
|
||||
// Mock models list endpoint
|
||||
if (/\/v1\/models|\/models\b/.test(url)) {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
object: 'list',
|
||||
data: [
|
||||
{
|
||||
id: 'test-model.gguf',
|
||||
object: 'model',
|
||||
owned_by: 'llamacpp',
|
||||
created: 0,
|
||||
in_cache: false,
|
||||
path: 'models/test-model.gguf',
|
||||
status: { value: 'unloaded' },
|
||||
meta: {}
|
||||
}
|
||||
],
|
||||
models: [
|
||||
{
|
||||
model: 'test-model.gguf',
|
||||
name: 'Test Model',
|
||||
details: {}
|
||||
}
|
||||
]
|
||||
}),
|
||||
{ status: 200, headers: { 'Content-Type': 'application/json' } }
|
||||
);
|
||||
}
|
||||
|
||||
// Mock /props endpoint (used for modalities)
|
||||
if (url.includes('/props')) {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
default_generation_settings: { n_ctx: 2048 }
|
||||
}),
|
||||
{ status: 200, headers: { 'Content-Type': 'application/json' } }
|
||||
);
|
||||
}
|
||||
|
||||
// Mock /tools endpoint (used for built-in tools list)
|
||||
if (url.includes('/tools')) {
|
||||
return new Response(JSON.stringify([]), {
|
||||
status: 200,
|
||||
headers: { 'Content-Type': 'application/json' }
|
||||
});
|
||||
}
|
||||
|
||||
// Default: use real fetch
|
||||
return originalFetch(input, init);
|
||||
}
|
||||
|
||||
// Mock models list endpoint
|
||||
if (/\/v1\/models|\/models\b/.test(url)) {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
object: 'list',
|
||||
data: [
|
||||
{
|
||||
id: 'test-model.gguf',
|
||||
object: 'model',
|
||||
owned_by: 'llamacpp',
|
||||
created: 0,
|
||||
in_cache: false,
|
||||
path: 'models/test-model.gguf',
|
||||
status: { value: 'unloaded' },
|
||||
meta: {}
|
||||
}
|
||||
],
|
||||
models: [
|
||||
{
|
||||
model: 'test-model.gguf',
|
||||
name: 'Test Model',
|
||||
details: {}
|
||||
}
|
||||
]
|
||||
}),
|
||||
{ status: 200, headers: { 'Content-Type': 'application/json' } }
|
||||
);
|
||||
}
|
||||
|
||||
// Mock /props endpoint (used for modalities)
|
||||
if (url.includes('/props')) {
|
||||
return new Response(
|
||||
JSON.stringify({
|
||||
default_generation_settings: { n_ctx: 2048 }
|
||||
}),
|
||||
{ status: 200, headers: { 'Content-Type': 'application/json' } }
|
||||
);
|
||||
}
|
||||
|
||||
// Mock /tools endpoint (used for built-in tools list)
|
||||
if (url.includes('/tools')) {
|
||||
return new Response(JSON.stringify([]), {
|
||||
status: 200,
|
||||
headers: { 'Content-Type': 'application/json' }
|
||||
});
|
||||
}
|
||||
|
||||
// Default: use real fetch
|
||||
return originalFetch(input, init);
|
||||
});
|
||||
);
|
||||
});
|
||||
|
||||
1
tools/ui/vitest.shims.d.ts
vendored
Normal file
1
tools/ui/vitest.shims.d.ts
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/// <reference types="@vitest/browser-playwright" />
|
||||
Reference in New Issue
Block a user