Compare commits

...

13 Commits
b9318 ... b9331

Author SHA1 Message Date
Georgi Gerganov
302e2c2652 ci : reduce PR jobs by matching backend paths (#23675)
* ci : disable SYCL f16 builds

* ci : extract android and hip into separate workflows

* ci : move webgpu to separate workflow

* ci : move the rpc to a separate workflow

* ci : extract s309x and ppcl jobs

* ci : extract opencl job into a separate workflow
2026-05-25 20:54:54 +03:00
Pascal
328874d054 model: tag ffn_latent as MUL_MAT to fix buft probe (#23664)
ffn_latent_down/up are declared GGML_OP_MUL in LLM_TENSOR_INFOS but
nemotron-h feeds them through ggml_mul_mat. The loader buft probe asks
the backend about the declared op, so it tested an elementwise MUL on a
q8_0 weight. That used to return true unconditionally and the weight
stayed on GPU by luck. Once supports_op told the truth, the probe got a
no and the loader pushed the weight and its matmul to CPU, splitting the
graph. Tagging it MUL_MAT asks the real question, the math is unchanged.

Verified on Nemotron 3 Super 120B Q5_K_M: from 64.9 back to 103.22 t/s.
2026-05-25 16:05:04 +02:00
Aman Gupta
c1f1e28d29 CUDA: add fast walsh-hadamard transform (#23615)
* CUDA: add fast walsh-hadamard transform

* review: add unrolls + change size_t -> int

* warp size 64

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2026-05-25 21:12:10 +08:00
Pascal
5a4126adc1 ui: fix stop/continue during an agentic loop (#23356) 2026-05-25 14:18:59 +02:00
Michael Wand
a4d2d4ae41 convert : add compressed-tensors NVFP4 support (#21095)
* Refactored Compressed Tensors NVFP4 support for new base.py

* Support compressed-tensors NVFP4 conversion

* Moved Qwen MTP remap into filter_tensors

* simplify

* pathlib no longer used

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-05-25 14:16:11 +02:00
Georgi Gerganov
d161ea7071 sync : ggml 2026-05-25 12:43:27 +03:00
Georgi Gerganov
45158f460e ggml : bump version to 0.13.0 (ggml/1510) 2026-05-25 12:43:27 +03:00
Georgi Gerganov
22307b3e8b sync : ggml 2026-05-25 12:38:01 +03:00
Georgi Gerganov
ce5890b5f7 ggml : bump version to 0.12.1 (ggml/1508) 2026-05-25 12:38:01 +03:00
Ori Pekelman
b251f74f49 ggml.h: correct ggml_silu_back arg docstring (a=dy, b=x) (ggml/1500) 2026-05-25 12:38:01 +03:00
Dev-X25874
fa97041524 ggml-alloc: fix out-of-bounds read in ggml_dyn_tallocr_remove_block (ggml/1492) 2026-05-25 12:38:01 +03:00
Johannes Gäßler
ae251b5ff2 TP: fix ggml context size calculation (#22616)
* TP: fix ggml context size calculation, memory leak

* move split state cache back into the context

* revert to constant ggml context size for cgraphs

* increase headroom for statically allocated tensors

* remove obsolete include
2026-05-25 12:37:25 +03:00
Gilad S.
66efd13375 ggml: gguf_init_from_callback and gguf_init_from_buffer (#22341)
* ggml: implement `gguf_init_from_buffer`

* test: `gguf_init_from_buffer`

* fix: memory breakdown for a model loaded with `no_alloc` from a file is consistent with being loaded from a buffer

* fix: use `GGML_UNUSED`

Co-authored-by: Copilot <copilot@github.com>

* fix: remove `total_size` from `gguf_reader`

* fix: file offset calculation, rename `offset` to `data_offset`

Co-authored-by: Copilot <copilot@github.com>

* refactor: extract model loader bug fixes to another PR

* feat: add `gguf_init_from_callback`

* fix: always require a max expected size

* fix: change `gguf_reader_callback_t`'s `output` type to `void *`, change `max_expected_size` and offsets to `uint64_t`

* fix: harden against offset overflow in buffer read

* fix: remove seek behavior from the callback

* feat: `max_chunk_read == 0` means `SIZE_MAX`

* fix: seeking in a gguf file with no tensors

---------

Co-authored-by: Copilot <copilot@github.com>
2026-05-25 11:33:29 +02:00
30 changed files with 1676 additions and 626 deletions

View File

@@ -91,3 +91,53 @@ jobs:
with:
name: llama-cpp-android-arm64-cpu
path: pkg-adb/llama.cpp
android-arm64:
runs-on: ubuntu-latest
env:
NDK_VERSION: "29.0.14206865"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: android-arm64
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Set up JDK
uses: actions/setup-java@v5
with:
java-version: 17
distribution: temurin
- name: Setup Android SDK
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
with:
log-accepted-android-sdk-licenses: false
- name: Install NDK
run: |
sdkmanager "ndk;${{ env.NDK_VERSION }}"
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
- name: Build
id: cmake_build
run: |
cmake -B build \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)

167
.github/workflows/build-hip.yml vendored Normal file
View File

@@ -0,0 +1,167 @@
name: CI (hip)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-hip.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-hip.yml',
'ggml/src/ggml-cuda/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-22-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.1.2
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-hip
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGPU_TARGETS="gfx1030" \
-DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
windows-latest-hip:
runs-on: windows-2022
env:
# Make sure this is in sync with build-cache.yml
HIPSDK_INSTALLER_VERSION: "26.Q1"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
- name: Use ROCm Installation Cache
uses: actions/cache@v5
id: cache-rocm
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: Setup ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-rocm
with:
version: ${{ env.HIPSDK_INSTALLER_VERSION }}
- name: Verify ROCm
id: verify
run: |
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ${{ github.job }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGPU_TARGETS="gfx1100" `
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
ubuntu-22-musa:
runs-on: ubuntu-22.04
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
apt-get update
apt-get install -y build-essential git cmake libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-musa
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)

150
.github/workflows/build-ibm.yml vendored Normal file
View File

@@ -0,0 +1,150 @@
name: CI (ibm)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-ibm.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-ibm.yml',
'ggml/src/ggml-cpu/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-24-s390x:
runs-on: ubuntu-24.04-s390x
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Build Dependencies
id: build_depends
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev python3-wheel \
libjpeg-dev build-essential libssl-dev \
git-lfs
- name: Toolchain workaround (GCC 14)
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Python Dependencies
id: python_depends
run: |
export PIP_BREAK_SYSTEM_PACKAGES="1"
python3 -m pip install --upgrade pip setuptools
pip3 install ./gguf-py
- name: Swap Endianness
id: endianness
run: |
for f in models/*.gguf; do
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
done
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
- name: Test llama2c (s390x)
id: llama2c_test_s390x
run: |
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-24-ppc64le:
runs-on: ubuntu-24.04-ppc64le
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Build Dependencies
id: build_depends
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev python3-wheel \
libjpeg-dev build-essential libssl-dev \
git-lfs
- name: Toolchain workaround (GCC 14)
run: |
sudo apt-get install -y gcc-14 g++-14
echo "CC=gcc-14" >> "$GITHUB_ENV"
echo "CXX=g++-14" >> "$GITHUB_ENV"
- name: Python Dependencies
id: python_depends
run: |
export PIP_BREAK_SYSTEM_PACKAGES="1"
python3 -m pip install --upgrade pip setuptools
pip3 install ./gguf-py
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

83
.github/workflows/build-opencl.yml vendored Normal file
View File

@@ -0,0 +1,83 @@
name: CI (opencl)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-opencl.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cl'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-opencl.yml',
'ggml/src/ggml-opencl/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
windows-latest-opencl-adreno:
runs-on: windows-2025
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: windows-latest-llvm-arm64-opencl-adreno
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
cmake -B build `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
cmake -B build-arm64-release `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: Build
id: cmake_build
run: |
cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}

View File

@@ -34,6 +34,76 @@ env:
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-cpu-riscv64-native:
runs-on: ubuntu-24.04-riscv
steps:
- name: Install dependencies
run: |
# Install necessary packages
sudo apt-get update
sudo apt-get install -y libssl-dev
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
git lfs install
- name: Check environment
run: |
uname -a
gcc --version
g++ --version
ldd --version
cmake --version
rustc --version
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
with:
key: ubuntu-cpu-riscv64-native
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=ON \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DGGML_RPC=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-riscv64-native-sanitizer:
runs-on: ubuntu-24.04-riscv

67
.github/workflows/build-rpc.yml vendored Normal file
View File

@@ -0,0 +1,67 @@
name: CI (rpc)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-rpc.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-rpc.yml',
'ggml/src/ggml-rpc/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
ubuntu-latest-rpc:
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev ninja-build
- name: Build
id: cmake_build
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose

View File

@@ -38,12 +38,10 @@ jobs:
ubuntu-24-sycl:
strategy:
matrix:
build: [fp32, fp16]
build: [fp32]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04

186
.github/workflows/build-webgpu.yml vendored Normal file
View File

@@ -0,0 +1,186 @@
name: CI (webgpu)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-webgpu.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.wgsl'
]
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-webgpu.yml',
'ggml/src/ggml-webgpu/**'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
LLAMA_LOG_COLORS: 1
LLAMA_LOG_PREFIX: 1
LLAMA_LOG_TIMESTAMPS: 1
jobs:
macOS-latest-arm64-webgpu:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: macOS-latest-arm64-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export CMAKE_PREFIX_PATH=dawn
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-24-webgpu:
runs-on: ubuntu-24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
run: |
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
- name: Use Vulkan SDK Cache
uses: actions/cache@v5
id: cache-sdk
with:
path: ./vulkan_sdk
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
- name: Setup Vulkan SDK
if: steps.cache-sdk.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-vulkan
with:
path: ./vulkan_sdk
version: ${{ env.VULKAN_SDK_VERSION }}
- name: Dawn Dependency
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
-DGGML_WEBGPU=ON
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
# This is using llvmpipe and runs slower than other backends
# test-backend-ops is too slow on llvmpipe, skip it
ctest -L main -E test-backend-ops --verbose --timeout 900
ubuntu-24-webgpu-wasm:
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Install Emscripten
run: |
git clone https://github.com/emscripten-core/emsdk.git
cd emsdk
./emsdk install latest
./emsdk activate latest
- name: Fetch emdawnwebgpu
run: |
DAWN_TAG="v20260317.182325"
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
echo "Downloading ${EMDAWN_PKG}"
curl -L -o emdawn.zip \
"https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
unzip emdawn.zip
- name: Build WASM WebGPU
run: |
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_WEBGPU=ON \
-DLLAMA_OPENSSL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)

View File

@@ -132,47 +132,6 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
macOS-latest-arm64-webgpu:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: macOS-latest-arm64-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export CMAKE_PREFIX_PATH=dawn
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-cpu:
strategy:
matrix:
@@ -181,10 +140,6 @@ jobs:
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-24.04-arm
- build: 's390x'
os: ubuntu-24.04-s390x
- build: 'ppc64le'
os: ubuntu-24.04-ppc64le
runs-on: ${{ matrix.os }}
@@ -194,7 +149,6 @@ jobs:
uses: actions/checkout@v6
- name: ccache
if: ${{ matrix.build != 's390x' && matrix.build != 'ppc64le' }}
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-cpu-${{ matrix.build }}
@@ -224,14 +178,6 @@ jobs:
python3 -m pip install --upgrade pip setuptools
pip3 install ./gguf-py
- name: Swap Endianness
id: endianness
if: ${{ matrix.build == 's390x' }}
run: |
for f in models/*.gguf; do
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
done
- name: Build
id: cmake_build
run: |
@@ -248,7 +194,6 @@ jobs:
- name: Test llama2c conversion
id: llama2c_test
if: ${{ matrix.build != 's390x' }}
run: |
cd build
echo "Fetch tokenizer"
@@ -258,96 +203,6 @@ jobs:
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Test llama2c (s390x)
id: llama2c_test_s390x
if: ${{ matrix.build == 's390x' }}
run: |
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
android-arm64:
runs-on: ubuntu-latest
env:
NDK_VERSION: "29.0.14206865"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: android-arm64
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Set up JDK
uses: actions/setup-java@v5
with:
java-version: 17
distribution: temurin
- name: Setup Android SDK
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
with:
log-accepted-android-sdk-licenses: false
- name: Install NDK
run: |
sdkmanager "ndk;${{ env.NDK_VERSION }}"
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
- name: Build
id: cmake_build
run: |
cmake -B build \
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_BACKEND_DL=ON \
-DGGML_NATIVE=OFF \
-DGGML_CPU_ALL_VARIANTS=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
ubuntu-latest-rpc:
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev ninja-build
- name: Build
id: cmake_build
run: |
cmake -B build \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_RPC=ON
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose
ubuntu-24-vulkan:
strategy:
matrix:
@@ -387,176 +242,6 @@ jobs:
run: |
time cmake --build build -j $(nproc)
ubuntu-24-webgpu:
runs-on: ubuntu-24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-24-webgpu
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Dependencies
id: depends
run: |
sudo add-apt-repository -y ppa:kisak/kisak-mesa
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
- name: Get latest Vulkan SDK version
id: vulkan_sdk_version
run: |
echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
- name: Use Vulkan SDK Cache
uses: actions/cache@v5
id: cache-sdk
with:
path: ./vulkan_sdk
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
- name: Setup Vulkan SDK
if: steps.cache-sdk.outputs.cache-hit != 'true'
uses: ./.github/actions/linux-setup-vulkan
with:
path: ./vulkan_sdk
version: ${{ env.VULKAN_SDK_VERSION }}
- name: Dawn Dependency
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
DAWN_VERSION="v20260317.182325"
DAWN_OWNER="google"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
curl -L -o artifact.tar.gz \
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
mkdir dawn
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
-DGGML_WEBGPU=ON
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
# This is using llvmpipe and runs slower than other backends
# test-backend-ops is too slow on llvmpipe, skip it
ctest -L main -E test-backend-ops --verbose --timeout 900
ubuntu-24-webgpu-wasm:
runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Install Emscripten
run: |
git clone https://github.com/emscripten-core/emsdk.git
cd emsdk
./emsdk install latest
./emsdk activate latest
- name: Fetch emdawnwebgpu
run: |
DAWN_TAG="v20260317.182325"
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
echo "Downloading ${EMDAWN_PKG}"
curl -L -o emdawn.zip \
"https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
unzip emdawn.zip
- name: Build WASM WebGPU
run: |
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-G "Ninja" \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_WEBGPU=ON \
-DLLAMA_OPENSSL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
ubuntu-22-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.1.2
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-hip
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGPU_TARGETS="gfx1030" \
-DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
ubuntu-22-musa:
runs-on: ubuntu-22.04
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Dependencies
id: depends
run: |
apt-get update
apt-get install -y build-essential git cmake libssl-dev
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ubuntu-22-musa
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
time cmake --build build --config Release -j $(nproc)
windows-latest:
runs-on: windows-2025
@@ -580,9 +265,6 @@ jobs:
- build: 'llvm-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
- build: 'llvm-arm64-opencl-adreno'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
steps:
- name: Clone
@@ -624,26 +306,6 @@ jobs:
run: |
choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
cmake -B build `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
cmake -B build-arm64-release `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: Build
id: cmake_build
run: |
@@ -764,145 +426,6 @@ jobs:
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
windows-latest-hip:
runs-on: windows-2022
env:
# Make sure this is in sync with build-cache.yml
HIPSDK_INSTALLER_VERSION: "26.Q1"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Grab rocWMMA package
id: grab_rocwmma
run: |
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
7z x rocwmma.deb
7z x data.tar
- name: Use ROCm Installation Cache
uses: actions/cache@v5
id: cache-rocm
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: Setup ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-rocm
with:
version: ${{ env.HIPSDK_INSTALLER_VERSION }}
- name: Verify ROCm
id: verify
run: |
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: ${{ github.job }}
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . `
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGPU_TARGETS="gfx1100" `
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
ubuntu-cpu-riscv64-native:
runs-on: ubuntu-24.04-riscv
steps:
- name: Install dependencies
run: |
# Install necessary packages
sudo apt-get update
sudo apt-get install -y libssl-dev
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
git lfs install
- name: Check environment
run: |
uname -a
gcc --version
g++ --version
ldd --version
cmake --version
rustc --version
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
with:
key: ubuntu-cpu-riscv64-native
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Build
id: cmake_build
run: |
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=ON \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DGGML_RPC=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
time cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
# TODO: simplify the following workflows using a matrix
# TODO: run lighter CI on PRs and the full CI only on master (if needed)
ggml-ci-x64-cpu-low-perf:

View File

@@ -772,12 +772,10 @@ jobs:
strategy:
matrix:
build: [fp32, fp16]
build: [fp32]
include:
- build: fp32
fp16: OFF
- build: fp16
fp16: ON
runs-on: ubuntu-24.04
@@ -1363,7 +1361,6 @@ jobs:
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
**Android:**
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)

View File

@@ -467,7 +467,14 @@ class ModelBase:
elif quant_method == "compressed-tensors":
quant_format = quant_config["format"]
groups = quant_config["config_groups"]
if len(groups) > 1:
nvfp4_compressed_tensors = (
quant_format == "nvfp4-pack-quantized"
or quant_format == "mixed-precision"
and bool(groups)
and all(g.get("format") == "nvfp4-pack-quantized" for g in groups.values() if isinstance(g, dict))
)
if len(groups) > 1 and not nvfp4_compressed_tensors:
raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
weight_config = tuple(groups.values())[0]["weights"]
@@ -505,6 +512,9 @@ class ModelBase:
tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
if (base_name + "_zero_point") in self.model_tensors:
tensors_to_remove.append(base_name + "_zero_point")
elif nvfp4_compressed_tensors:
# Don't error from compressed-tensors, we'll handle them in _generate_nvfp4_tensors
pass
else:
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
elif quant_method == "modelopt":
@@ -746,10 +756,13 @@ class ModelBase:
del experts, merged
def prepare_tensors(self):
# detect NVFP4 quantization (ModelOpt format)
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
# detect NVFP4 quantization (ModelOpt and Compressed-tensors formats)
quantization_config = self.hparams.get("quantization_config") or {}
quant_algo = quantization_config.get("quant_algo")
quant_method = quantization_config.get("quant_method")
quant_format = quantization_config.get("format")
quant_groups = quantization_config.get("config_groups") or {}
quant_layers = quantization_config.get("quantized_layers") or {}
quant_config_file = self.dir_model / "hf_quant_config.json"
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
@@ -760,13 +773,25 @@ class ModelBase:
producer_name = (producer.get("name") or "").lower()
if quant_method is None:
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
quant_method = producer_name
quant_algo = quant_config.get("quant_algo", quant_algo)
quant_method = quant_config.get("quant_method", quant_method)
quant_format = quant_config.get("format", quant_format)
quant_groups = quant_config.get("config_groups", quant_groups) or {}
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
# Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
# per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
nvfp4_compressed_tensors = quant_method == "compressed-tensors" and (
quant_format == "nvfp4-pack-quantized"
or quant_format == "mixed-precision"
and bool(quant_groups)
and all(g.get("format") == "nvfp4-pack-quantized" for g in quant_groups.values() if isinstance(g, dict))
)
if quant_algo != "NVFP4":
if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
if nvfp4_compressed_tensors:
quant_algo = "NVFP4"
elif any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
quant_algo = "NVFP4"
self._is_nvfp4 = quant_algo == "NVFP4"
@@ -776,6 +801,28 @@ class ModelBase:
# This must run before dequant_model so NVFP4 tensors are removed
# from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
if self._is_nvfp4:
if nvfp4_compressed_tensors:
# Convert compressed-tensors 'global' scales into the reciprocal
def inverse_scale(gen):
def load():
scale = LazyTorchTensor.to_eager(gen()).float()
return 1.0 / scale
return load
# Change the compressed-tensors names to the ModelOpt names for handling consistently later
for name in list(self.model_tensors.keys()):
if name.endswith(".weight_packed"):
weight_name = name.removesuffix("_packed")
if weight_name not in self.model_tensors:
self.model_tensors[weight_name] = self.model_tensors.pop(name)
elif name.endswith(".weight_global_scale"):
scale2_name = name.replace(".weight_global_scale", ".weight_scale_2")
if scale2_name not in self.model_tensors:
self.model_tensors[scale2_name] = inverse_scale(self.model_tensors.pop(name))
elif name.endswith(".input_global_scale"):
input_scale_name = name.replace(".input_global_scale", ".input_scale")
if input_scale_name not in self.model_tensors:
self.model_tensors[input_scale_name] = inverse_scale(self.model_tensors.pop(name))
self._generate_nvfp4_tensors()
self.dequant_model()

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
from pathlib import Path
from typing import Any, Callable, Iterable, TYPE_CHECKING
import torch
@@ -549,6 +548,7 @@ class _Qwen35MtpMixin:
tensor_map: gguf.TensorNameMap
no_mtp: bool
mtp_only: bool
_original_block_count: int | None = None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -557,22 +557,44 @@ class _Qwen35MtpMixin:
self.block_count += self.hparams.get("mtp_num_hidden_layers", 0)
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
hparams = {**self.hparams, **self.hparams.get("text_config", {})}
key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
type(self)._original_block_count = hparams.get(key)
return super().index_tensors(remote_hf_model_id=remote_hf_model_id) # ty: ignore[unresolved-attribute]
@classmethod
def filter_tensors(cls, item):
name, _ = item
assert cls._original_block_count is not None
# TODO: change TextModel to super()
if (titem := TextModel.filter_tensors(item)) is None:
return None
name, gen = titem
if name.startswith("model.mtp."):
name = name.replace("model.", "", 1)
if name.startswith("mtp."):
if cls.no_mtp:
return None
return item
if cls.mtp_only:
canonical = name.replace("language_model.", "")
keep = canonical in (
remapper = {
"fc": "eh_proj",
"pre_fc_norm_embedding": "enorm",
"pre_fc_norm_hidden": "hnorm",
"norm": "shared_head.norm",
}
parts = name.split(".", 3)
if len(parts) == 4 and parts[1] == "layers" and parts[2].isdecimal():
mtp_idx = int(parts[2])
name = f"model.layers.{cls._original_block_count + mtp_idx}.{parts[3]}"
elif len(parts) == 3 and parts[1] in remapper:
name = f"model.layers.{cls._original_block_count}.{remapper[parts[1]]}.{parts[2]}"
elif cls.mtp_only:
keep = name in (
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
"embed_tokens.weight", "norm.weight",
)
if not keep:
return None
return super().filter_tensors(item) # ty: ignore[unresolved-attribute]
return name, gen
def set_gguf_parameters(self):
super().set_gguf_parameters() # ty: ignore[unresolved-attribute]
@@ -594,29 +616,6 @@ class _Qwen35MtpMixin:
self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("mtp."):
n_layer = self.hparams["num_hidden_layers"]
if name.find("layers.") != -1:
assert bid is not None
name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
bid = bid + n_layer
else:
remapper = {
"mtp.fc": "model.layers.{bid}.eh_proj",
"mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
"mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm",
"mtp.norm": "model.layers.{bid}.shared_head.norm",
}
stem = Path(name).stem
suffix = Path(name).suffix
tmpl = remapper[stem] + suffix
for b in range(n_layer, self.block_count):
yield from super().modify_tensors(data_torch, tmpl.format(bid=b), b) # ty: ignore[unresolved-attribute]
return
yield from super().modify_tensors(data_torch, name, bid) # ty: ignore[unresolved-attribute]
@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):

View File

@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 12)
set(GGML_VERSION_MINOR 13)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

View File

@@ -1189,8 +1189,8 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// a - x
// b - dy
// a - dy
// b - x
GGML_API struct ggml_tensor * ggml_silu_back(
struct ggml_context * ctx,
struct ggml_tensor * a,

View File

@@ -76,10 +76,16 @@ extern "C" {
struct ggml_context ** ctx;
};
// callback to simulate or wrap a FILE pointer - read up to `len` bytes at `offset` into `output` and return the number of bytes read
typedef size_t (*gguf_reader_callback_t)(void * userdata, void * output, uint64_t offset, size_t len);
GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params);
// max_chunk_read is the maximum number of bytes that the GGUF code will read at once from the callback, a value of 0 means no limit
GGML_API struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params);
GGML_API void gguf_free(struct gguf_context * ctx);
@@ -87,7 +93,7 @@ extern "C" {
GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx);
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); // padded to gguf_get_alignment if and only if the gguf_context contains at least one tensor
GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx);
GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found

View File

@@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
// shift all elements after idx by 1 to the left, overwriting the element at idx
for (int i = idx; i < chunk->n_free_blocks; i++) {
for (int i = idx; i < chunk->n_free_blocks - 1; i++) {
chunk->free_blocks[i] = chunk->free_blocks[i+1];
}
chunk->n_free_blocks--;

View File

@@ -13,6 +13,7 @@
#include <cstring>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <utility>
@@ -392,64 +393,100 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
// meta backend buffer
//
// Container to hold the tensor slices per simple ggml backend buffer.
struct ggml_backend_meta_simple_tensor_container {
std::vector<ggml_context_ptr> ctxs;
std::map<const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) {
ctxs.reserve(n_simple);
for (int i = 0; i < n_simple; i++) {
ctxs.emplace_back(ggml_init(params));
}
}
ggml_backend_meta_simple_tensor_container() {}
};
struct ggml_backend_meta_buffer_context {
// FIXME
// Most tensors can simply be stored statically in their own buffer.
// Externally created views however also need a mapping to simple tensors but they use the buffer of the view source.
// If external views are simply using that buffer they will slowly deplete its memory.
// Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp.
// Long-term: tie the lifetime of external views to the meta backend executing the graph instead,
// currently not possible due to graph-external operations in the backend scheduler.
ggml_backend_meta_simple_tensor_container stc_static;
ggml_backend_meta_simple_tensor_container stc_compute[2];
int stc_compute_index = 0;
int stc_compute_index_next = 0;
std::vector<ggml_backend_buffer_ptr> bufs;
// FIXME
// The size of the split state cache is unbounded and can theoretically grow infinitely large.
// However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive.
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
std::map< const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
struct buffer_config {
ggml_context * ctx;
ggml_backend_buffer_t buf;
buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {}
};
std::vector<buffer_config> buf_configs;
int debug;
ggml_backend_meta_buffer_context() {
ggml_backend_meta_buffer_context(
ggml_backend_meta_simple_tensor_container & stc_static,
ggml_backend_meta_simple_tensor_container & stc_compute_0,
ggml_backend_meta_simple_tensor_container & stc_compute_1,
const std::vector<ggml_backend_buffer_t> & bufs)
: stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} {
this->bufs.reserve(bufs.size());
for (ggml_backend_buffer_t buf : bufs) {
this->bufs.emplace_back(buf);
}
const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG");
debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0;
}
ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) {
if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) {
return stc_static;
}
return stc_compute[stc_compute_index];
}
};
static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) {
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
for (auto & [ctx, buf] : buf_ctx->buf_configs) {
ggml_backend_buffer_free(buf);
ggml_free(ctx);
}
delete buf_ctx;
}
static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) {
GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
return buf_ctx->buf_configs.size();
return buf_ctx->bufs.size();
}
static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) {
GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
GGML_ASSERT(index < buf_ctx->buf_configs.size());
return buf_ctx->buf_configs[index].buf;
GGML_ASSERT(index < buf_ctx->bufs.size());
return buf_ctx->bufs[index].get();
}
static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) {
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
GGML_ASSERT(index < buf_ctx->buf_configs.size());
GGML_ASSERT(index < buf_ctx->bufs.size());
auto it = buf_ctx->simple_tensors.find(tensor);
if (it == buf_ctx->simple_tensors.end()) {
ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor);
auto it = stc.simple_tensors.find(tensor);
if (it == stc.simple_tensors.end()) {
return nullptr;
}
return it->second[index];
}
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
@@ -785,7 +822,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
continue;
}
src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
}
@@ -1079,17 +1116,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
return ret;
}
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync);
}
static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
GGML_UNUSED(buffer);
return (void *) 0x1000000000000000; // FIXME
}
static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) {
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true);
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true);
GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
GGML_ASSERT(split_state.n_segments <= 16);
@@ -1104,8 +1147,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
std::vector<ggml_tensor *> simple_tensors;
simple_tensors.reserve(n_simple_bufs);
for (size_t j = 0; j < n_simple_bufs; j++) {
ggml_context * simple_ctx = buf_ctx->buf_configs[j].ctx;
ggml_backend_buffer_t simple_buf = buf_ctx->buf_configs[j].buf;
ggml_context * simple_ctx = stc.ctxs[j].get();
ggml_backend_buffer_t simple_buf = buf_ctx->bufs[j].get();
if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
// TODO: the following assert fails for llama-parallel even though the results are correct:
@@ -1158,7 +1201,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
} else if (simple_buf != nullptr) {
t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
+ size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
+ size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer));
}
t_ij->extra = tensor->extra;
for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -1194,11 +1237,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
}
}
buf_ctx->simple_tensors[tensor] = simple_tensors;
stc.simple_tensors[tensor] = simple_tensors;
return GGML_STATUS_SUCCESS;
}
static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next;
return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor);
}
static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
GGML_ASSERT(ggml_is_contiguous(tensor));
@@ -1413,8 +1463,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
}
static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) {
const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer);
for (size_t i = 0; i < n_buffers; i++) {
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
for (size_t i = 0; i < buf_ctx->bufs.size(); i++) {
ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i));
}
}
@@ -1440,21 +1491,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) {
static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
ggml_init_params params = {
/*.mem_size =*/ 1024*1024*1024, // FIXME
const ggml_init_params params = {
/*.mem_size =*/ 1024*1024*ggml_tensor_overhead(), // FIXME
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
ggml_backend_meta_simple_tensor_container stc_static;
ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts);
ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts);
ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context();
size_t max_size = 0;
buf_ctx->buf_configs.reserve(n_simple_bufts);
std::vector<ggml_backend_buffer_t> bufs;
bufs.reserve(n_simple_bufts);
for (size_t i = 0; i < n_simple_bufts; i++) {
ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
GGML_ASSERT(simple_buf != nullptr);
max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size));
GGML_ASSERT(bufs.back() != nullptr);
max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back()));
}
ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size);
}
@@ -1462,26 +1516,32 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
ggml_init_params params = {
/*.mem_size =*/ 1024*1024*1024, // FIXME
constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals.
const ggml_init_params params_static = {
/*.mem_size =*/ ggml_get_mem_size(ctx),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
const ggml_init_params params_compute = {
/*.mem_size =*/ compute_headroom*ggml_get_mem_size(ctx),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,
};
ggml_backend_meta_simple_tensor_container stc_static (params_static, n_simple_bufts);
ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts);
ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts);
ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context();
meta_buf_ctx->buf_configs.reserve(n_simple_bufts);
for (size_t i = 0; i < n_simple_bufts; i++) {
meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr);
}
std::vector<ggml_backend_buffer_t> bufs(n_simple_bufts, nullptr);
ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0);
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
t->buffer = meta_buf;
ggml_backend_meta_buffer_init_tensor(meta_buf, t);
ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t);
t->data = (void *) 0x2000000000000000; // FIXME
}
for (size_t i = 0; i < n_simple_bufts; i++) {
ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx;
ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get();
ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
// If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
@@ -1494,15 +1554,15 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc
}
}
if (any_nonzero_slice) {
meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft);
meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft));
} else {
meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0);
meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0));
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
t->buffer = meta_buf_ctx->buf_configs[i].buf;
t->buffer = meta_buf_ctx->bufs[i].get();
}
}
GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr);
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
GGML_ASSERT(meta_buf_ctx->bufs[i]);
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get()));
}
return meta_buf;
}
@@ -1724,6 +1784,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
}
if (needs_rebuild) {
std::set<ggml_backend_buffer_t> used_buffers;
for (int i = 0; i < cgraph->n_leafs; i++) {
if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) {
used_buffers.emplace(cgraph->leafs[i]->buffer);
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) {
used_buffers.emplace(cgraph->nodes[i]->buffer);
}
}
for (ggml_backend_buffer_t buf : used_buffers) {
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context;
buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1;
ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next];
for (ggml_context_ptr & ctx : stc.ctxs) {
ggml_reset(ctx.get());
}
stc.simple_tensors.clear();
}
size_t n_subgraphs = 0;
size_t max_tmp_size = 0;
@@ -1909,7 +1989,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
ggml_init_params params = {
const ggml_init_params params = {
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ true,

108
ggml/src/ggml-cuda/fwht.cu Normal file
View File

@@ -0,0 +1,108 @@
#include "common.cuh"
#include "fwht.cuh"
template <int N>
__launch_bounds__(4*ggml_cuda_get_physical_warp_size(), 1)
__global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows, const float scale) {
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
const int64_t r = (int64_t) blockIdx.x * blockDim.y + threadIdx.y;
if (r >= n_rows) {
return;
}
src += r * N;
dst += r * N;
static constexpr int el_w = N / warp_size;
float reg[el_w];
const int lane = threadIdx.x;
#pragma unroll
for (int i = 0; i < el_w; ++i) {
reg[i] = src[i * warp_size + lane] * scale;
}
#pragma unroll
for (int h = 1; h < warp_size; h *= 2) {
#pragma unroll
for (int j = 0; j < el_w; j++) {
const float val = reg[j];
const float val2 = __shfl_xor_sync(0xFFFFFFFF, val, h, warp_size);
reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
}
}
#pragma unroll
for (int h = warp_size; h < N; h *= 2) {
const int step = h / warp_size;
#pragma unroll
for (int j = 0; j < el_w; j += 2 * step) {
#pragma unroll
for (int k = 0; k < step; k++) {
const float x = reg[j + k];
const float y = reg[j + k + step];
reg[j + k] = x + y;
reg[j + k + step] = x - y;
}
}
}
#pragma unroll
for (int i = 0; i < el_w; ++i) {
dst[i * warp_size + lane] = reg[i];
}
}
void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_shape(src, dst));
GGML_ASSERT(ggml_is_contiguous(src));
GGML_ASSERT(ggml_is_contiguous(dst));
const int n = src->ne[0];
const int64_t rows = ggml_nrows(src);
const float * src_d = (const float *) src->data;
float * dst_d = (float *) dst->data;
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
GGML_ASSERT(n % warp_size == 0);
const int rows_per_block = 4;
const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block;
cudaStream_t stream = ctx.stream();
dim3 grid_dims(num_blocks, 1, 1);
dim3 block_dims(warp_size, rows_per_block, 1);
const ggml_cuda_kernel_launch_params launch_params =
ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
const float scale = 1 / sqrtf(n);
switch (n) {
case 64:
{
ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
break;
}
case 128:
{
ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
break;
}
case 256:
{
ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
break;
}
case 512:
{
ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
break;
}
default:
GGML_ABORT("fatal error");
}
}

View File

@@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);

View File

@@ -24,6 +24,7 @@
#include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/diag.cuh"
#include "ggml-cuda/fattn.cuh"
#include "ggml-cuda/fwht.cuh"
#include "ggml-cuda/getrows.cuh"
#include "ggml-cuda/im2col.cuh"
#include "ggml-cuda/mmf.cuh"
@@ -2594,6 +2595,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
const int32_t hint = ggml_get_op_params_i32(dst, 1);
if (hint == GGML_HINT_SRC0_IS_HADAMARD) {
GGML_ASSERT(!split);
ggml_cuda_op_fwht(ctx, src1, dst);
return;
}
if (!split && use_mul_mat_vec_f) {
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)

View File

@@ -228,9 +228,18 @@ struct gguf_context {
};
struct gguf_reader {
gguf_reader(FILE * file) : file(file) {
// read the remaining bytes once and update on each read
nbytes_remain = file_remain(file);
gguf_reader(
gguf_reader_callback_t callback,
void * userdata,
size_t max_chunk_read,
uint64_t data_offset = 0,
uint64_t nbytes_remain = 0)
: callback(callback),
userdata(userdata),
max_chunk_read(max_chunk_read),
data_offset(data_offset),
nbytes_remain(nbytes_remain) {
GGML_ASSERT(max_chunk_read > 0);
}
// helper for remaining bytes in a file
@@ -257,12 +266,10 @@ struct gguf_reader {
template <typename T>
bool read(T & dst) const {
const size_t size = sizeof(dst);
if (nbytes_remain < size) {
if (size > nbytes_remain) {
return false;
}
const size_t nread = fread(&dst, 1, size, file);
nbytes_remain -= nread;
return nread == size;
return read_raw(&dst, size) == size;
}
template <typename T>
@@ -344,24 +351,71 @@ struct gguf_reader {
return false;
}
dst.resize(static_cast<size_t>(size));
const size_t nread = fread(dst.data(), 1, size, file);
nbytes_remain -= nread;
return nread == size;
return read_raw(dst.data(), static_cast<size_t>(size)) == size;
}
bool read(void * dst, const size_t size) const {
if (size > nbytes_remain) {
return false;
}
const size_t nread = fread(dst, 1, size, file);
nbytes_remain -= nread;
return nread == size;
return read_raw(dst, size) == size;
}
uint64_t tell() const {
return data_offset;
}
bool seek(uint64_t absolute_offset) const {
const uint64_t end_offset = uint64_t(data_offset) + nbytes_remain;
if (absolute_offset > end_offset) {
return false;
}
data_offset = absolute_offset;
nbytes_remain = end_offset - absolute_offset;
return true;
}
private:
FILE * file;
size_t read_raw(void * dst, size_t size) const {
if (callback == nullptr || size == 0) {
return 0;
}
mutable uint64_t nbytes_remain;
uint8_t * data = static_cast<uint8_t *>(dst);
size_t total_nread = 0;
bool reached_eof = false;
while (total_nread < size) {
const size_t chunk_size = std::min(max_chunk_read, size - total_nread);
if (data_offset + total_nread < data_offset) {
break;
}
const size_t nread = callback(userdata, static_cast<void *>(data + total_nread), data_offset + total_nread, chunk_size);
total_nread += nread;
if (nread != chunk_size) {
reached_eof = true;
break;
}
}
data_offset += total_nread;
GGML_ASSERT(total_nread <= nbytes_remain);
nbytes_remain -= total_nread;
if (reached_eof) {
nbytes_remain = 0;
}
return total_nread;
}
gguf_reader_callback_t callback = nullptr;
void * userdata = nullptr;
size_t max_chunk_read = 0;
mutable uint64_t data_offset = 0;
mutable uint64_t nbytes_remain = 0;
};
struct gguf_context * gguf_init_empty(void) {
@@ -394,12 +448,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
return true;
}
struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
if (!file) {
return nullptr;
}
const struct gguf_reader gr(file);
static struct gguf_context * gguf_init_from_reader(const struct gguf_reader & gr, struct gguf_init_params params) {
struct gguf_context * ctx = new gguf_context;
bool ok = true;
@@ -700,14 +749,14 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
// we require the data section to be aligned, so take into account any padding
if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) {
if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
gguf_free(ctx);
return nullptr;
}
// store the current file offset - this is where the data section starts
ctx->offset = gguf_ftell(file);
ctx->offset = gr.tell();
// compute the total size of the data section, taking into account the alignment
{
@@ -844,6 +893,89 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
return ctx;
}
struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params) {
if (callback == nullptr) {
return nullptr;
}
const struct gguf_reader gr(callback, userdata, max_chunk_read == 0 ? SIZE_MAX : max_chunk_read, 0, max_expected_size);
return gguf_init_from_reader(gr, params);
}
struct gguf_file_reader {
FILE * file;
uint64_t offset;
};
static size_t gguf_file_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) {
GGML_ASSERT(len > 0);
gguf_file_reader & reader = *static_cast<gguf_file_reader *>(userdata);
if (reader.offset != offset) {
if (offset > INT64_MAX || gguf_fseek(reader.file, static_cast<int64_t>(offset), SEEK_SET) != 0) {
return 0;
}
reader.offset = offset;
}
const size_t nread = fread(static_cast<uint8_t *>(output), 1, len, reader.file);
reader.offset += nread;
return nread;
}
struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
if (!file) {
return nullptr;
}
const int64_t cur = gguf_ftell(file);
if (cur < 0) {
return nullptr;
}
gguf_file_reader reader = {
/*.file = */ file,
/*.offset = */ static_cast<uint64_t>(cur),
};
const struct gguf_reader gr(gguf_file_reader_callback, &reader, SIZE_MAX, reader.offset, gguf_reader::file_remain(file));
return gguf_init_from_reader(gr, params);
}
struct gguf_buffer_reader {
const uint8_t * data;
size_t size;
};
static size_t gguf_buffer_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) {
GGML_ASSERT(len > 0);
const gguf_buffer_reader & reader = *static_cast<gguf_buffer_reader *>(userdata);
if (offset > reader.size || len > reader.size - offset) {
return 0;
}
const size_t data_offset = static_cast<size_t>(offset);
const size_t nread = std::min(len, reader.size - data_offset);
memcpy(static_cast<uint8_t *>(output), reader.data + data_offset, nread);
return nread;
}
struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) {
if (data == nullptr || size == 0) {
return nullptr;
}
gguf_buffer_reader reader = {
/*.data = */ static_cast<const uint8_t *>(data),
/*.size = */ size,
};
const struct gguf_reader gr(gguf_buffer_reader_callback, &reader, SIZE_MAX, 0, size);
return gguf_init_from_reader(gr, params);
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb");

View File

@@ -1 +1 @@
0ce7ad348a3151e1da9f65d962044546bcaad421
e705c5fed490514458bdd2eaddc43bd098fcce9b

View File

@@ -767,8 +767,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// Nemotron 3 Super
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

View File

@@ -8308,6 +8308,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 64, 1, 64));
test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 256, 1, 256));
test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 128, 32, 128));
test_cases.emplace_back(new test_mul_mat_hadamard(GGML_TYPE_F32, GGML_TYPE_F32, 128, 4, 128, {2, 3}));
#if 0
// > 4GB A matrix. Too slow to be enabled by default.

View File

@@ -162,6 +162,42 @@ static void helper_write(FILE * file, const void * data, const size_t nbytes) {
GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
}
static std::vector<uint8_t> read_file_to_buffer(FILE * file) {
GGML_ASSERT(file != nullptr);
GGML_ASSERT(fseek(file, 0, SEEK_END) == 0);
const long size = ftell(file);
GGML_ASSERT(size >= 0);
rewind(file);
std::vector<uint8_t> data(static_cast<size_t>(size));
GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());
rewind(file);
return data;
}
struct callback_reader_data {
const uint8_t * data;
size_t size;
};
static size_t read_buffer_callback(void * userdata, void * output, uint64_t offset, size_t len) {
GGML_ASSERT(len > 0);
const callback_reader_data & reader = *static_cast<callback_reader_data *>(userdata);
if (offset > reader.size || len > reader.size - offset) {
return 0;
}
const size_t data_offset = static_cast<size_t>(offset);
const size_t nread = std::min(len, reader.size - data_offset);
memcpy(static_cast<uint8_t *>(output), reader.data + data_offset, nread);
return nread;
}
static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
FILE * file = tmpfile();
@@ -1095,10 +1131,29 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
return ok;
}
static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
enum roundtrip_read_mode {
ROUNDTRIP_READ_MODE_FILE,
ROUNDTRIP_READ_MODE_BUFFER,
ROUNDTRIP_READ_MODE_CALLBACK,
};
static const char * roundtrip_read_mode_name(const roundtrip_read_mode mode) {
switch (mode) {
case ROUNDTRIP_READ_MODE_FILE: return "file";
case ROUNDTRIP_READ_MODE_BUFFER: return "buffer";
case ROUNDTRIP_READ_MODE_CALLBACK: return "callback";
}
GGML_ABORT("fatal error");
}
static std::pair<int, int> test_roundtrip(
ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta,
const roundtrip_read_mode read_mode) {
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
printf("%s: device=%s, backend=%s, only_meta=%s\n",
__func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
printf("%s: device=%s, backend=%s, only_meta=%s, read_mode=%s\n",
__func__, ggml_backend_dev_description(dev), ggml_backend_name(backend),
only_meta ? "yes" : "no", roundtrip_read_mode_name(read_mode));
int npass = 0;
int ntest = 0;
@@ -1133,7 +1188,22 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
/*no_alloc =*/ false,
/*ctx =*/ only_meta ? nullptr : &ctx_1,
};
struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
struct gguf_context * gguf_ctx_1 = nullptr;
const std::vector<uint8_t> data = read_mode == ROUNDTRIP_READ_MODE_FILE
? std::vector<uint8_t>()
: read_file_to_buffer(file);
if (read_mode == ROUNDTRIP_READ_MODE_BUFFER) {
gguf_ctx_1 = gguf_init_from_buffer(data.data(), data.size(), gguf_params);
} else if (read_mode == ROUNDTRIP_READ_MODE_CALLBACK) {
callback_reader_data reader = {
/*.data = */ data.data(),
/*.size = */ data.size(),
};
gguf_ctx_1 = gguf_init_from_callback(read_buffer_callback, &reader, 4096, 4ull << 30 /* 4GB */, gguf_params);
} else {
gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
}
printf("%s: same_version: ", __func__);
if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
@@ -1343,7 +1413,17 @@ int main(int argc, char ** argv) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
for (bool only_meta : {true, false}) {
std::pair<int, int> result = test_roundtrip(dev, seed, only_meta);
std::pair<int, int> result = test_roundtrip(dev, seed, only_meta, ROUNDTRIP_READ_MODE_FILE);
npass += result.first;
ntest += result.second;
}
{
std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_BUFFER);
npass += result.first;
ntest += result.second;
}
{
std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_CALLBACK);
npass += result.first;
ntest += result.second;
}

View File

@@ -16,3 +16,12 @@ export enum AgenticSectionType {
REASONING = 'reasoning',
REASONING_PENDING = 'reasoning_pending'
}
/**
* How a Continue click on an assistant message resumes generation.
*/
export enum ContinueIntentKind {
APPEND_TEXT = 'append_text',
RERUN_TURN = 'rerun_turn',
NEXT_TURN = 'next_turn'
}

View File

@@ -6,7 +6,7 @@ export {
AttachmentItemVisibleWhen
} from './attachment.enums';
export { AgenticSectionType, ToolCallType } from './agentic.enums';
export { AgenticSectionType, ContinueIntentKind, ToolCallType } from './agentic.enums';
export {
ChatMessageStatsView,

View File

@@ -33,6 +33,7 @@ import {
isAbortError,
generateConversationTitle
} from '$lib/utils';
import { classifyContinueIntent } from '$lib/utils/agentic';
import {
MAX_INACTIVE_CONVERSATION_STATES,
INACTIVE_CONVERSATION_STATE_MAX_AGE_MS,
@@ -51,7 +52,7 @@ import type {
DatabaseMessage,
DatabaseMessageExtra
} from '$lib/types';
import { ErrorDialogType, MessageRole, MessageType } from '$lib/enums';
import { ContinueIntentKind, ErrorDialogType, MessageRole, MessageType } from '$lib/enums';
interface ConversationStateEntry {
lastAccessed: number;
@@ -1259,6 +1260,57 @@ class ChatStore {
}
}
/**
* Open a fresh assistant turn anchored at the last tool result of a resolved
* agentic round and let streamChatCompletion route through runAgenticFlow.
* Used by continueAssistantMessage when classifyContinueIntent returns
* next_turn, meaning the target assistant already has its tool_calls paired
* with trailing tool results and the next thing to generate is a brand new
* turn rather than a token level continuation.
*/
private async continueAsNextAgenticTurn(anchorIndex: number): Promise<void> {
const activeConv = conversationsStore.activeConversation;
if (!activeConv) return;
const anchor = conversationsStore.activeMessages[anchorIndex];
if (!anchor) return;
this.cancelPreEncode();
this.setChatLoading(activeConv.id, true);
this.clearChatStreaming(activeConv.id);
try {
const allMessages = await conversationsStore.getConversationMessages(activeConv.id);
const anchorMessage = findMessageById(allMessages, anchor.id);
if (!anchorMessage) {
this.setChatLoading(activeConv.id, false);
return;
}
const newAssistantMessage = await DatabaseService.createMessageBranch(
{
convId: activeConv.id,
type: MessageType.TEXT,
timestamp: Date.now(),
role: MessageRole.ASSISTANT,
content: '',
toolCalls: '',
children: [],
model: null
},
anchorMessage.id
);
await conversationsStore.updateCurrentNode(newAssistantMessage.id);
conversationsStore.updateConversationTimestamp();
await conversationsStore.refreshActiveMessages();
const conversationPath = filterByLeafNodeId(
allMessages,
anchorMessage.id,
false
) as DatabaseMessage[];
await this.streamChatCompletion(conversationPath, newAssistantMessage);
} catch (error) {
if (!isAbortError(error)) console.error('Failed to continue agentic turn:', error);
this.setChatLoading(activeConv.id, false);
}
}
async continueAssistantMessage(messageId: string): Promise<void> {
const activeConv = conversationsStore.activeConversation;
if (!activeConv || this.isChatLoadingInternal(activeConv.id)) return;
@@ -1268,6 +1320,18 @@ class ChatStore {
const { message: msg, index: idx } = result;
// Decide which resume path applies. tool_calls without tool results can
// not be resumed mid sequence by continue_final_message, branch instead.
// tool_calls already paired with tool results need a fresh next turn,
// not a token level continuation of the target assistant.
const intent = classifyContinueIntent(conversationsStore.activeMessages, idx);
if (intent.kind === ContinueIntentKind.RERUN_TURN) {
return this.regenerateMessageWithBranching(messageId);
}
if (intent.kind === ContinueIntentKind.NEXT_TURN) {
return this.continueAsNextAgenticTurn(intent.truncateAfter);
}
try {
this.showErrorDialog(null);
this.setChatLoading(activeConv.id, true);
@@ -1283,15 +1347,11 @@ class ChatStore {
const originalContent = dbMessage.content;
const originalReasoning = dbMessage.reasoningContent || '';
const conversationContext = conversationsStore.activeMessages.slice(0, idx);
const contextWithContinue = [
...conversationContext,
{
role: MessageRole.ASSISTANT as const,
content: originalContent,
reasoning_content: originalReasoning || undefined
}
];
// Hand the persisted DatabaseMessage straight to sendMessage so its
// internal converter preserves tool_calls and extras when present.
// Reconstructing a bare {role, content} here would drop those fields
// and break continue_final_message for messages with tool calls.
const contextWithContinue = conversationsStore.activeMessages.slice(0, idx + 1);
let appendedContent = '';
let appendedReasoning = '';

View File

@@ -1,4 +1,4 @@
import { AgenticSectionType, MessageRole } from '$lib/enums';
import { AgenticSectionType, ContinueIntentKind, MessageRole } from '$lib/enums';
import { ATTACHMENT_SAVED_REGEX, NEWLINE_SEPARATOR } from '$lib/constants';
import type { ApiChatCompletionToolCall } from '$lib/types/api';
import type {
@@ -225,3 +225,62 @@ export function hasAgenticContent(
return toolMessages.length > 0;
}
/**
* Classification of how a Continue click on an assistant message should resume
* generation. The caller dispatches the resume path based on this value.
*
* append_text -> the target is a plain text turn, resume with
* continue_final_message and rehydrate the persisted
* tool_calls and attachments through the regular DB to API
* message converter.
* rerun_turn -> the target carries tool_calls that were never resolved by
* tool result messages. The agentic stream was cut mid turn,
* so we drop the target and rerun the loop from the previous
* history. truncateAfter is the last kept index, inclusive.
* next_turn -> the target's tool_calls were already resolved by trailing
* tool results. Hand the history up to and including the
* last consecutive tool result back to the agentic loop so it
* starts the next turn naturally. truncateAfter points at
* that last tool result.
*/
export type ContinueIntent =
| { kind: ContinueIntentKind.APPEND_TEXT }
| { kind: ContinueIntentKind.RERUN_TURN; truncateAfter: number }
| { kind: ContinueIntentKind.NEXT_TURN; truncateAfter: number };
/**
* Decide how a Continue click on messages[idx] should resume generation.
* Pure function over the persisted history snapshot.
*/
export function classifyContinueIntent(messages: DatabaseMessage[], idx: number): ContinueIntent {
const target = messages[idx];
// Defensive default: callers already filter by role, stay deterministic.
if (!target || target.role !== MessageRole.ASSISTANT) {
return { kind: ContinueIntentKind.APPEND_TEXT };
}
const hasToolCalls = parseToolCalls(target.toolCalls).length > 0;
if (!hasToolCalls) {
return { kind: ContinueIntentKind.APPEND_TEXT };
}
// Walk consecutive trailing tool results. The agentic loop only emits tool
// messages directly after the assistant turn that owns them, so the first
// non tool message marks the boundary.
let lastTrailingTool = idx;
for (let i = idx + 1; i < messages.length; i++) {
if (messages[i].role === MessageRole.TOOL) {
lastTrailingTool = i;
} else {
break;
}
}
if (lastTrailingTool > idx) {
return { kind: ContinueIntentKind.NEXT_TURN, truncateAfter: lastTrailingTool };
}
return { kind: ContinueIntentKind.RERUN_TURN, truncateAfter: idx - 1 };
}

View File

@@ -0,0 +1,166 @@
import { describe, it, expect } from 'vitest';
import { classifyContinueIntent } from '$lib/utils/agentic';
import { ContinueIntentKind, MessageRole, MessageType } from '$lib/enums';
import type { DatabaseMessage } from '$lib/types/database';
/**
* Tests for the Continue button intent classifier.
*
* The classifier walks the persisted message history to decide which of three
* resume paths a Continue click should take:
*
* A. append_text -> plain text assistant turn, resume with
* continue_final_message.
* B. rerun_turn -> assistant turn with tool_calls but no tool results yet,
* the stream was cut mid turn and the tool_calls are
* unrecoverable as a token level continuation. Drop the
* target and rerun from the previous history.
* C. next_turn -> assistant turn with tool_calls that were already
* resolved by trailing tool results. Hand the history
* back to the agentic loop so it starts the next turn.
*/
let nextId = 0;
function makeMsg(role: MessageRole, opts: Partial<DatabaseMessage> = {}): DatabaseMessage {
nextId++;
return {
id: `msg-${nextId}`,
convId: 'conv-1',
type: MessageType.TEXT,
timestamp: nextId,
role,
content: '',
parent: null,
children: [],
...opts
};
}
function toolCall(id: string, name: string, args: string = '{}'): string {
return JSON.stringify([{ id, type: 'function', function: { name, arguments: args } }]);
}
describe('classifyContinueIntent', () => {
it('returns append_text for a plain text assistant turn at the tail', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'hello' }),
makeMsg(MessageRole.ASSISTANT, { content: 'hi there' })
];
const intent = classifyContinueIntent(messages, 1);
expect(intent).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
});
it('returns append_text for a plain text assistant turn in the middle', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'q1' }),
makeMsg(MessageRole.ASSISTANT, { content: 'a1' }),
makeMsg(MessageRole.USER, { content: 'q2' }),
makeMsg(MessageRole.ASSISTANT, { content: 'a2' })
];
expect(classifyContinueIntent(messages, 1)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
});
it('returns rerun_turn when the assistant has tool_calls without results', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'list files' }),
makeMsg(MessageRole.ASSISTANT, {
content: '',
toolCalls: toolCall('call_1', 'bash_tool', '{"command":"ls"}')
})
];
const intent = classifyContinueIntent(messages, 1);
expect(intent).toEqual({ kind: ContinueIntentKind.RERUN_TURN, truncateAfter: 0 });
});
it('returns next_turn when trailing tool results resolve the tool_calls', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'list files' }),
makeMsg(MessageRole.ASSISTANT, {
content: '',
toolCalls: toolCall('call_1', 'bash_tool')
}),
makeMsg(MessageRole.TOOL, { content: 'file1\nfile2', toolCallId: 'call_1' })
];
const intent = classifyContinueIntent(messages, 1);
expect(intent).toEqual({ kind: ContinueIntentKind.NEXT_TURN, truncateAfter: 2 });
});
it('next_turn keeps all consecutive trailing tool results, not just one', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'do many things' }),
makeMsg(MessageRole.ASSISTANT, {
content: '',
toolCalls: JSON.stringify([
{ id: 'call_1', type: 'function', function: { name: 'a', arguments: '{}' } },
{ id: 'call_2', type: 'function', function: { name: 'b', arguments: '{}' } }
])
}),
makeMsg(MessageRole.TOOL, { content: 'r1', toolCallId: 'call_1' }),
makeMsg(MessageRole.TOOL, { content: 'r2', toolCallId: 'call_2' })
];
const intent = classifyContinueIntent(messages, 1);
expect(intent).toEqual({ kind: ContinueIntentKind.NEXT_TURN, truncateAfter: 3 });
});
it('next_turn stops at the first non tool message after the target', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'go' }),
makeMsg(MessageRole.ASSISTANT, {
content: '',
toolCalls: toolCall('call_1', 'a')
}),
makeMsg(MessageRole.TOOL, { content: 'r1', toolCallId: 'call_1' }),
makeMsg(MessageRole.USER, { content: 'wait' }),
makeMsg(MessageRole.TOOL, { content: 'late', toolCallId: 'call_1' })
];
const intent = classifyContinueIntent(messages, 1);
// truncateAfter must point at the contiguous tool block, not jump over
// the user message to grab the dangling late tool.
expect(intent).toEqual({ kind: ContinueIntentKind.NEXT_TURN, truncateAfter: 2 });
});
it('returns append_text when toolCalls is set but parses to empty array', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'q' }),
makeMsg(MessageRole.ASSISTANT, { content: 'a', toolCalls: '[]' })
];
expect(classifyContinueIntent(messages, 1)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
});
it('returns append_text when toolCalls is malformed JSON', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'q' }),
makeMsg(MessageRole.ASSISTANT, { content: 'a', toolCalls: '{not json' })
];
expect(classifyContinueIntent(messages, 1)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
});
it('returns append_text defensively when idx points at a non assistant message', () => {
const messages = [
makeMsg(MessageRole.USER, { content: 'q' }),
makeMsg(MessageRole.ASSISTANT, { content: 'a' })
];
expect(classifyContinueIntent(messages, 0)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
});
it('returns append_text defensively when idx is out of bounds', () => {
const messages = [makeMsg(MessageRole.ASSISTANT, { content: 'a' })];
expect(classifyContinueIntent(messages, 5)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
expect(classifyContinueIntent([], 0)).toEqual({ kind: ContinueIntentKind.APPEND_TEXT });
});
});