Compare commits

...

5 Commits
b6052 ... b6057

Author SHA1 Message Date
lhez
1c872f71fb opencl: add f16 for add, sub, mul, div (#14984) 2025-08-01 13:15:44 +02:00
Srihari-mcw
baad94885d ggml : Q2k interleaving implementation - x86/x64 SIMD (#14373)
* Initial Q2_K Block Interleaving Implementation

* Addressed review comments and clean up of the code

* Post rebase fixes

* Initial CI/CD fixes

* Update declarations in arch-fallback.h

* Changes for GEMV Q2_K in arch-fallback.h

* Enable repacking only on AVX-512 machines

* Update comments in repack.cpp

* Address q2k comments

---------

Co-authored-by: Manogna-Sree <elisetti.manognasree@multicorewareinc.com>
2025-08-01 09:20:33 +03:00
Georgi Gerganov
ba42794c9e graph : fix equal_seq() check (#14986)
ggml-ci
2025-08-01 06:38:12 +03:00
diannao
2860d479b4 docker : add cann build pipline (#14591)
* docker: add cann build pipline

* docker: add cann build pipline

* docker: fix cann devops

* cann : fix multi card hccl

* Update ggml/src/ggml-cann/ggml-cann.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Update ggml-cann.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2025-08-01 10:02:34 +08:00
R0CKSTAR
484b2091ce compare-commits.sh: support both llama-bench and test-backend-ops (#14392)
* compare-commits.sh: support both llama-bench and test-backend-ops

Signed-off-by: Xiaodong Ye <yeahdongcn@gmail.com>

* Speed up the build by specifying -j 12

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* Remove build_number from test-backend-ops db

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* Apply suggestion from @JohannesGaessler

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Refine tool selection logic

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* Address review comments

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

---------

Signed-off-by: Xiaodong Ye <yeahdongcn@gmail.com>
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-08-01 08:47:27 +08:00
16 changed files with 4517 additions and 201 deletions

130
.devops/cann.Dockerfile Normal file
View File

@@ -0,0 +1,130 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================
# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build
# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3
# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum
# -- Set the working directory --
WORKDIR /app
# -- Copy project files --
COPY . .
# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here. You can paste the original ENV list here.
# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
. && \
cmake --build build --config Release -j$(nproc)
# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;
# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
cp build/bin/* /app/full/ && \
cp *.py /app/full/ && \
cp -r gguf-py /app/full/ && \
cp -r requirements /app/full/ && \
cp requirements.txt /app/full/
# If you have a tools.sh script, make sure it is copied here
# cp .devops/tools.sh /app/full/tools.sh
# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base
# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
yum clean all && \
rm -rf /var/cache/yum
# -- Set CANN environment variables (required for runtime) --
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...
WORKDIR /app
# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app
# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================
### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full
COPY --from=build /app/full /app
# Install Python dependencies
RUN yum install -y git python3 python3-pip && \
pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
pip3 install --no-cache-dir -r requirements.txt && \
yum clean all && \
rm -rf /var/cache/yum
# You need to provide a tools.sh script as the entrypoint
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]
### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light
COPY --from=build /app/full/llama-cli /app
ENTRYPOINT [ "/app/llama-cli" ]
### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
COPY --from=build /app/full/llama-server /app
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
ENTRYPOINT [ "/app/llama-server" ]

View File

@@ -2016,6 +2016,9 @@ static bool ggml_backend_cann_cpy_tensor_async(
(ggml_backend_cann_context*)backend_dst->context;
size_t copy_size = ggml_nbytes(dst);
if (copy_size == 0) {
return true;
}
if (backend_src != backend_dst) {
ggml_backend_cann_buffer_context* buf_ctx_src =
(ggml_backend_cann_buffer_context*)buf_src->context;

View File

@@ -37,17 +37,21 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -72,11 +76,13 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__loongarch64)
// quants.c
@@ -92,11 +98,13 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__riscv)
// quants.c
@@ -119,10 +127,12 @@
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__s390x__)
// quants.c
@@ -147,11 +157,13 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__wasm__)
// quants.c
@@ -175,10 +187,12 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -412,6 +412,82 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
assert (n % qk == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
float sumf[8];
float sum_minf[8];
int sumi1,sumi2,sumi3,sumi4;
int sumi;
const block_q8_K * a_ptr = (const block_q8_K *)vy;
for(int x = 0; x < nc / ncols_interleaved; x++) {
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0;
sum_minf[j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi3 = 0;
sumi4 = 0;
sumi = 0;
int offset = ((k / 2) % 2) + j * 2;
for (int i = 0; i < blocklen; ++i){
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
sumi1 = sumi1 * (scales_0[offset] & 0xF);
sumi2 = sumi2 * (scales_1[offset] & 0xF);
sumi3 = sumi3 * (scales_2[offset] & 0xF);
sumi4 = sumi4 * (scales_3[offset] & 0xF);
sumi += sumi1 + sumi2 + sumi3 + sumi4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
for(int sb = 0; sb < 8; sb++) {
const uint8_t *mins = b_ptr[l].scales + sb * 16;
for(int j = 0; j < ncols_interleaved; j++){
sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
}
}
}
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -711,6 +787,97 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 8;
assert (n % qk == 0);
assert (nr % 4 == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
float sumf[4][8];
float sum_minf[4][8];
int sumi1, sumi2, sumi3, sumi4;
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0;
sum_minf[m][j] = 0.0;
}
}
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (4 * blocklen)); k++) {
const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi3 = 0;
sumi4 = 0;
sumi = 0;
int offset = ((k / 2) % 2) + j * 2;
for (int i = 0; i < blocklen; ++i){
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
sumi1 = sumi1 * (scales_0[offset] & 0xF);
sumi2 = sumi2 * (scales_1[offset] & 0xF);
sumi3 = sumi3 * (scales_2[offset] & 0xF);
sumi4 = sumi4 * (scales_3[offset] & 0xF);
sumi += sumi1 + sumi2 + sumi3 + sumi4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
}
}
}
for(int sb = 0; sb < 8; sb++) {
const uint8_t *mins = b_ptr[l].scales + sb * 16;
for(int m = 0; m < 4; m++) {
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
for(int j = 0; j < ncols_interleaved; j++) {
int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
}
}
}
}
}
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
@@ -914,6 +1081,50 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
return out;
}
static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
block_q2_Kx8 out;
// Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
for (int i = 0; i < 8; i++) {
out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
}
for (int i = 0; i < 8; i++) {
out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
}
const int end = QK_K * 2 / blck_size_interleave;
// Interleave Q2_K quants by taking 8 bytes at a time
for (int i = 0; i < end; ++i) {
int src_id = i % 8;
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
uint64_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
// The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
// Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
// The output Q2_Kx8 structure has 128 bytes for storing scales and mins
// Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
// For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
for(int i = 0; i < 128; i++){
// Index for selecting which q2k super block
int src1 = (i % 16) / 2;
// Index for selecting scale
int src2 = ((i / 16) * 2) + (i % 2);
out.scales[i] = in[src1].scales[src2];
}
return out;
}
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
@@ -975,6 +1186,37 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
GGML_UNUSED(data_size);
}
static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
GGML_ASSERT(interleave_block == 8);
constexpr int nrows_interleaved = 8;
block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
const block_q2_K * src = (const block_q2_K*) data;
block_q2_K dst_tmp[8];
int nrow = ggml_nrows(t);
int nblocks = t->ne[0] / QK_K;
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++ ) {
dst_tmp[i] = src[x + i * nblocks];
}
*dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
}
src += nrows_interleaved * nblocks;
}
return 0;
GGML_UNUSED(data_size);
}
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 8);
@@ -1095,6 +1337,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
}
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}
@@ -1124,6 +1370,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
@@ -1148,6 +1398,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
@@ -1421,6 +1675,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
// instance for Q2
static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
// instance for IQ4
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
@@ -1446,6 +1703,12 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &q4_K_8x8_q8_K;
}
}
} else if (cur->type == GGML_TYPE_Q2_K) {
if (ggml_cpu_has_avx512()) {
if (cur->ne[1] % 8 == 0) {
return &q2_K_8x8_q8_K;
}
}
} else if (cur->type == GGML_TYPE_IQ4_NL) {
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 4 == 0) {

View File

@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
};
static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
struct block_q2_Kx8 {
ggml_half d[8]; // super-block scale for quantized scales
ggml_half dmin[8]; // super-block scale for quantized mins
uint8_t scales[128]; // scales and mins, quantized with 4 bits
uint8_t qs[512]; // 2--bit quants
};
static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
struct block_q8_Kx4 {
float d[4]; // delta
int8_t qs[QK_K * 4]; // quants
@@ -71,11 +78,13 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// Native implementations
@@ -86,11 +95,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#if defined(__cplusplus)

View File

@@ -400,10 +400,10 @@ struct ggml_backend_opencl_context {
cl_program program_mul_mm_f32_f32_l4_lm;
cl_program program_mul_mm_f16_f32_l4_lm;
cl_kernel kernel_add, kernel_add_row;
cl_kernel kernel_mul, kernel_mul_row;
cl_kernel kernel_div, kernel_div_row;
cl_kernel kernel_sub, kernel_sub_row;
cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
cl_kernel kernel_scale;
cl_kernel kernel_silu, kernel_silu_4;
cl_kernel kernel_gelu, kernel_gelu_4;
@@ -674,8 +674,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
backend_ctx->program_add =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
CL_CHECK((backend_ctx->kernel_add_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
GGML_LOG_CONT(".");
}
@@ -1089,8 +1091,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
backend_ctx->program_mul =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
CL_CHECK((backend_ctx->kernel_mul_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
GGML_LOG_CONT(".");
}
@@ -1288,11 +1292,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
#else
const std::string kernel_src = read_file("div.cl");
#endif
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable -cl-finite-math-only ";
backend_ctx->program_div =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
CL_CHECK((backend_ctx->kernel_div_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
GGML_LOG_CONT(".");
}
@@ -1308,8 +1317,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
backend_ctx->program_sub =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
CL_CHECK((backend_ctx->kernel_sub_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
GGML_LOG_CONT(".");
}
@@ -2447,12 +2458,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
default:
return false;
}
case GGML_OP_ADD:
case GGML_OP_SCALE:
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SUB:
return op->src[0]->type == GGML_TYPE_F32;
return (op->src[0]->type == op->src[1]->type) &&
(op->src[0]->type == op->type) &&
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
@@ -3680,35 +3694,39 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const int ne00 = src0 ? src0->ne[0] : 0;
const int ne01 = src0 ? src0->ne[1] : 0;
const int ne02 = src0 ? src0->ne[2] : 0;
const int ne03 = src0 ? src0->ne[3] : 0;
GGML_ASSERT(src0->type == src1->type);
GGML_ASSERT(src0->type == dst->type);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const int ne10 = src1 ? src1->ne[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0;
const int ne12 = src1 ? src1->ne[2] : 0;
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3]; UNUSED(ne13);
const int ne0 = dst ? dst->ne[0] : 0;
const int ne1 = dst ? dst->ne[1] : 0;
const int ne2 = dst ? dst->ne[2] : 0;
const int ne3 = dst ? dst->ne[3] : 0;
const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -3731,7 +3749,12 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
bcast_row = true;
int ne = ne00 / 4;
kernel = backend_ctx->kernel_add_row;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_add_row;
} else {
kernel = backend_ctx->kernel_add_row_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3741,7 +3764,11 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
kernel = backend_ctx->kernel_add;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_add;
} else {
kernel = backend_ctx->kernel_add_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3803,35 +3830,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const int ne00 = src0 ? src0->ne[0] : 0;
const int ne01 = src0 ? src0->ne[1] : 0;
const int ne02 = src0 ? src0->ne[2] : 0;
const int ne03 = src0 ? src0->ne[3] : 0;
GGML_ASSERT(src0->type == src1->type);
GGML_ASSERT(src0->type == dst->type);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const int ne10 = src1 ? src1->ne[0] : 0;
const int ne11 = src1 ? src1->ne[1] : 0;
const int ne12 = src1 ? src1->ne[2] : 0;
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3]; UNUSED(ne13);
const int ne0 = dst ? dst->ne[0] : 0;
const int ne1 = dst ? dst->ne[1] : 0;
const int ne2 = dst ? dst->ne[2] : 0;
const int ne3 = dst ? dst->ne[3] : 0;
const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
@@ -3854,7 +3885,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
bcast_row = true;
int ne = ne00 / 4;
kernel = backend_ctx->kernel_mul_row;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_mul_row;
} else {
kernel = backend_ctx->kernel_mul_row_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3864,7 +3900,11 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
kernel = backend_ctx->kernel_mul;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_mul;
} else {
kernel = backend_ctx->kernel_mul_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3926,6 +3966,10 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
GGML_ASSERT(src0->type == src1->type);
GGML_ASSERT(src0->type == dst->type);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
@@ -3974,7 +4018,12 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
bcast_row = true;
int ne = ne00 / 4;
kernel = backend_ctx->kernel_div_row;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_div_row;
} else {
kernel = backend_ctx->kernel_div_row_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3984,7 +4033,11 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
kernel = backend_ctx->kernel_div;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_div;
} else {
kernel = backend_ctx->kernel_div_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -4034,6 +4087,10 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
GGML_ASSERT(src0->type == src1->type);
GGML_ASSERT(src0->type == dst->type);
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
@@ -4082,7 +4139,12 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
bcast_row = true;
int ne = ne00 / 4;
kernel = backend_ctx->kernel_sub_row;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_sub_row;
} else {
kernel = backend_ctx->kernel_sub_row_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -4092,7 +4154,11 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
kernel = backend_ctx->kernel_sub;
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_sub;
} else {
kernel = backend_ctx->kernel_sub_f16;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));

View File

@@ -81,3 +81,76 @@ kernel void kernel_add_row(
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] + src1[idx1];
}
kernel void kernel_add_f16(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) + *((global half *)(src1_ptr + i10*nb10));
}
}
kernel void kernel_add_row_f16(
global half4 * src0,
ulong offset0,
global half4 * src1,
ulong offset1,
global half4 * dst,
ulong offsetd,
int ne
) {
src0 = (global half4*)((global char*)src0 + offset0);
src1 = (global half4*)((global char*)src1 + offset1);
dst = (global half4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] + src1[idx1];
}

View File

@@ -70,3 +70,69 @@ kernel void kernel_div_row(
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] / src1[idx1];
}
kernel void kernel_div_f16(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) / *((global half *)(src1_ptr + i10*nb10));
}
}
kernel void kernel_div_row_f16(
global half4 * src0,
ulong offset0,
global half4 * src1,
ulong offset1,
global half4 * dst,
ulong offsetd,
int ne
) {
src0 = (global half4*)((global char*)src0 + offset0);
src1 = (global half4*)((global char*)src1 + offset1);
dst = (global half4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] / src1[idx1];
}

View File

@@ -77,3 +77,76 @@ kernel void kernel_mul_row(
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] * src1[idx1];
}
kernel void kernel_mul_f16(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) * *((global half *)(src1_ptr + i10*nb10));
}
}
kernel void kernel_mul_row_f16(
global half4 * src0,
ulong offset0,
global half4 * src1,
ulong offset1,
global half4 * dst,
ulong offsetd,
int ne
) {
src0 = (global half4*)((global char*)src0 + offset0);
src1 = (global half4*)((global char*)src1 + offset1);
dst = (global half4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] * src1[idx1];
}

View File

@@ -70,3 +70,69 @@ kernel void kernel_sub_row(
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] - src1[idx1];
}
kernel void kernel_sub_f16(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) - *((global half *)(src1_ptr + i10*nb10));
}
}
kernel void kernel_sub_row_f16(
global half4 * src0,
ulong offset0,
global half4 * src1,
ulong offset1,
global half4 * dst,
ulong offsetd,
int ne
) {
src0 = (global half4*)((global char*)src0 + offset0);
src1 = (global half4*)((global char*)src1 + offset1);
dst = (global half4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] - src1[idx1];
}

View File

@@ -1,19 +1,41 @@
#!/usr/bin/env bash
if [ $# -lt 2 ]; then
echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [additional llama-bench arguments]"
echo "usage: ./scripts/compare-commits.sh <commit1> <commit2> [tool] [additional arguments]"
echo " tool: 'llama-bench' (default) or 'test-backend-ops'"
echo " additional arguments: passed to the selected tool"
exit 1
fi
set -e
set -x
# Parse arguments
commit1=$1
commit2=$2
tool=${3:-llama-bench}
additional_args="${@:4}"
# Validate tool argument
if [ "$tool" != "llama-bench" ] && [ "$tool" != "test-backend-ops" ]; then
echo "Error: tool must be 'llama-bench' or 'test-backend-ops'"
exit 1
fi
# verify at the start that the compare script has all the necessary dependencies installed
./scripts/compare-llama-bench.py --check
bench_args="${@:3}"
if [ "$tool" = "llama-bench" ]; then
db_file="llama-bench.sqlite"
target="llama-bench"
run_args="-o sql -oe md $additional_args"
else # test-backend-ops
db_file="test-backend-ops.sqlite"
target="test-backend-ops"
run_args="perf --output sql $additional_args"
fi
rm -f llama-bench.sqlite > /dev/null
rm -f "$db_file" > /dev/null
# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits.sh ...)
if [ -n "$GGML_CUDA" ]; then
@@ -25,14 +47,14 @@ dir="build-bench"
function run {
rm -fr ${dir} > /dev/null
cmake -B ${dir} -S . ${CMAKE_OPTS} > /dev/null
cmake --build ${dir} -t llama-bench > /dev/null
${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite
cmake --build ${dir} -t $target -j $(nproc) > /dev/null
${dir}/bin/$target $run_args | sqlite3 "$db_file"
}
git checkout $1 > /dev/null
git checkout $commit1 > /dev/null
run
git checkout $2 > /dev/null
git checkout $commit2 > /dev/null
run
./scripts/compare-llama-bench.py -b $1 -c $2
./scripts/compare-llama-bench.py -b $commit1 -c $commit2 --tool $tool -i "$db_file"

View File

@@ -1,16 +1,16 @@
#!/usr/bin/env python3
import logging
import argparse
import heapq
import sys
import os
from glob import glob
import sqlite3
import json
import csv
from typing import Optional, Union
import heapq
import json
import logging
import os
import sqlite3
import sys
from collections.abc import Iterator, Sequence
from glob import glob
from typing import Any, Optional, Union
try:
import git
@@ -23,7 +23,7 @@ except ImportError as e:
logger = logging.getLogger("compare-llama-bench")
# All llama-bench SQL fields
DB_FIELDS = [
LLAMA_BENCH_DB_FIELDS = [
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
@@ -33,7 +33,7 @@ DB_FIELDS = [
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
]
DB_TYPES = [
LLAMA_BENCH_DB_TYPES = [
"TEXT", "INTEGER", "TEXT", "TEXT", "TEXT", "TEXT",
"TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
"TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER",
@@ -42,20 +42,41 @@ DB_TYPES = [
"INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
"TEXT", "INTEGER", "INTEGER", "REAL", "REAL",
]
assert len(DB_FIELDS) == len(DB_TYPES)
# Properties by which to differentiate results per commit:
KEY_PROPERTIES = [
# All test-backend-ops SQL fields
TEST_BACKEND_OPS_DB_FIELDS = [
"test_time", "build_commit", "backend_name", "op_name", "op_params", "test_mode",
"supported", "passed", "error_message", "time_us", "flops", "bandwidth_gb_s",
"memory_kb", "n_runs"
]
TEST_BACKEND_OPS_DB_TYPES = [
"TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT",
"INTEGER", "INTEGER", "TEXT", "REAL", "REAL", "REAL",
"INTEGER", "INTEGER"
]
assert len(LLAMA_BENCH_DB_FIELDS) == len(LLAMA_BENCH_DB_TYPES)
assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES)
# Properties by which to differentiate results per commit for llama-bench:
LLAMA_BENCH_KEY_PROPERTIES = [
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
"n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
"use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth"
]
# Properties that are boolean and are converted to Yes/No for the table:
BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
# Properties by which to differentiate results per commit for test-backend-ops:
TEST_BACKEND_OPS_KEY_PROPERTIES = [
"backend_name", "op_name", "op_params", "test_mode"
]
# Header names for the table:
PRETTY_NAMES = {
# Properties that are boolean and are converted to Yes/No for the table:
LLAMA_BENCH_BOOL_PROPERTIES = ["embeddings", "cpu_strict", "use_mmap", "no_kv_offload", "flash_attn"]
TEST_BACKEND_OPS_BOOL_PROPERTIES = ["supported", "passed"]
# Header names for the table (llama-bench):
LLAMA_BENCH_PRETTY_NAMES = {
"cpu_info": "CPU", "gpu_info": "GPU", "backends": "Backends", "n_gpu_layers": "GPU layers",
"tensor_buft_overrides": "Tensor overrides", "model_filename": "File", "model_type": "Model", "model_size": "Model size [GiB]",
"model_n_params": "Num. of par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "embeddings": "Embeddings",
@@ -64,21 +85,42 @@ PRETTY_NAMES = {
"flash_attn": "FlashAttention",
}
DEFAULT_SHOW = ["model_type"] # Always show these properties by default.
DEFAULT_HIDE = ["model_filename"] # Always hide these properties by default.
# Header names for the table (test-backend-ops):
TEST_BACKEND_OPS_PRETTY_NAMES = {
"backend_name": "Backend", "op_name": "GGML op", "op_params": "Op parameters", "test_mode": "Mode",
"supported": "Supported", "passed": "Passed", "error_message": "Error",
"flops": "FLOPS", "bandwidth_gb_s": "Bandwidth (GB/s)", "memory_kb": "Memory (KB)", "n_runs": "Runs"
}
DEFAULT_SHOW_LLAMA_BENCH = ["model_type"] # Always show these properties by default.
DEFAULT_HIDE_LLAMA_BENCH = ["model_filename"] # Always hide these properties by default.
DEFAULT_SHOW_TEST_BACKEND_OPS = ["backend_name", "op_name"] # Always show these properties by default.
DEFAULT_HIDE_TEST_BACKEND_OPS = ["error_message"] # Always hide these properties by default.
GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "] # Strip prefixes for smaller tables.
MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}
DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
DESCRIPTION = """Creates tables from llama-bench or test-backend-ops data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):
For llama-bench:
$ git checkout master
$ make clean && make llama-bench
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
$ git checkout some_branch
$ make clean && make llama-bench
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t llama-bench -j $(nproc)
$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
$ ./scripts/compare-llama-bench.py
For test-backend-ops:
$ git checkout master
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
$ git checkout some_branch
$ cmake -B ${BUILD_DIR} ${CMAKE_OPTS} && cmake --build ${BUILD_DIR} -t test-backend-ops -j $(nproc)
$ ./test-backend-ops perf --output sql | sqlite3 test-backend-ops.sqlite
$ ./scripts/compare-llama-bench.py --tool test-backend-ops -i test-backend-ops.sqlite
Performance numbers from multiple runs per commit are averaged WITHOUT being weighted by the --repetitions parameter of llama-bench.
"""
@@ -96,6 +138,13 @@ help_c = (
"Defaults to the non-master commit for which llama-bench was run most recently."
)
parser.add_argument("-c", "--compare", help=help_c)
help_t = (
"The tool whose data is being compared. "
"Either 'llama-bench' or 'test-backend-ops'. "
"This determines the database schema and comparison logic used. "
"If left unspecified, try to determine from the input file."
)
parser.add_argument("-t", "--tool", help=help_t, default=None, choices=[None, "llama-bench", "test-backend-ops"])
help_i = (
"JSON/JSONL/SQLite/CSV files for comparing commits. "
"Specify multiple times to use multiple input files (JSON/CSV only). "
@@ -114,7 +163,8 @@ parser.add_argument("-o", "--output", help=help_o, default="pipe")
help_s = (
"Columns to add to the table. "
"Accepts a comma-separated list of values. "
f"Legal values: {', '.join(KEY_PROPERTIES[:-3])}. "
f"Legal values for test-backend-ops: {', '.join(TEST_BACKEND_OPS_KEY_PROPERTIES)}. "
f"Legal values for llama-bench: {', '.join(LLAMA_BENCH_KEY_PROPERTIES[:-3])}. "
"Defaults to model name (model_type) and CPU and/or GPU name (cpu_info, gpu_info) "
"plus any column where not all data points are the same. "
"If the columns are manually specified, then the results for each unique combination of the "
@@ -142,8 +192,14 @@ if unknown_args:
sys.exit(1)
input_file = known_args.input
if not input_file and os.path.exists("./llama-bench.sqlite"):
input_file = ["llama-bench.sqlite"]
tool = known_args.tool
if not input_file:
if tool == "llama-bench" and os.path.exists("./llama-bench.sqlite"):
input_file = ["llama-bench.sqlite"]
elif tool == "test-backend-ops" and os.path.exists("./test-backend-ops.sqlite"):
input_file = ["test-backend-ops.sqlite"]
if not input_file:
sqlite_files = glob("*.sqlite")
if len(sqlite_files) == 1:
@@ -161,14 +217,23 @@ class LlamaBenchData:
build_len_max: int
build_len: int = 8
builds: list[str] = []
check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
tool: str = "llama-bench" # Tool type: "llama-bench" or "test-backend-ops"
def __init__(self):
def __init__(self, tool: str = "llama-bench"):
self.tool = tool
try:
self.repo = git.Repo(".", search_parent_directories=True)
except git.InvalidGitRepositoryError:
self.repo = None
# Set schema-specific properties based on tool
if self.tool == "llama-bench":
self.check_keys = set(LLAMA_BENCH_KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])
elif self.tool == "test-backend-ops":
self.check_keys = set(TEST_BACKEND_OPS_KEY_PROPERTIES + ["build_commit", "test_time"])
else:
assert False
def _builds_init(self):
self.build_len = self.build_len_min
@@ -252,52 +317,121 @@ class LlamaBenchData:
class LlamaBenchDataSQLite3(LlamaBenchData):
connection: sqlite3.Connection
cursor: sqlite3.Cursor
table_name: str
def __init__(self):
super().__init__()
def __init__(self, tool: str = "llama-bench"):
super().__init__(tool)
self.connection = sqlite3.connect(":memory:")
self.cursor = self.connection.cursor()
self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});")
# Set table name and schema based on tool
if self.tool == "llama-bench":
self.table_name = "test"
db_fields = LLAMA_BENCH_DB_FIELDS
db_types = LLAMA_BENCH_DB_TYPES
elif self.tool == "test-backend-ops":
self.table_name = "test_backend_ops"
db_fields = TEST_BACKEND_OPS_DB_FIELDS
db_types = TEST_BACKEND_OPS_DB_TYPES
else:
assert False
self.cursor.execute(f"CREATE TABLE {self.table_name}({', '.join(' '.join(x) for x in zip(db_fields, db_types))});")
def _builds_init(self):
if self.connection:
self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0]
self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0]
self.build_len_min = self.cursor.execute(f"SELECT MIN(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
self.build_len_max = self.cursor.execute(f"SELECT MAX(LENGTH(build_commit)) from {self.table_name};").fetchone()[0]
if self.build_len_min != self.build_len_max:
logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
"Try purging the the database of old commits.")
self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
self.cursor.execute(f"UPDATE {self.table_name} SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
builds = self.cursor.execute(f"SELECT DISTINCT build_commit FROM {self.table_name};").fetchall()
self.builds = list(map(lambda b: b[0], builds)) # list[tuple[str]] -> list[str]
super()._builds_init()
def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
data = self.cursor.execute(
"SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
f"SELECT build_commit, test_time FROM {self.table_name} ORDER BY test_time;").fetchall()
return reversed(data) if reverse else data
def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
if self.tool == "llama-bench":
return self._get_rows_llama_bench(properties, hexsha8_baseline, hexsha8_compare)
elif self.tool == "test-backend-ops":
return self._get_rows_test_backend_ops(properties, hexsha8_baseline, hexsha8_compare)
else:
assert False
def _get_rows_llama_bench(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
select_string = ", ".join(
[f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
equal_string = " AND ".join(
[f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
[f"tb.{p} = tc.{p}" for p in LLAMA_BENCH_KEY_PROPERTIES] + [
f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
)
group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
return self.cursor.execute(query).fetchall()
def _get_rows_test_backend_ops(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
# For test-backend-ops, we compare FLOPS and bandwidth metrics (prioritizing FLOPS over bandwidth)
select_string = ", ".join(
[f"tb.{p}" for p in properties] + [
"AVG(tb.flops)", "AVG(tc.flops)",
"AVG(tb.bandwidth_gb_s)", "AVG(tc.bandwidth_gb_s)"
])
equal_string = " AND ".join(
[f"tb.{p} = tc.{p}" for p in TEST_BACKEND_OPS_KEY_PROPERTIES] + [
f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'",
"tb.supported = 1", "tc.supported = 1", "tb.passed = 1", "tc.passed = 1"] # Only compare successful tests
)
group_order_string = ", ".join([f"tb.{p}" for p in properties])
query = (f"SELECT {select_string} FROM {self.table_name} tb JOIN {self.table_name} tc ON {equal_string} "
f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
return self.cursor.execute(query).fetchall()
class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
def __init__(self, data_file: str):
super().__init__()
def __init__(self, data_file: str, tool: Any):
super().__init__(tool)
self.connection.close()
self.connection = sqlite3.connect(data_file)
self.cursor = self.connection.cursor()
# Check which table exists in the database
tables = self.cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
table_names = [table[0] for table in tables]
# Tool selection logic
if tool is None:
if "test" in table_names:
self.table_name = "test"
self.tool = "llama-bench"
elif "test_backend_ops" in table_names:
self.table_name = "test_backend_ops"
self.tool = "test-backend-ops"
else:
raise RuntimeError(f"No suitable table found in database. Available tables: {table_names}")
elif tool == "llama-bench":
if "test" in table_names:
self.table_name = "test"
self.tool = "llama-bench"
else:
raise RuntimeError(f"Table 'test' not found for tool 'llama-bench'. Available tables: {table_names}")
elif tool == "test-backend-ops":
if "test_backend_ops" in table_names:
self.table_name = "test_backend_ops"
self.tool = "test-backend-ops"
else:
raise RuntimeError(f"Table 'test_backend_ops' not found for tool 'test-backend-ops'. Available tables: {table_names}")
else:
raise RuntimeError(f"Unknown tool: {tool}")
self._builds_init()
@staticmethod
@@ -317,20 +451,23 @@ class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
def __init__(self, data_file: str):
super().__init__()
def __init__(self, data_file: str, tool: str = "llama-bench"):
super().__init__(tool)
# Get the appropriate field list based on tool
db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
with open(data_file, "r", encoding="utf-8") as fp:
for i, line in enumerate(fp):
parsed = json.loads(line)
for k in parsed.keys() - set(DB_FIELDS):
for k in parsed.keys() - set(db_fields):
del parsed[k]
if (missing_keys := self._check_keys(parsed.keys())):
raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
self._builds_init()
@@ -349,21 +486,24 @@ class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
def __init__(self, data_files: list[str]):
super().__init__()
def __init__(self, data_files: list[str], tool: str = "llama-bench"):
super().__init__(tool)
# Get the appropriate field list based on tool
db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
for data_file in data_files:
with open(data_file, "r", encoding="utf-8") as fp:
parsed = json.load(fp)
for i, entry in enumerate(parsed):
for k in entry.keys() - set(DB_FIELDS):
for k in entry.keys() - set(db_fields):
del entry[k]
if (missing_keys := self._check_keys(entry.keys())):
raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}")
self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
self._builds_init()
@@ -384,21 +524,24 @@ class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
def __init__(self, data_files: list[str]):
super().__init__()
def __init__(self, data_files: list[str], tool: str = "llama-bench"):
super().__init__(tool)
# Get the appropriate field list based on tool
db_fields = LLAMA_BENCH_DB_FIELDS if tool == "llama-bench" else TEST_BACKEND_OPS_DB_FIELDS
for data_file in data_files:
with open(data_file, "r", encoding="utf-8") as fp:
for i, parsed in enumerate(csv.DictReader(fp)):
keys = set(parsed.keys())
for k in keys - set(DB_FIELDS):
for k in keys - set(db_fields):
del parsed[k]
if (missing_keys := self._check_keys(keys)):
raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
self.cursor.execute(f"INSERT INTO {self.table_name}({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
self._builds_init()
@@ -419,21 +562,90 @@ class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
return True
def format_flops(flops_value: float) -> str:
"""Format FLOPS values with appropriate units for better readability."""
if flops_value == 0:
return "0.00"
# Define unit thresholds and names
units = [
(1e12, "T"), # TeraFLOPS
(1e9, "G"), # GigaFLOPS
(1e6, "M"), # MegaFLOPS
(1e3, "k"), # kiloFLOPS
(1, "") # FLOPS
]
for threshold, unit in units:
if abs(flops_value) >= threshold:
formatted_value = flops_value / threshold
if formatted_value >= 100:
return f"{formatted_value:.1f}{unit}"
else:
return f"{formatted_value:.2f}{unit}"
# Fallback for very small values
return f"{flops_value:.2f}"
def format_flops_for_table(flops_value: float, target_unit: str) -> str:
"""Format FLOPS values for table display without unit suffix (since unit is in header)."""
if flops_value == 0:
return "0.00"
# Define unit thresholds based on target unit
unit_divisors = {
"TFLOPS": 1e12,
"GFLOPS": 1e9,
"MFLOPS": 1e6,
"kFLOPS": 1e3,
"FLOPS": 1
}
divisor = unit_divisors.get(target_unit, 1)
formatted_value = flops_value / divisor
if formatted_value >= 100:
return f"{formatted_value:.1f}"
else:
return f"{formatted_value:.2f}"
def get_flops_unit_name(flops_values: list) -> str:
"""Determine the best FLOPS unit name based on the magnitude of values."""
if not flops_values or all(v == 0 for v in flops_values):
return "FLOPS"
# Find the maximum absolute value to determine appropriate unit
max_flops = max(abs(v) for v in flops_values if v != 0)
if max_flops >= 1e12:
return "TFLOPS"
elif max_flops >= 1e9:
return "GFLOPS"
elif max_flops >= 1e6:
return "MFLOPS"
elif max_flops >= 1e3:
return "kFLOPS"
else:
return "FLOPS"
bench_data = None
if len(input_file) == 1:
if LlamaBenchDataSQLite3File.valid_format(input_file[0]):
bench_data = LlamaBenchDataSQLite3File(input_file[0])
bench_data = LlamaBenchDataSQLite3File(input_file[0], tool)
elif LlamaBenchDataJSON.valid_format(input_file):
bench_data = LlamaBenchDataJSON(input_file)
bench_data = LlamaBenchDataJSON(input_file, tool)
elif LlamaBenchDataJSONL.valid_format(input_file[0]):
bench_data = LlamaBenchDataJSONL(input_file[0])
bench_data = LlamaBenchDataJSONL(input_file[0], tool)
elif LlamaBenchDataCSV.valid_format(input_file):
bench_data = LlamaBenchDataCSV(input_file)
bench_data = LlamaBenchDataCSV(input_file, tool)
else:
if LlamaBenchDataJSON.valid_format(input_file):
bench_data = LlamaBenchDataJSON(input_file)
bench_data = LlamaBenchDataJSON(input_file, tool)
elif LlamaBenchDataCSV.valid_format(input_file):
bench_data = LlamaBenchDataCSV(input_file)
bench_data = LlamaBenchDataCSV(input_file, tool)
if not bench_data:
raise RuntimeError("No valid (or some invalid) input files found.")
@@ -504,12 +716,29 @@ else:
name_compare = bench_data.get_commit_name(hexsha8_compare)
# Get tool-specific configuration
if tool == "llama-bench":
key_properties = LLAMA_BENCH_KEY_PROPERTIES
bool_properties = LLAMA_BENCH_BOOL_PROPERTIES
pretty_names = LLAMA_BENCH_PRETTY_NAMES
default_show = DEFAULT_SHOW_LLAMA_BENCH
default_hide = DEFAULT_HIDE_LLAMA_BENCH
elif tool == "test-backend-ops":
key_properties = TEST_BACKEND_OPS_KEY_PROPERTIES
bool_properties = TEST_BACKEND_OPS_BOOL_PROPERTIES
pretty_names = TEST_BACKEND_OPS_PRETTY_NAMES
default_show = DEFAULT_SHOW_TEST_BACKEND_OPS
default_hide = DEFAULT_HIDE_TEST_BACKEND_OPS
else:
assert False
# If the user provided columns to group the results by, use them:
if known_args.show is not None:
show = known_args.show.split(",")
unknown_cols = []
for prop in show:
if prop not in KEY_PROPERTIES[:-3]: # Last three values are n_prompt, n_gen, n_depth.
valid_props = key_properties if tool == "test-backend-ops" else key_properties[:-3] # Exclude n_prompt, n_gen, n_depth for llama-bench
if prop not in valid_props:
unknown_cols.append(prop)
if unknown_cols:
logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
@@ -518,32 +747,54 @@ if known_args.show is not None:
rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
# Otherwise, select those columns where the values are not all the same:
else:
rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare)
rows_full = bench_data.get_rows(key_properties, hexsha8_baseline, hexsha8_compare)
properties_different = []
for i, kp_i in enumerate(KEY_PROPERTIES):
if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
continue
for row_full in rows_full:
if row_full[i] != rows_full[0][i]:
properties_different.append(kp_i)
break
if tool == "llama-bench":
# For llama-bench, skip n_prompt, n_gen, n_depth from differentiation logic
check_properties = [kp for kp in key_properties if kp not in ["n_prompt", "n_gen", "n_depth"]]
for i, kp_i in enumerate(key_properties):
if kp_i in default_show or kp_i in ["n_prompt", "n_gen", "n_depth"]:
continue
for row_full in rows_full:
if row_full[i] != rows_full[0][i]:
properties_different.append(kp_i)
break
elif tool == "test-backend-ops":
# For test-backend-ops, check all key properties
for i, kp_i in enumerate(key_properties):
if kp_i in default_show:
continue
for row_full in rows_full:
if row_full[i] != rows_full[0][i]:
properties_different.append(kp_i)
break
else:
assert False
show = []
# Show CPU and/or GPU by default even if the hardware for all results is the same:
if rows_full and "n_gpu_layers" not in properties_different:
ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])
if ngl != 99 and "cpu_info" not in properties_different:
show.append("cpu_info")
if tool == "llama-bench":
# Show CPU and/or GPU by default even if the hardware for all results is the same:
if rows_full and "n_gpu_layers" not in properties_different:
ngl = int(rows_full[0][key_properties.index("n_gpu_layers")])
show += properties_different
if ngl != 99 and "cpu_info" not in properties_different:
show.append("cpu_info")
index_default = 0
for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
if prop in show:
index_default += 1
show = show[:index_default] + DEFAULT_SHOW + show[index_default:]
for prop in DEFAULT_HIDE:
show += properties_different
index_default = 0
for prop in ["cpu_info", "gpu_info", "n_gpu_layers", "main_gpu"]:
if prop in show:
index_default += 1
show = show[:index_default] + default_show + show[index_default:]
elif tool == "test-backend-ops":
show = default_show + properties_different
else:
assert False
for prop in default_hide:
try:
show.remove(prop)
except ValueError:
@@ -551,7 +802,7 @@ else:
# Add plot_x parameter to parameters to show if it's not already present:
if known_args.plot:
for k, v in PRETTY_NAMES.items():
for k, v in pretty_names.items():
if v == known_args.plot_x and k not in show:
show.append(k)
break
@@ -563,60 +814,120 @@ if not rows_show:
sys.exit(1)
table = []
for row in rows_show:
n_prompt = int(row[-5])
n_gen = int(row[-4])
n_depth = int(row[-3])
if n_prompt != 0 and n_gen == 0:
test_name = f"pp{n_prompt}"
elif n_prompt == 0 and n_gen != 0:
test_name = f"tg{n_gen}"
else:
test_name = f"pp{n_prompt}+tg{n_gen}"
if n_depth != 0:
test_name = f"{test_name}@d{n_depth}"
# Regular columns test name avg t/s values Speedup
# VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV
table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
primary_metric = "FLOPS" # Default to FLOPS for test-backend-ops
if tool == "llama-bench":
# For llama-bench, create test names and compare avg_ts values
for row in rows_show:
n_prompt = int(row[-5])
n_gen = int(row[-4])
n_depth = int(row[-3])
if n_prompt != 0 and n_gen == 0:
test_name = f"pp{n_prompt}"
elif n_prompt == 0 and n_gen != 0:
test_name = f"tg{n_gen}"
else:
test_name = f"pp{n_prompt}+tg{n_gen}"
if n_depth != 0:
test_name = f"{test_name}@d{n_depth}"
# Regular columns test name avg t/s values Speedup
# VVVVVVVVVVVVV VVVVVVVVV VVVVVVVVVVVVVV VVVVVVV
table.append(list(row[:-5]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
elif tool == "test-backend-ops":
# Determine the primary metric by checking rows until we find one with valid data
if rows_show:
primary_metric = "FLOPS" # Default to FLOPS
flops_values = []
# Collect all FLOPS values to determine the best unit
for sample_row in rows_show:
baseline_flops = float(sample_row[-4])
compare_flops = float(sample_row[-3])
baseline_bandwidth = float(sample_row[-2])
if baseline_flops > 0:
flops_values.extend([baseline_flops, compare_flops])
elif baseline_bandwidth > 0 and not flops_values:
primary_metric = "Bandwidth (GB/s)"
# If we have FLOPS data, determine the appropriate unit
if flops_values:
primary_metric = get_flops_unit_name(flops_values)
# For test-backend-ops, prioritize FLOPS > bandwidth for comparison
for row in rows_show:
# Extract metrics: flops, bandwidth_gb_s (baseline and compare)
baseline_flops = float(row[-4])
compare_flops = float(row[-3])
baseline_bandwidth = float(row[-2])
compare_bandwidth = float(row[-1])
# Determine which metric to use for comparison (prioritize FLOPS > bandwidth)
if baseline_flops > 0 and compare_flops > 0:
# Use FLOPS comparison (higher is better)
speedup = compare_flops / baseline_flops
baseline_str = format_flops_for_table(baseline_flops, primary_metric)
compare_str = format_flops_for_table(compare_flops, primary_metric)
elif baseline_bandwidth > 0 and compare_bandwidth > 0:
# Use bandwidth comparison (higher is better)
speedup = compare_bandwidth / baseline_bandwidth
baseline_str = f"{baseline_bandwidth:.2f}"
compare_str = f"{compare_bandwidth:.2f}"
else:
# Fallback if no valid data is available
baseline_str = "N/A"
compare_str = "N/A"
from math import nan
speedup = nan
table.append(list(row[:-4]) + [baseline_str, compare_str, speedup])
else:
assert False
# Some a-posteriori fixes to make the table contents prettier:
for bool_property in BOOL_PROPERTIES:
for bool_property in bool_properties:
if bool_property in show:
ip = show.index(bool_property)
for row_table in table:
row_table[ip] = "Yes" if int(row_table[ip]) == 1 else "No"
if "model_type" in show:
ip = show.index("model_type")
for (old, new) in MODEL_SUFFIX_REPLACE.items():
if tool == "llama-bench":
if "model_type" in show:
ip = show.index("model_type")
for (old, new) in MODEL_SUFFIX_REPLACE.items():
for row_table in table:
row_table[ip] = row_table[ip].replace(old, new)
if "model_size" in show:
ip = show.index("model_size")
for row_table in table:
row_table[ip] = row_table[ip].replace(old, new)
row_table[ip] = float(row_table[ip]) / 1024 ** 3
if "model_size" in show:
ip = show.index("model_size")
for row_table in table:
row_table[ip] = float(row_table[ip]) / 1024 ** 3
if "gpu_info" in show:
ip = show.index("gpu_info")
for row_table in table:
for gns in GPU_NAME_STRIP:
row_table[ip] = row_table[ip].replace(gns, "")
if "gpu_info" in show:
ip = show.index("gpu_info")
for row_table in table:
for gns in GPU_NAME_STRIP:
row_table[ip] = row_table[ip].replace(gns, "")
gpu_names = row_table[ip].split(", ")
num_gpus = len(gpu_names)
all_names_the_same = len(set(gpu_names)) == 1
if len(gpu_names) >= 2 and all_names_the_same:
row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
gpu_names = row_table[ip].split(", ")
num_gpus = len(gpu_names)
all_names_the_same = len(set(gpu_names)) == 1
if len(gpu_names) >= 2 and all_names_the_same:
row_table[ip] = f"{num_gpus}x {gpu_names[0]}"
headers = [PRETTY_NAMES[p] for p in show]
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
headers = [pretty_names.get(p, p) for p in show]
if tool == "llama-bench":
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
elif tool == "test-backend-ops":
headers += [f"{primary_metric} {name_baseline}", f"{primary_metric} {name_compare}", "Speedup"]
else:
assert False
if known_args.plot:
def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False):
def create_performance_plot(table_data: list[list[str]], headers: list[str], baseline_name: str, compare_name: str, output_file: str, plot_x_param: str, log_scale: bool = False, tool_type: str = "llama-bench", metric_name: str = "t/s"):
try:
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('Agg')
except ImportError as e:
logger.error("matplotlib is required for --plot.")
@@ -627,7 +938,7 @@ if known_args.plot:
plot_x_label = plot_x_param
if plot_x_param not in ["n_prompt", "n_gen", "n_depth"]:
pretty_name = PRETTY_NAMES.get(plot_x_param, plot_x_param)
pretty_name = LLAMA_BENCH_PRETTY_NAMES.get(plot_x_param, plot_x_param)
if pretty_name in data_headers:
plot_x_index = data_headers.index(pretty_name)
plot_x_label = pretty_name
@@ -746,8 +1057,16 @@ if known_args.plot:
title = ', '.join(title_parts) if title_parts else "Performance comparison"
# Determine y-axis label based on tool type
if tool_type == "llama-bench":
y_label = "Tokens per second (t/s)"
elif tool_type == "test-backend-ops":
y_label = metric_name
else:
assert False
ax.set_xlabel(plot_x_label, fontsize=12, fontweight='bold')
ax.set_ylabel('Tokens per second (t/s)', fontsize=12, fontweight='bold')
ax.set_ylabel(y_label, fontsize=12, fontweight='bold')
ax.set_title(title, fontsize=12, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)
@@ -765,7 +1084,7 @@ if known_args.plot:
plt.savefig(output_file, dpi=300, bbox_inches='tight')
plt.close()
create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale)
create_performance_plot(table, headers, name_baseline, name_compare, known_args.plot, known_args.plot_x, known_args.plot_log_scale, tool, primary_metric)
print(tabulate( # noqa: NP100
table,

View File

@@ -113,6 +113,15 @@ llama_context::llama_context(
}
}
{
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
if (graph_reuse_disable) {
LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
}
}
const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
// in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
const auto gparams = graph_params(res, ubatch, mctx, gtype);
if (res->can_reuse(gparams)) {
if (!graph_reuse_disable && res->can_reuse(gparams)) {
//LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
n_reused++;

View File

@@ -291,6 +291,9 @@ private:
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = false;
// env: LLAMA_GRAPH_REUSE_DISABLE
bool graph_reuse_disable = false;
// perf
mutable int64_t t_start_us = 0;
mutable int64_t t_load_us = 0;

View File

@@ -423,7 +423,9 @@ struct llm_graph_params {
(!ubatch.embd && !other.ubatch.embd)
);
if (can_reuse_ubatch && !ubatch.equal_seqs()) {
// when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
// the reason is because the set of attention streams would be different for different sequences
if (can_reuse_ubatch && ubatch.equal_seqs()) {
if (!ubatch.data) {
// if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
// therefore we cannot perform the sequence id check. normally should never happen