mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-04-15 07:34:21 +02:00
Compare commits
25 Commits
b1465
...
fix-tensor
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47d604fa2d | ||
|
|
73c0010e18 | ||
|
|
c41ea36eaa | ||
|
|
a7fac013cf | ||
|
|
48ade94538 | ||
|
|
05c51f96fe | ||
|
|
f28af0d81a | ||
|
|
d9b33fe95b | ||
|
|
5ba3746171 | ||
|
|
abb77e7319 | ||
|
|
8f961abdc4 | ||
|
|
05816027d6 | ||
|
|
3fdbe6b66b | ||
|
|
629f917cd6 | ||
|
|
51b2fc11f7 | ||
|
|
224e7d5b14 | ||
|
|
c7743fe1c1 | ||
|
|
d6069051de | ||
|
|
4ff1046d75 | ||
|
|
21958bb393 | ||
|
|
2756c4fbff | ||
|
|
1efae9b7dc | ||
|
|
b12fa0d1c1 | ||
|
|
4d719a6d4e | ||
|
|
183b3fac6c |
15
.github/workflows/build.yml
vendored
15
.github/workflows/build.yml
vendored
@@ -288,6 +288,7 @@ jobs:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.6.0
|
||||
SDE_VERSION: 9.21.1-2023-04-24
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -383,11 +384,23 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
|
||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
|
||||
run: |
|
||||
cd build
|
||||
ctest -C Release --verbose --timeout 900
|
||||
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
& $sde -future -- ctest -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -65,7 +65,7 @@ models-mnt
|
||||
/parallel
|
||||
/train-text-from-scratch
|
||||
/vdot
|
||||
build-info.h
|
||||
/common/build-info.cpp
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
CMakeSettings.json
|
||||
|
||||
@@ -10,7 +10,7 @@ endif()
|
||||
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
set(LLAMA_STANDALONE ON)
|
||||
|
||||
# configure project version
|
||||
@@ -100,39 +100,6 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALO
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||
|
||||
#
|
||||
# Build info header
|
||||
#
|
||||
|
||||
# Generate initial build-info.h
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||
|
||||
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
|
||||
|
||||
# Is git submodule
|
||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
|
||||
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
|
||||
endif()
|
||||
|
||||
# Add a custom target for build-info.h
|
||||
add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
|
||||
|
||||
# Add a custom command to rebuild build-info.h when .git/index changes
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
DEPENDS "${GIT_DIR}/index"
|
||||
VERBATIM
|
||||
)
|
||||
else()
|
||||
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
@@ -543,6 +510,10 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
||||
message(STATUS "x86 detected")
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (LLAMA_NATIVE)
|
||||
include(cmake/FindSIMD.cmake)
|
||||
endif ()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
|
||||
|
||||
71
Makefile
71
Makefile
@@ -542,9 +542,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
|
||||
|
||||
common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
|
||||
common.o: common/common.cpp $(COMMON_H_DEPS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
|
||||
@@ -563,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
|
||||
main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
|
||||
|
||||
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
|
||||
@@ -614,7 +614,7 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
|
||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
@@ -623,19 +623,19 @@ llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip
|
||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
@@ -648,7 +648,7 @@ swift: examples/batched.swift
|
||||
(cd examples/batched.swift; make build)
|
||||
endif
|
||||
|
||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
|
||||
@sh scripts/build-info.sh $(CC) > $@.tmp
|
||||
@if ! cmp -s $@.tmp $@; then \
|
||||
mv $@.tmp $@; \
|
||||
@@ -656,13 +656,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
rm $@.tmp; \
|
||||
fi
|
||||
|
||||
build-info.o: common/build-info.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
tests: $(TEST_TARGETS)
|
||||
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
run-benchmark-matmult: benchmark-matmult
|
||||
@@ -676,40 +679,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||

|
||||
|
||||
[](https://github.com/ggerganov/llama.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
|
||||
- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
|
||||
- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912
|
||||
|
||||
----
|
||||
|
||||
|
||||
38
build.zig
38
build.zig
@@ -10,7 +10,6 @@ const Maker = struct {
|
||||
builder: *std.build.Builder,
|
||||
target: CrossTarget,
|
||||
optimize: Mode,
|
||||
config_header: *ConfigHeader,
|
||||
enable_lto: bool,
|
||||
|
||||
include_dirs: ArrayList([]const u8),
|
||||
@@ -41,26 +40,24 @@ const Maker = struct {
|
||||
const commit_hash = try std.ChildProcess.exec(
|
||||
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
|
||||
);
|
||||
const config_header = builder.addConfigHeader(
|
||||
.{ .style = .blank, .include_path = "build-info.h" },
|
||||
.{
|
||||
.BUILD_NUMBER = 0,
|
||||
.BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
|
||||
.BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
|
||||
.BUILD_TARGET = try target.allocDescription(builder.allocator),
|
||||
},
|
||||
);
|
||||
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
|
||||
\\int LLAMA_BUILD_NUMBER = {};
|
||||
\\char const *LLAMA_COMMIT = "{s}";
|
||||
\\char const *LLAMA_COMPILER = "Zig {s}";
|
||||
\\char const *LLAMA_BUILD_TARGET = "{s}";
|
||||
\\
|
||||
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
|
||||
var m = Maker{
|
||||
.builder = builder,
|
||||
.target = target,
|
||||
.optimize = builder.standardOptimizeOption(.{}),
|
||||
.config_header = config_header,
|
||||
.enable_lto = false,
|
||||
.include_dirs = ArrayList([]const u8).init(builder.allocator),
|
||||
.cflags = ArrayList([]const u8).init(builder.allocator),
|
||||
.cxxflags = ArrayList([]const u8).init(builder.allocator),
|
||||
.objs = ArrayList(*Compile).init(builder.allocator),
|
||||
};
|
||||
|
||||
try m.addCFlag("-std=c11");
|
||||
try m.addCxxFlag("-std=c++11");
|
||||
try m.addProjectInclude(&.{});
|
||||
@@ -72,7 +69,7 @@ const Maker = struct {
|
||||
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
||||
if (o.target.getAbi() != .msvc)
|
||||
o.defineCMacro("_GNU_SOURCE", null);
|
||||
o.addConfigHeader(m.config_header);
|
||||
|
||||
if (std.mem.endsWith(u8, src, ".c")) {
|
||||
o.addCSourceFiles(&.{src}, m.cflags.items);
|
||||
o.linkLibC();
|
||||
@@ -85,7 +82,6 @@ const Maker = struct {
|
||||
o.linkLibCpp();
|
||||
}
|
||||
}
|
||||
o.addConfigHeader(m.config_header);
|
||||
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
||||
o.want_lto = m.enable_lto;
|
||||
return o;
|
||||
@@ -105,7 +101,6 @@ const Maker = struct {
|
||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||
e.linkLibCpp();
|
||||
}
|
||||
e.addConfigHeader(m.config_header);
|
||||
m.builder.installArtifact(e);
|
||||
e.want_lto = m.enable_lto;
|
||||
return e;
|
||||
@@ -121,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
||||
const llama = make.obj("llama", "llama.cpp");
|
||||
const buildinfo = make.obj("common", "common/build-info.cpp");
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("console", "common/console.cpp");
|
||||
const sampling = make.obj("sampling", "common/sampling.cpp");
|
||||
@@ -128,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
|
||||
if (server.target.isWindows()) {
|
||||
server.linkSystemLibrary("ws2_32");
|
||||
}
|
||||
|
||||
100
cmake/FindSIMD.cmake
Normal file
100
cmake/FindSIMD.cmake
Normal file
@@ -0,0 +1,100 @@
|
||||
include(CheckCSourceRuns)
|
||||
|
||||
set(AVX_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256 a;
|
||||
a = _mm256_set1_ps(0);
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(AVX512_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0);
|
||||
__m512i b = a;
|
||||
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(AVX2_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256i a = {0};
|
||||
a = _mm256_abs_epi16(a);
|
||||
__m256i x;
|
||||
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(FMA_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
const __m256 d = _mm256_setzero_ps();
|
||||
const __m256 p = _mm256_setzero_ps();
|
||||
acc = _mm256_fmadd_ps( d, p, acc );
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
macro(check_sse type flags)
|
||||
set(__FLAG_I 1)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
foreach (__FLAG ${flags})
|
||||
if (NOT ${type}_FOUND)
|
||||
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
|
||||
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
|
||||
if (HAS_${type}_${__FLAG_I})
|
||||
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
|
||||
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
|
||||
endif()
|
||||
math(EXPR __FLAG_I "${__FLAG_I}+1")
|
||||
endif()
|
||||
endforeach()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
|
||||
if (NOT ${type}_FOUND)
|
||||
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
|
||||
set(${type}_FLAGS "" CACHE STRING "${type} flags")
|
||||
endif()
|
||||
|
||||
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
|
||||
endmacro()
|
||||
|
||||
# flags are for MSVC only!
|
||||
check_sse("AVX" " ;/arch:AVX")
|
||||
if (NOT ${AVX_FOUND})
|
||||
set(LLAMA_AVX OFF)
|
||||
else()
|
||||
set(LLAMA_AVX ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX2" " ;/arch:AVX2")
|
||||
check_sse("FMA" " ;/arch:AVX2")
|
||||
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
||||
set(LLAMA_AVX2 OFF)
|
||||
else()
|
||||
set(LLAMA_AVX2 ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX512" " ;/arch:AVX512")
|
||||
if (NOT ${AVX512_FOUND})
|
||||
set(LLAMA_AVX512 OFF)
|
||||
else()
|
||||
set(LLAMA_AVX512 ON)
|
||||
endif()
|
||||
@@ -1,8 +1,46 @@
|
||||
# common
|
||||
|
||||
|
||||
# Build info header
|
||||
#
|
||||
|
||||
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
|
||||
# Is git submodule
|
||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
|
||||
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
||||
endif()
|
||||
|
||||
set(GIT_INDEX "${GIT_DIR}/index")
|
||||
else()
|
||||
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
||||
set(GIT_INDEX "")
|
||||
endif()
|
||||
|
||||
# Add a custom command to rebuild build-info.cpp when .git/index changes
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||
VERBATIM
|
||||
)
|
||||
set(TARGET build_info)
|
||||
add_library(${TARGET} OBJECT build-info.cpp)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
|
||||
set(TARGET common)
|
||||
|
||||
add_library(${TARGET} OBJECT
|
||||
add_library(${TARGET} STATIC
|
||||
common.h
|
||||
common.cpp
|
||||
sampling.h
|
||||
@@ -21,4 +59,4 @@ endif()
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||
target_link_libraries(${TARGET} PRIVATE llama)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info)
|
||||
|
||||
4
common/build-info.cpp.in
Normal file
4
common/build-info.cpp.in
Normal file
@@ -0,0 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
|
||||
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
@@ -1,5 +1,4 @@
|
||||
#include "common.h"
|
||||
#include "build-info.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -404,6 +403,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_sequences = std::stoi(argv[i]);
|
||||
} else if (arg == "--p-accept" || arg == "-pa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.p_accept = std::stof(argv[i]);
|
||||
} else if (arg == "--p-split" || arg == "-ps") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.p_split = std::stof(argv[i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -779,6 +790,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
|
||||
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
|
||||
printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
|
||||
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
||||
@@ -1199,8 +1212,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
const llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
|
||||
@@ -26,41 +26,51 @@
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern char const *LLAMA_COMMIT;
|
||||
extern char const *LLAMA_COMPILER;
|
||||
extern char const *LLAMA_BUILD_TARGET;
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = -1; // RNG seed
|
||||
uint32_t seed = -1; // RNG seed
|
||||
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
float yarn_ext_factor = NAN; // YaRN extrapolation mix factor
|
||||
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast = 32.0f;// YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
float p_accept = 0.5f; // speculative decoding accept probability
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
||||
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
|
||||
// pinging @cebtenzzre
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
@@ -84,7 +94,7 @@ struct gpt_params {
|
||||
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
// (which is more convenient to use for plotting)
|
||||
//
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
set(TARGET benchmark)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -184,8 +183,8 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "build-info.h"
|
||||
#include "ggml-cuda.h"
|
||||
|
||||
// utils
|
||||
@@ -641,8 +640,8 @@ struct test {
|
||||
}
|
||||
};
|
||||
|
||||
const std::string test::build_commit = BUILD_COMMIT;
|
||||
const int test::build_number = BUILD_NUMBER;
|
||||
const std::string test::build_commit = LLAMA_COMMIT;
|
||||
const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||
const bool test::cuda = !!ggml_cpu_has_cublas();
|
||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
|
||||
@@ -5,9 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if (NOT MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
set(TARGET llava)
|
||||
@@ -15,6 +12,3 @@ add_executable(${TARGET} llava.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
@@ -153,8 +152,8 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
// A basic application simulating a server with multiple clients.
|
||||
// The clients submite requests to the server and they are processed in parallel.
|
||||
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
set(TARGET quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -11,6 +11,3 @@ if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include "../llava/clip.h"
|
||||
@@ -2264,8 +2263,8 @@ int main(int argc, char **argv)
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
LOG_INFO("build info", {{"build", BUILD_NUMBER},
|
||||
{"commit", BUILD_COMMIT}});
|
||||
LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
|
||||
{"commit", LLAMA_COMMIT}});
|
||||
|
||||
LOG_INFO("system info", {
|
||||
{"n_threads", params.n_threads},
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -39,9 +37,11 @@ int main(int argc, char ** argv) {
|
||||
// max number of parallel drafting sequences (i.e. tree branches)
|
||||
const int n_seq_dft = params.n_parallel;
|
||||
|
||||
// TODO: make this configurable
|
||||
const float p_accept = 0.80f;
|
||||
const float p_split = 0.10f;
|
||||
// probability threshold for accepting a token from the draft model
|
||||
const float p_accept = params.p_accept;
|
||||
|
||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||
const float p_split = params.p_split;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
|
||||
105
ggml-cuda.cu
105
ggml-cuda.cu
@@ -982,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
@@ -1086,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
@@ -1190,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
@@ -1444,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
@@ -4254,7 +4254,7 @@ template <bool need_check> static __global__ void
|
||||
|
||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
@@ -4294,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||
// qk = quantized weights per x block
|
||||
// qr = number of quantized weights per data value in x block
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
@@ -4867,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4876,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4885,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4894,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4903,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4913,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4922,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4931,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4946,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4954,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
||||
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4963,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK4_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4972,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK5_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4981,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK5_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4990,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4999,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -5008,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -5017,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -5026,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -5035,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -5054,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
|
||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<1, 1, convert_f16>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -6892,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
||||
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
||||
|
||||
int used_devices = 0;
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
// by default, use all rows
|
||||
row_low[id] = 0;
|
||||
@@ -6919,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
continue;
|
||||
}
|
||||
|
||||
used_devices++;
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
|
||||
@@ -6957,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
// if multiple devices are used they need to wait for the main device
|
||||
// here an event is recorded that signals that the main device has finished calculating the input data
|
||||
if (split && g_device_count > 1) {
|
||||
if (split && used_devices > 1) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
||||
}
|
||||
|
||||
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
||||
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
||||
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
||||
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
||||
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
||||
@@ -7078,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
|
||||
}
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
||||
continue;
|
||||
}
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
// free buffers again when done
|
||||
@@ -7102,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
if (row_low[id] == row_high[id]) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t is = 0; is < is_max; ++is) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
||||
}
|
||||
@@ -7225,7 +7236,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
|
||||
__global__ void k_compute_batched_ptrs(
|
||||
const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
|
||||
void ** ptrs,
|
||||
const void ** ptrs_src, void ** ptrs_dst,
|
||||
int ne12, int ne13,
|
||||
int ne23,
|
||||
int nb02, int nb03,
|
||||
@@ -7242,9 +7253,9 @@ __global__ void k_compute_batched_ptrs(
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
|
||||
ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
|
||||
ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -7350,14 +7361,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||
// use cublasGemmBatchedEx
|
||||
const int ne23 = ne12*ne13;
|
||||
|
||||
void ** ptrs_as = nullptr;
|
||||
size_t ptrs_s = 0;
|
||||
ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
|
||||
const void ** ptrs_src = nullptr;
|
||||
void ** ptrs_dst = nullptr;
|
||||
|
||||
size_t ptrs_src_s = 0;
|
||||
size_t ptrs_dst_s = 0;
|
||||
|
||||
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
||||
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
||||
|
||||
dim3 block_dims(ne13, ne12);
|
||||
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
||||
src0_as_f16, src1_as_f16, dst_f16,
|
||||
ptrs_as,
|
||||
ptrs_src, ptrs_dst,
|
||||
ne12, ne13,
|
||||
ne23,
|
||||
nb02, nb03,
|
||||
@@ -7369,14 +7385,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
|
||||
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
|
||||
ne23,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
ggml_cuda_pool_free(ptrs_as, ptrs_s);
|
||||
if (ptrs_src_s != 0) {
|
||||
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
||||
}
|
||||
if (ptrs_dst_s != 0) {
|
||||
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7389,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||
|
||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool all_on_device =
|
||||
(src0->backend == GGML_BACKEND_GPU) &&
|
||||
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_GPU);
|
||||
|
||||
@@ -7420,7 +7441,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
} else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
// KQ + KQV multi-batch
|
||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
|
||||
64
ggml-metal.m
64
ggml-metal.m
@@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
|
||||
[encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
@@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
||||
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
||||
[encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];
|
||||
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
@@ -1403,7 +1403,8 @@ void ggml_metal_graph_compute(
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
|
||||
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
@@ -1419,34 +1420,35 @@ void ggml_metal_graph_compute(
|
||||
default: GGML_ASSERT(false);
|
||||
};
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:22];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
|
||||
[encoder setBytes:&ext_factor length:sizeof(float) atIndex:24];
|
||||
[encoder setBytes:&attn_factor length:sizeof(float) atIndex:25];
|
||||
[encoder setBytes:&beta_fast length:sizeof(float) atIndex:26];
|
||||
[encoder setBytes:&beta_slow length:sizeof(float) atIndex:27];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
|
||||
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
|
||||
[encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
|
||||
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
|
||||
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
|
||||
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
|
||||
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
|
||||
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
|
||||
@@ -1070,20 +1070,20 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
static void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
|
||||
float * cos_theta, float * sin_theta
|
||||
thread float * cos_theta, thread float * sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
||||
}
|
||||
*cos_theta = cosf(theta) * mscale;
|
||||
*sin_theta = sinf(theta) * mscale;
|
||||
*cos_theta = cos(theta) * mscale;
|
||||
*sin_theta = sin(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
@@ -1123,8 +1123,13 @@ typedef void (rope_t)(
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant int & n_orig_ctx,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
constant float & ext_factor,
|
||||
constant float & attn_factor,
|
||||
constant float & beta_fast,
|
||||
constant float & beta_slow,
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]);
|
||||
@@ -1153,6 +1158,7 @@ kernel void kernel_rope(
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant int & n_orig_ctx,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
constant float & ext_factor,
|
||||
|
||||
59
ggml.c
59
ggml.c
@@ -18811,8 +18811,7 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
|
||||
return n == size;
|
||||
}
|
||||
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
p->n = 0;
|
||||
p->data = NULL;
|
||||
|
||||
@@ -18824,19 +18823,6 @@ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset
|
||||
return ok;
|
||||
}
|
||||
|
||||
static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
p->n = 0;
|
||||
p->data = NULL;
|
||||
|
||||
bool ok = true;
|
||||
|
||||
uint32_t n = 0;
|
||||
ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
|
||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_empty(void) {
|
||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||
|
||||
@@ -18895,20 +18881,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ctx->data = NULL;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t n_tensors = 0;
|
||||
uint32_t n_kv = 0;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
|
||||
|
||||
ctx->header.n_tensors = n_tensors;
|
||||
ctx->header.n_kv = n_kv;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
@@ -18919,12 +18899,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
|
||||
if (ctx->header.version == 1) {
|
||||
gguf_fread_str = gguf_fread_str_v1;
|
||||
}
|
||||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
@@ -18955,15 +18929,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t n = 0;
|
||||
ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
|
||||
kv->value.arr.n = n;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
}
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
|
||||
switch (kv->value.arr.type) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
@@ -19022,14 +18988,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t t = 0;
|
||||
ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
|
||||
info->ne[j] = t;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
|
||||
@@ -393,6 +393,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln1", # yi
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -464,6 +465,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
|
||||
32
llama.cpp
32
llama.cpp
@@ -1837,6 +1837,12 @@ struct llama_model_loader {
|
||||
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
|
||||
}
|
||||
|
||||
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
if (ne.size() == 1) {
|
||||
throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
bool is_ok = true;
|
||||
for (size_t i = 0; i < ne.size(); ++i) {
|
||||
@@ -2817,8 +2823,8 @@ static void llm_load_tensors(
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
@@ -2877,13 +2883,13 @@ static void llm_load_tensors(
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
|
||||
@@ -2949,19 +2955,19 @@ static void llm_load_tensors(
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
@@ -7976,7 +7982,7 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
|
||||
/*.rope_freq_base =*/ 0.0f,
|
||||
/*.rope_freq_scale =*/ 0.0f,
|
||||
/*.yarn_ext_factor =*/ NAN,
|
||||
/*.yarn_ext_factor =*/ -1.0f,
|
||||
/*.yarn_attn_factor =*/ 1.0f,
|
||||
/*.yarn_beta_fast =*/ 32.0f,
|
||||
/*.yarn_beta_slow =*/ 1.0f,
|
||||
@@ -8119,7 +8125,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
|
||||
}
|
||||
|
||||
if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
|
||||
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
||||
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
|
||||
10
llama.h
10
llama.h
@@ -175,11 +175,11 @@ extern "C" {
|
||||
};
|
||||
|
||||
struct llama_context_params {
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
uint32_t n_batch; // prompt processing maximum batch size
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
uint32_t n_batch; // prompt processing maximum batch size
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
|
||||
Binary file not shown.
@@ -1,5 +1,5 @@
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
|
||||
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||
set(BUILD_NUMBER 0)
|
||||
set(BUILD_COMMIT "unknown")
|
||||
set(BUILD_COMPILER "unknown")
|
||||
@@ -24,15 +24,21 @@ if(Git_FOUND)
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE HEAD
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE RES
|
||||
)
|
||||
if (RES EQUAL 0)
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
endif()
|
||||
execute_process(
|
||||
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE COUNT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE RES
|
||||
)
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
if (RES EQUAL 0)
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
@@ -53,22 +59,22 @@ else()
|
||||
set(BUILD_TARGET ${OUT})
|
||||
endif()
|
||||
|
||||
# Only write the header if it's changed to prevent unnecessary recompilation
|
||||
if(EXISTS ${HEADER_FILE})
|
||||
file(READ ${HEADER_FILE} CONTENTS)
|
||||
string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
|
||||
# Only write the build info if it changed
|
||||
if(EXISTS ${OUTPUT_FILE})
|
||||
file(READ ${OUTPUT_FILE} CONTENTS)
|
||||
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
|
||||
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
|
||||
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
else()
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
#ifndef BUILD_INFO_H
|
||||
#define BUILD_INFO_H
|
||||
|
||||
#define BUILD_NUMBER @BUILD_NUMBER@
|
||||
#define BUILD_COMMIT "@BUILD_COMMIT@"
|
||||
#define BUILD_COMPILER "@BUILD_COMPILER@"
|
||||
#define BUILD_TARGET "@BUILD_TARGET@"
|
||||
|
||||
#endif // BUILD_INFO_H
|
||||
@@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then
|
||||
build_target=$out
|
||||
fi
|
||||
|
||||
echo "#ifndef BUILD_INFO_H"
|
||||
echo "#define BUILD_INFO_H"
|
||||
echo
|
||||
echo "#define BUILD_NUMBER $build_number"
|
||||
echo "#define BUILD_COMMIT \"$build_commit\""
|
||||
echo "#define BUILD_COMPILER \"$build_compiler\""
|
||||
echo "#define BUILD_TARGET \"$build_target\""
|
||||
echo
|
||||
echo "#endif // BUILD_INFO_H"
|
||||
echo "int LLAMA_BUILD_NUMBER = ${build_number};"
|
||||
echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
|
||||
echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
|
||||
echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"
|
||||
|
||||
Reference in New Issue
Block a user