mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-04-17 16:42:52 +02:00
Compare commits
30 Commits
b1460
...
fix-tensor
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47d604fa2d | ||
|
|
73c0010e18 | ||
|
|
c41ea36eaa | ||
|
|
a7fac013cf | ||
|
|
48ade94538 | ||
|
|
05c51f96fe | ||
|
|
f28af0d81a | ||
|
|
d9b33fe95b | ||
|
|
5ba3746171 | ||
|
|
abb77e7319 | ||
|
|
8f961abdc4 | ||
|
|
05816027d6 | ||
|
|
3fdbe6b66b | ||
|
|
629f917cd6 | ||
|
|
51b2fc11f7 | ||
|
|
224e7d5b14 | ||
|
|
c7743fe1c1 | ||
|
|
d6069051de | ||
|
|
4ff1046d75 | ||
|
|
21958bb393 | ||
|
|
2756c4fbff | ||
|
|
1efae9b7dc | ||
|
|
b12fa0d1c1 | ||
|
|
4d719a6d4e | ||
|
|
183b3fac6c | ||
|
|
2fffa0d61f | ||
|
|
0eb332a10f | ||
|
|
d02e98cde0 | ||
|
|
898aeca90a | ||
|
|
c43c2da8af |
15
.github/workflows/build.yml
vendored
15
.github/workflows/build.yml
vendored
@@ -288,6 +288,7 @@ jobs:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.6.0
|
||||
SDE_VERSION: 9.21.1-2023-04-24
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -383,11 +384,23 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
|
||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
|
||||
run: |
|
||||
cd build
|
||||
ctest -C Release --verbose --timeout 900
|
||||
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
& $sde -future -- ctest -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -65,7 +65,7 @@ models-mnt
|
||||
/parallel
|
||||
/train-text-from-scratch
|
||||
/vdot
|
||||
build-info.h
|
||||
/common/build-info.cpp
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
CMakeSettings.json
|
||||
|
||||
@@ -10,7 +10,7 @@ endif()
|
||||
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
set(LLAMA_STANDALONE ON)
|
||||
|
||||
# configure project version
|
||||
@@ -100,39 +100,6 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALO
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||
|
||||
#
|
||||
# Build info header
|
||||
#
|
||||
|
||||
# Generate initial build-info.h
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||
|
||||
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
|
||||
|
||||
# Is git submodule
|
||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
|
||||
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
|
||||
endif()
|
||||
|
||||
# Add a custom target for build-info.h
|
||||
add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
|
||||
|
||||
# Add a custom command to rebuild build-info.h when .git/index changes
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
DEPENDS "${GIT_DIR}/index"
|
||||
VERBATIM
|
||||
)
|
||||
else()
|
||||
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
@@ -543,6 +510,10 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
||||
message(STATUS "x86 detected")
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (LLAMA_NATIVE)
|
||||
include(cmake/FindSIMD.cmake)
|
||||
endif ()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
|
||||
|
||||
71
Makefile
71
Makefile
@@ -542,9 +542,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
|
||||
|
||||
common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
|
||||
common.o: common/common.cpp $(COMMON_H_DEPS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
|
||||
@@ -563,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
|
||||
main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
|
||||
|
||||
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
|
||||
@@ -614,7 +614,7 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
|
||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
@@ -623,19 +623,19 @@ llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip
|
||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
@@ -648,7 +648,7 @@ swift: examples/batched.swift
|
||||
(cd examples/batched.swift; make build)
|
||||
endif
|
||||
|
||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
|
||||
@sh scripts/build-info.sh $(CC) > $@.tmp
|
||||
@if ! cmp -s $@.tmp $@; then \
|
||||
mv $@.tmp $@; \
|
||||
@@ -656,13 +656,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
rm $@.tmp; \
|
||||
fi
|
||||
|
||||
build-info.o: common/build-info.cpp
|
||||
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
tests: $(TEST_TARGETS)
|
||||
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
run-benchmark-matmult: benchmark-matmult
|
||||
@@ -676,40 +679,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||

|
||||
|
||||
[](https://github.com/ggerganov/llama.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
|
||||
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
|
||||
- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
|
||||
- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
|
||||
- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912
|
||||
|
||||
----
|
||||
|
||||
|
||||
38
build.zig
38
build.zig
@@ -10,7 +10,6 @@ const Maker = struct {
|
||||
builder: *std.build.Builder,
|
||||
target: CrossTarget,
|
||||
optimize: Mode,
|
||||
config_header: *ConfigHeader,
|
||||
enable_lto: bool,
|
||||
|
||||
include_dirs: ArrayList([]const u8),
|
||||
@@ -41,26 +40,24 @@ const Maker = struct {
|
||||
const commit_hash = try std.ChildProcess.exec(
|
||||
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
|
||||
);
|
||||
const config_header = builder.addConfigHeader(
|
||||
.{ .style = .blank, .include_path = "build-info.h" },
|
||||
.{
|
||||
.BUILD_NUMBER = 0,
|
||||
.BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
|
||||
.BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
|
||||
.BUILD_TARGET = try target.allocDescription(builder.allocator),
|
||||
},
|
||||
);
|
||||
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
|
||||
\\int LLAMA_BUILD_NUMBER = {};
|
||||
\\char const *LLAMA_COMMIT = "{s}";
|
||||
\\char const *LLAMA_COMPILER = "Zig {s}";
|
||||
\\char const *LLAMA_BUILD_TARGET = "{s}";
|
||||
\\
|
||||
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
|
||||
var m = Maker{
|
||||
.builder = builder,
|
||||
.target = target,
|
||||
.optimize = builder.standardOptimizeOption(.{}),
|
||||
.config_header = config_header,
|
||||
.enable_lto = false,
|
||||
.include_dirs = ArrayList([]const u8).init(builder.allocator),
|
||||
.cflags = ArrayList([]const u8).init(builder.allocator),
|
||||
.cxxflags = ArrayList([]const u8).init(builder.allocator),
|
||||
.objs = ArrayList(*Compile).init(builder.allocator),
|
||||
};
|
||||
|
||||
try m.addCFlag("-std=c11");
|
||||
try m.addCxxFlag("-std=c++11");
|
||||
try m.addProjectInclude(&.{});
|
||||
@@ -72,7 +69,7 @@ const Maker = struct {
|
||||
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
||||
if (o.target.getAbi() != .msvc)
|
||||
o.defineCMacro("_GNU_SOURCE", null);
|
||||
o.addConfigHeader(m.config_header);
|
||||
|
||||
if (std.mem.endsWith(u8, src, ".c")) {
|
||||
o.addCSourceFiles(&.{src}, m.cflags.items);
|
||||
o.linkLibC();
|
||||
@@ -85,7 +82,6 @@ const Maker = struct {
|
||||
o.linkLibCpp();
|
||||
}
|
||||
}
|
||||
o.addConfigHeader(m.config_header);
|
||||
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
||||
o.want_lto = m.enable_lto;
|
||||
return o;
|
||||
@@ -105,7 +101,6 @@ const Maker = struct {
|
||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||
e.linkLibCpp();
|
||||
}
|
||||
e.addConfigHeader(m.config_header);
|
||||
m.builder.installArtifact(e);
|
||||
e.want_lto = m.enable_lto;
|
||||
return e;
|
||||
@@ -121,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
||||
const llama = make.obj("llama", "llama.cpp");
|
||||
const buildinfo = make.obj("common", "common/build-info.cpp");
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("console", "common/console.cpp");
|
||||
const sampling = make.obj("sampling", "common/sampling.cpp");
|
||||
@@ -128,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
|
||||
if (server.target.isWindows()) {
|
||||
server.linkSystemLibrary("ws2_32");
|
||||
}
|
||||
|
||||
100
cmake/FindSIMD.cmake
Normal file
100
cmake/FindSIMD.cmake
Normal file
@@ -0,0 +1,100 @@
|
||||
include(CheckCSourceRuns)
|
||||
|
||||
set(AVX_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256 a;
|
||||
a = _mm256_set1_ps(0);
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(AVX512_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0);
|
||||
__m512i b = a;
|
||||
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(AVX2_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256i a = {0};
|
||||
a = _mm256_abs_epi16(a);
|
||||
__m256i x;
|
||||
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
set(FMA_CODE "
|
||||
#include <immintrin.h>
|
||||
int main()
|
||||
{
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
const __m256 d = _mm256_setzero_ps();
|
||||
const __m256 p = _mm256_setzero_ps();
|
||||
acc = _mm256_fmadd_ps( d, p, acc );
|
||||
return 0;
|
||||
}
|
||||
")
|
||||
|
||||
macro(check_sse type flags)
|
||||
set(__FLAG_I 1)
|
||||
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
|
||||
foreach (__FLAG ${flags})
|
||||
if (NOT ${type}_FOUND)
|
||||
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
|
||||
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
|
||||
if (HAS_${type}_${__FLAG_I})
|
||||
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
|
||||
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
|
||||
endif()
|
||||
math(EXPR __FLAG_I "${__FLAG_I}+1")
|
||||
endif()
|
||||
endforeach()
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
|
||||
|
||||
if (NOT ${type}_FOUND)
|
||||
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
|
||||
set(${type}_FLAGS "" CACHE STRING "${type} flags")
|
||||
endif()
|
||||
|
||||
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
|
||||
endmacro()
|
||||
|
||||
# flags are for MSVC only!
|
||||
check_sse("AVX" " ;/arch:AVX")
|
||||
if (NOT ${AVX_FOUND})
|
||||
set(LLAMA_AVX OFF)
|
||||
else()
|
||||
set(LLAMA_AVX ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX2" " ;/arch:AVX2")
|
||||
check_sse("FMA" " ;/arch:AVX2")
|
||||
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
||||
set(LLAMA_AVX2 OFF)
|
||||
else()
|
||||
set(LLAMA_AVX2 ON)
|
||||
endif()
|
||||
|
||||
check_sse("AVX512" " ;/arch:AVX512")
|
||||
if (NOT ${AVX512_FOUND})
|
||||
set(LLAMA_AVX512 OFF)
|
||||
else()
|
||||
set(LLAMA_AVX512 ON)
|
||||
endif()
|
||||
@@ -1,8 +1,46 @@
|
||||
# common
|
||||
|
||||
|
||||
# Build info header
|
||||
#
|
||||
|
||||
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
|
||||
# Is git submodule
|
||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
|
||||
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
||||
endif()
|
||||
|
||||
set(GIT_INDEX "${GIT_DIR}/index")
|
||||
else()
|
||||
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
|
||||
set(GIT_INDEX "")
|
||||
endif()
|
||||
|
||||
# Add a custom command to rebuild build-info.cpp when .git/index changes
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||
VERBATIM
|
||||
)
|
||||
set(TARGET build_info)
|
||||
add_library(${TARGET} OBJECT build-info.cpp)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
|
||||
|
||||
set(TARGET common)
|
||||
|
||||
add_library(${TARGET} OBJECT
|
||||
add_library(${TARGET} STATIC
|
||||
common.h
|
||||
common.cpp
|
||||
sampling.h
|
||||
@@ -21,4 +59,4 @@ endif()
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||
target_link_libraries(${TARGET} PRIVATE llama)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info)
|
||||
|
||||
4
common/build-info.cpp.in
Normal file
4
common/build-info.cpp.in
Normal file
@@ -0,0 +1,4 @@
|
||||
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
|
||||
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
|
||||
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
|
||||
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
|
||||
@@ -1,5 +1,4 @@
|
||||
#include "common.h"
|
||||
#include "build-info.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -219,12 +218,52 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-scaling") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||
else { invalid_param = true; break; }
|
||||
} else if (arg == "--rope-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = 1.0f/std::stof(argv[i]);
|
||||
} else if (arg == "--yarn-orig-ctx") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_orig_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--yarn-ext-factor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_ext_factor = std::stof(argv[i]);
|
||||
} else if (arg == "--yarn-attn-factor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_attn_factor = std::stof(argv[i]);
|
||||
} else if (arg == "--yarn-beta-fast") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_beta_fast = std::stof(argv[i]);
|
||||
} else if (arg == "--yarn-beta-slow") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_beta_slow = std::stof(argv[i]);
|
||||
} else if (arg == "--memory-f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--top-p") {
|
||||
@@ -364,6 +403,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_sequences = std::stoi(argv[i]);
|
||||
} else if (arg == "--p-accept" || arg == "-pa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.p_accept = std::stof(argv[i]);
|
||||
} else if (arg == "--p-split" || arg == "-ps") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.p_split = std::stof(argv[i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -716,9 +767,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --cfg-negative-prompt-file FNAME\n");
|
||||
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
|
||||
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
|
||||
printf(" --rope-scaling {none,linear,yarn}\n");
|
||||
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||
printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
|
||||
printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
|
||||
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
|
||||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
@@ -732,6 +790,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
|
||||
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
|
||||
printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
|
||||
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
||||
@@ -826,17 +886,23 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||
auto cparams = llama_context_default_params();
|
||||
|
||||
cparams.n_ctx = params.n_ctx;
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
cparams.mul_mat_q = params.mul_mat_q;
|
||||
cparams.seed = params.seed;
|
||||
cparams.f16_kv = params.memory_f16;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embedding = params.embedding;
|
||||
cparams.rope_freq_base = params.rope_freq_base;
|
||||
cparams.rope_freq_scale = params.rope_freq_scale;
|
||||
cparams.n_ctx = params.n_ctx;
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
cparams.mul_mat_q = params.mul_mat_q;
|
||||
cparams.seed = params.seed;
|
||||
cparams.f16_kv = params.memory_f16;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embedding = params.embedding;
|
||||
cparams.rope_scaling_type = params.rope_scaling_type;
|
||||
cparams.rope_freq_base = params.rope_freq_base;
|
||||
cparams.rope_freq_scale = params.rope_freq_scale;
|
||||
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
||||
cparams.yarn_attn_factor = params.yarn_attn_factor;
|
||||
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||
|
||||
return cparams;
|
||||
}
|
||||
@@ -1146,8 +1212,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
const llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
@@ -25,35 +26,51 @@
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern char const *LLAMA_COMMIT;
|
||||
extern char const *LLAMA_COMPILER;
|
||||
extern char const *LLAMA_BUILD_TARGET;
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = -1; // RNG seed
|
||||
uint32_t seed = -1; // RNG seed
|
||||
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_parallel = 1; // number of parallel sequences to decode
|
||||
int32_t n_sequences = 1; // number of sequences to decode
|
||||
float p_accept = 0.5f; // speculative decoding accept probability
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
|
||||
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
|
||||
// pinging @cebtenzzre
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
@@ -77,7 +94,7 @@ struct gpt_params {
|
||||
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
// (which is more convenient to use for plotting)
|
||||
//
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||
|
||||
@@ -163,7 +163,8 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
97
convert.py
97
convert.py
@@ -151,8 +151,11 @@ class Params:
|
||||
n_head_kv: int
|
||||
f_norm_eps: float
|
||||
|
||||
rope_scaling_type: gguf.RopeScalingType | None = None
|
||||
f_rope_freq_base: float | None = None
|
||||
f_rope_scale: float | None = None
|
||||
n_orig_ctx: int | None = None
|
||||
rope_finetuned: bool | None = None
|
||||
|
||||
ftype: GGMLFileType | None = None
|
||||
|
||||
@@ -198,20 +201,20 @@ class Params:
|
||||
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"]
|
||||
n_embd = config["hidden_size"]
|
||||
n_layer = config["num_hidden_layers"]
|
||||
n_ff = config["intermediate_size"]
|
||||
n_head = config["num_attention_heads"]
|
||||
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
||||
f_norm_eps = config["rms_norm_eps"]
|
||||
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
||||
|
||||
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
||||
rope_scaling = config.get("rope_scaling")
|
||||
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
|
||||
f_rope_scale = config["rope_scaling"].get("factor")
|
||||
else:
|
||||
f_rope_scale = None
|
||||
|
||||
if rope_scaling is not None and (typ := rope_scaling.get("type")):
|
||||
rope_factor = rope_scaling.get("factor")
|
||||
f_rope_scale = rope_factor
|
||||
if typ == "linear":
|
||||
rope_scaling_type = gguf.RopeScalingType.LINEAR
|
||||
elif typ == "yarn":
|
||||
rope_scaling_type = gguf.RopeScalingType.YARN
|
||||
n_orig_ctx = rope_scaling['original_max_position_embeddings']
|
||||
rope_finetuned = rope_scaling['finetuned']
|
||||
else:
|
||||
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
|
||||
|
||||
if "max_sequence_length" in config:
|
||||
n_ctx = config["max_sequence_length"]
|
||||
@@ -222,16 +225,19 @@ class Params:
|
||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||
|
||||
return Params(
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_layer = n_layer,
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
n_head = n_head,
|
||||
n_head_kv = n_head_kv,
|
||||
f_norm_eps = f_norm_eps,
|
||||
f_rope_freq_base = f_rope_freq_base,
|
||||
f_rope_scale = f_rope_scale,
|
||||
n_vocab = config["vocab_size"],
|
||||
n_embd = config["hidden_size"],
|
||||
n_layer = config["num_hidden_layers"],
|
||||
n_ctx = n_ctx,
|
||||
n_ff = config["intermediate_size"],
|
||||
n_head = (n_head := config["num_attention_heads"]),
|
||||
n_head_kv = config.get("num_key_value_heads", n_head),
|
||||
f_norm_eps = config["rms_norm_eps"],
|
||||
f_rope_freq_base = config.get("rope_theta"),
|
||||
rope_scaling_type = rope_scaling_type,
|
||||
f_rope_scale = f_rope_scale,
|
||||
n_orig_ctx = n_orig_ctx,
|
||||
rope_finetuned = rope_finetuned,
|
||||
)
|
||||
|
||||
# LLaMA v2 70B params.json
|
||||
@@ -240,17 +246,8 @@ class Params:
|
||||
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
|
||||
n_embd = config["dim"]
|
||||
n_layer = config["n_layers"]
|
||||
n_ff = -1
|
||||
n_head = config["n_heads"]
|
||||
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
|
||||
f_norm_eps = config["norm_eps"]
|
||||
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
||||
|
||||
# hack to determine LLaMA v1 vs v2 vs CodeLlama
|
||||
if f_rope_freq_base == 1000000:
|
||||
if config.get("rope_theta") == 1000000:
|
||||
# CodeLlama
|
||||
n_ctx = 16384
|
||||
elif config["norm_eps"] == 1e-05:
|
||||
@@ -260,22 +257,16 @@ class Params:
|
||||
# LLaMA v1
|
||||
n_ctx = 2048
|
||||
|
||||
if n_vocab == -1:
|
||||
n_vocab = model["tok_embeddings.weight"].shape[0]
|
||||
|
||||
if n_ff == -1:
|
||||
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
|
||||
|
||||
return Params(
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_layer = n_layer,
|
||||
n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
|
||||
n_embd = config["dim"],
|
||||
n_layer = config["n_layers"],
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
n_head = n_head,
|
||||
n_head_kv = n_head_kv,
|
||||
f_norm_eps = f_norm_eps,
|
||||
f_rope_freq_base = f_rope_freq_base,
|
||||
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
|
||||
n_head = (n_head := config["n_heads"]),
|
||||
n_head_kv = config.get("n_kv_heads", n_head),
|
||||
f_norm_eps = config["norm_eps"],
|
||||
f_rope_freq_base = config.get("rope_theta"),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@@ -831,8 +822,16 @@ class OutputFile:
|
||||
if params.f_rope_freq_base is not None:
|
||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||
|
||||
if params.f_rope_scale is not None:
|
||||
self.gguf.add_rope_scale_linear(params.f_rope_scale)
|
||||
if params.rope_scaling_type:
|
||||
assert params.f_rope_scale is not None
|
||||
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
|
||||
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
|
||||
|
||||
if params.n_orig_ctx is not None:
|
||||
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
|
||||
|
||||
if params.rope_finetuned is not None:
|
||||
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
|
||||
|
||||
if params.ftype is not None:
|
||||
self.gguf.add_file_type(params.ftype)
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
set(TARGET benchmark)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -642,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
const int rope_mode = 0;
|
||||
|
||||
return ggml_rope_custom(ctx,
|
||||
t, KQ_pos, n_rot, rope_mode, n_ctx,
|
||||
rope_freq_base, rope_freq_scale);
|
||||
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
|
||||
rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
|
||||
);
|
||||
};
|
||||
|
||||
set_name(tokens_input, "tokens_input");
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <cassert>
|
||||
@@ -184,8 +183,8 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -19,7 +19,6 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "build-info.h"
|
||||
#include "ggml-cuda.h"
|
||||
|
||||
// utils
|
||||
@@ -641,8 +640,8 @@ struct test {
|
||||
}
|
||||
};
|
||||
|
||||
const std::string test::build_commit = BUILD_COMMIT;
|
||||
const int test::build_number = BUILD_NUMBER;
|
||||
const std::string test::build_commit = LLAMA_COMMIT;
|
||||
const int test::build_number = LLAMA_BUILD_NUMBER;
|
||||
const bool test::cuda = !!ggml_cpu_has_cublas();
|
||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
|
||||
@@ -5,9 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if (NOT MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
set(TARGET llava)
|
||||
@@ -15,6 +12,3 @@ add_executable(${TARGET} llava.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
@@ -153,8 +152,8 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
// A basic application simulating a server with multiple clients.
|
||||
// The clients submite requests to the server and they are processed in parallel.
|
||||
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
set(TARGET quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -11,6 +11,3 @@ if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include "../llava/clip.h"
|
||||
@@ -1755,12 +1754,18 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-scaling {none,linear,yarn}\n");
|
||||
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
|
||||
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
|
||||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
@@ -1881,6 +1886,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-scaling")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
else if (arg == "--rope-freq-base")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -1899,6 +1917,38 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--yarn-ext-factor")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_ext_factor = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--yarn-attn-factor")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_attn_factor = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--yarn-beta-fast")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_beta_fast = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--yarn-beta-slow")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.yarn_beta_slow = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
||||
{
|
||||
params.memory_f16 = false;
|
||||
@@ -2213,8 +2263,8 @@ int main(int argc, char **argv)
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
LOG_INFO("build info", {{"build", BUILD_NUMBER},
|
||||
{"commit", BUILD_COMMIT}});
|
||||
LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
|
||||
{"commit", LLAMA_COMMIT}});
|
||||
|
||||
LOG_INFO("system info", {
|
||||
{"n_threads", params.n_threads},
|
||||
|
||||
@@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -39,9 +37,11 @@ int main(int argc, char ** argv) {
|
||||
// max number of parallel drafting sequences (i.e. tree branches)
|
||||
const int n_seq_dft = params.n_parallel;
|
||||
|
||||
// TODO: make this configurable
|
||||
const float p_accept = 0.80f;
|
||||
const float p_split = 0.10f;
|
||||
// probability threshold for accepting a token from the draft model
|
||||
const float p_accept = params.p_accept;
|
||||
|
||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||
const float p_split = params.p_split;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
|
||||
@@ -349,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||
// not capturing these, to silcence warnings
|
||||
const int rope_mode = 0;
|
||||
|
||||
return ggml_rope_custom(ctx,
|
||||
t, KQ_pos, n_rot, rope_mode, n_ctx,
|
||||
rope_freq_base, rope_freq_scale);
|
||||
return ggml_rope_custom(
|
||||
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
||||
);
|
||||
};
|
||||
|
||||
set_name(tokens_input, "tokens_input");
|
||||
|
||||
310
ggml-cuda.cu
310
ggml-cuda.cu
@@ -982,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
@@ -1086,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
@@ -1190,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
@@ -1444,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
if (row > nrows) return;
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
@@ -4254,7 +4254,7 @@ template <bool need_check> static __global__ void
|
||||
|
||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
@@ -4294,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||
// qk = quantized weights per x block
|
||||
// qr = number of quantized weights per data value in x block
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
@@ -4493,11 +4493,41 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
// rope == RoPE == rotary positional embedding
|
||||
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
struct rope_corr_dims {
|
||||
float v[4];
|
||||
};
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
static __device__ void rope_yarn(
|
||||
float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
|
||||
float * cos_theta, float * sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
|
||||
}
|
||||
*cos_theta = cosf(theta) * mscale;
|
||||
*sin_theta = sinf(theta) * mscale;
|
||||
}
|
||||
|
||||
// rope == RoPE == rotary positional embedding
|
||||
template<typename T, bool has_pos>
|
||||
static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
|
||||
const int p_delta_rows, const float theta_scale) {
|
||||
static __global__ void rope(
|
||||
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
||||
) {
|
||||
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (col >= ncols) {
|
||||
@@ -4509,10 +4539,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
const int p = has_pos ? pos[i2] : 0;
|
||||
const float p0 = p*freq_scale;
|
||||
const float theta = p0*powf(theta_scale, col/2);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta);
|
||||
const float theta_base = p*powf(freq_base, -float(col)/ncols);
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + 1];
|
||||
@@ -4522,8 +4552,10 @@ static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t
|
||||
}
|
||||
|
||||
template<typename T, bool has_pos>
|
||||
static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
|
||||
const int p_delta_rows, const float theta_scale) {
|
||||
static __global__ void rope_neox(
|
||||
const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
||||
float ext_factor, float attn_factor, rope_corr_dims corr_dims
|
||||
) {
|
||||
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (col >= ncols) {
|
||||
@@ -4534,11 +4566,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
|
||||
const int i = row*ncols + col/2;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
// simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
|
||||
const float cur_rot = -float(col)/ncols;
|
||||
|
||||
const int p = has_pos ? pos[i2] : 0;
|
||||
const float p0 = p*freq_scale;
|
||||
const float theta = p0*powf(theta_scale, col/2);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta);
|
||||
const float theta_base = p*powf(freq_base, cur_rot);
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + ncols/2];
|
||||
@@ -4547,8 +4582,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in
|
||||
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
|
||||
const int p_delta_rows, const float theta_scale, const int n_ctx) {
|
||||
static __global__ void rope_glm_f32(
|
||||
const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
|
||||
int n_ctx
|
||||
) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int half_n_dims = ncols/4;
|
||||
|
||||
@@ -4560,7 +4597,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
|
||||
const int i = row*ncols + col;
|
||||
const int i2 = row/p_delta_rows;
|
||||
|
||||
const float col_theta_scale = powf(theta_scale, col);
|
||||
const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
|
||||
// FIXME: this is likely wrong
|
||||
const int p = pos != nullptr ? pos[i2] : 0;
|
||||
|
||||
@@ -4830,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4839,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4848,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4857,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4866,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
|
||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -4876,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4885,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4894,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4909,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
@@ -4917,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
||||
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4926,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK4_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4935,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK5_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4944,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK5_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4953,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4962,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4971,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4980,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4989,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4998,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
|
||||
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -5017,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
|
||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<1, 1, convert_f16>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -5584,40 +5622,54 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
|
||||
const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||
static void rope_cuda(
|
||||
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
|
||||
) {
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||
if (pos == nullptr) {
|
||||
rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
|
||||
rope<T, false><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
|
||||
);
|
||||
} else {
|
||||
rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
|
||||
rope<T, true><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
|
||||
const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||
static void rope_neox_cuda(
|
||||
const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
|
||||
) {
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||
if (pos == nullptr) {
|
||||
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
|
||||
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
|
||||
);
|
||||
} else {
|
||||
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
|
||||
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
|
||||
x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
|
||||
const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
|
||||
static void rope_glm_f32_cuda(
|
||||
const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
|
||||
float freq_base, int n_ctx, cudaStream_t stream
|
||||
) {
|
||||
GGML_ASSERT(ncols % 4 == 0);
|
||||
const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
|
||||
const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
|
||||
const dim3 block_nums(num_blocks_x, nrows, 1);
|
||||
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
|
||||
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
|
||||
}
|
||||
|
||||
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
|
||||
@@ -6477,17 +6529,20 @@ inline void ggml_cuda_op_rope(
|
||||
const int64_t ne2 = dst->ne[2];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
// RoPE alteration for extended context
|
||||
|
||||
float freq_base, freq_scale;
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
|
||||
const int32_t * pos = nullptr;
|
||||
if ((mode & 1) == 0) {
|
||||
@@ -6499,24 +6554,39 @@ inline void ggml_cuda_op_rope(
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
|
||||
rope_corr_dims corr_dims;
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
|
||||
|
||||
// compute
|
||||
if (is_glm) {
|
||||
GGML_ASSERT(false);
|
||||
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
|
||||
rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
|
||||
} else if (is_neox) {
|
||||
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
|
||||
rope_neox_cuda(
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream
|
||||
);
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
|
||||
rope_neox_cuda(
|
||||
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} else {
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
|
||||
rope_cuda(
|
||||
(const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream
|
||||
);
|
||||
} else if (src0->type == GGML_TYPE_F16) {
|
||||
rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
|
||||
rope_cuda(
|
||||
(const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
|
||||
attn_factor, corr_dims, main_stream
|
||||
);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -6627,8 +6697,10 @@ inline void ggml_cuda_op_clamp(
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const float min = ((float *) dst->op_params)[0];
|
||||
const float max = ((float *) dst->op_params)[1];
|
||||
float min;
|
||||
float max;
|
||||
memcpy(&min, dst->op_params, sizeof(float));
|
||||
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
@@ -6821,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
||||
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
||||
|
||||
int used_devices = 0;
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
// by default, use all rows
|
||||
row_low[id] = 0;
|
||||
@@ -6848,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
continue;
|
||||
}
|
||||
|
||||
used_devices++;
|
||||
|
||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||
|
||||
@@ -6886,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
// if multiple devices are used they need to wait for the main device
|
||||
// here an event is recorded that signals that the main device has finished calculating the input data
|
||||
if (split && g_device_count > 1) {
|
||||
if (split && used_devices > 1) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
||||
}
|
||||
|
||||
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
||||
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
||||
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
||||
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
||||
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
||||
@@ -7007,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
|
||||
}
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
||||
continue;
|
||||
}
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
// free buffers again when done
|
||||
@@ -7031,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
if (row_low[id] == row_high[id]) {
|
||||
continue;
|
||||
}
|
||||
for (int64_t is = 0; is < is_max; ++is) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
||||
}
|
||||
@@ -7152,6 +7234,30 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
||||
}
|
||||
|
||||
__global__ void k_compute_batched_ptrs(
|
||||
const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
|
||||
const void ** ptrs_src, void ** ptrs_dst,
|
||||
int ne12, int ne13,
|
||||
int ne23,
|
||||
int nb02, int nb03,
|
||||
int nb12, int nb13,
|
||||
int nb2, int nb3,
|
||||
int r2, int r3) {
|
||||
int i13 = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int i12 = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (i13 >= ne13 || i12 >= ne12) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
@@ -7253,49 +7359,45 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
} else {
|
||||
// use cublasGemmBatchedEx
|
||||
// TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
|
||||
const int ne23 = ne12*ne13;
|
||||
|
||||
// TODO: avoid this alloc
|
||||
void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
|
||||
const void ** ptrs_src = nullptr;
|
||||
void ** ptrs_dst = nullptr;
|
||||
|
||||
for (int i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
size_t ptrs_src_s = 0;
|
||||
size_t ptrs_dst_s = 0;
|
||||
|
||||
ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
|
||||
ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
|
||||
ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
|
||||
}
|
||||
}
|
||||
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
||||
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
||||
|
||||
// allocate device memory for pointers
|
||||
void ** ptrs_as = nullptr;
|
||||
CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
|
||||
|
||||
// TODO: this does not work for some reason -- not sure why?
|
||||
//size_t ptrs_s = 0;
|
||||
//ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
|
||||
|
||||
// copy pointers to device
|
||||
CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
|
||||
|
||||
free(ptrs);
|
||||
dim3 block_dims(ne13, ne12);
|
||||
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
||||
src0_as_f16, src1_as_f16, dst_f16,
|
||||
ptrs_src, ptrs_dst,
|
||||
ne12, ne13,
|
||||
ne23,
|
||||
nb02, nb03,
|
||||
nb12, nb13,
|
||||
dst->nb[2], dst->nb[3],
|
||||
r2, r3);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
|
||||
&alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
|
||||
ne23,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
// free device memory for pointers
|
||||
CUDA_CHECK(cudaFree(ptrs_as));
|
||||
//ggml_cuda_pool_free(ptrs_as, ptrs_s);
|
||||
if (ptrs_src_s != 0) {
|
||||
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
||||
}
|
||||
if (ptrs_dst_s != 0) {
|
||||
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7308,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||
|
||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool all_on_device =
|
||||
(src0->backend == GGML_BACKEND_GPU) &&
|
||||
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||
(src1->backend == GGML_BACKEND_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_GPU);
|
||||
|
||||
@@ -7339,7 +7441,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
} else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
// KQ + KQV multi-batch
|
||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
|
||||
76
ggml-metal.m
76
ggml-metal.m
@@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
|
||||
[encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
@@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
||||
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
||||
[encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];
|
||||
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
@@ -1400,14 +1400,19 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const int nth = MIN(1024, ne00);
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
|
||||
@@ -1415,30 +1420,35 @@ void ggml_metal_graph_compute(
|
||||
default: GGML_ASSERT(false);
|
||||
};
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:22];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
|
||||
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
|
||||
[encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
|
||||
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
|
||||
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
|
||||
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
|
||||
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
|
||||
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
|
||||
@@ -1061,6 +1061,45 @@ kernel void kernel_alibi_f32(
|
||||
}
|
||||
}
|
||||
|
||||
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
static void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
|
||||
thread float * cos_theta, thread float * sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
||||
}
|
||||
*cos_theta = cos(theta) * mscale;
|
||||
*sin_theta = sin(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
|
||||
return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
|
||||
}
|
||||
|
||||
static void rope_yarn_corr_dims(
|
||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
|
||||
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
|
||||
}
|
||||
|
||||
typedef void (rope_t)(
|
||||
device const void * src0,
|
||||
device const int32_t * src1,
|
||||
@@ -1084,8 +1123,13 @@ typedef void (rope_t)(
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant int & n_orig_ctx,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
constant float & ext_factor,
|
||||
constant float & attn_factor,
|
||||
constant float & beta_fast,
|
||||
constant float & beta_slow,
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]);
|
||||
@@ -1114,8 +1158,13 @@ kernel void kernel_rope(
|
||||
constant int & n_past,
|
||||
constant int & n_dims,
|
||||
constant int & mode,
|
||||
constant int & n_orig_ctx,
|
||||
constant float & freq_base,
|
||||
constant float & freq_scale,
|
||||
constant float & ext_factor,
|
||||
constant float & attn_factor,
|
||||
constant float & beta_fast,
|
||||
constant float & beta_slow,
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
@@ -1125,19 +1174,22 @@ kernel void kernel_rope(
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = src1;
|
||||
|
||||
const int64_t p = pos[i2];
|
||||
|
||||
const float theta_0 = freq_scale * (float)p;
|
||||
const float theta_0 = (float)p;
|
||||
const float inv_ndims = -1.f/n_dims;
|
||||
|
||||
if (!is_neox) {
|
||||
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
|
||||
|
||||
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
|
||||
const float cos_theta = cos(theta);
|
||||
const float sin_theta = sin(theta);
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
@@ -1152,9 +1204,12 @@ kernel void kernel_rope(
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
|
||||
|
||||
const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
|
||||
const float cos_theta = cos(theta);
|
||||
const float sin_theta = sin(theta);
|
||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||
const float cur_rot = inv_ndims*ic - ib;
|
||||
|
||||
const float theta = theta_0 * pow(freq_base, cur_rot);
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
|
||||
300
ggml.c
300
ggml.c
@@ -1,4 +1,5 @@
|
||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
|
||||
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
@@ -4845,8 +4846,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
int n_orig_ctx,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow,
|
||||
float xpos_base,
|
||||
bool xpos_down,
|
||||
bool inplace) {
|
||||
@@ -4862,11 +4868,15 @@ static struct ggml_tensor * ggml_rope_impl(
|
||||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
|
||||
memcpy(params + 4, &freq_base, sizeof(float));
|
||||
memcpy(params + 5, &freq_scale, sizeof(float));
|
||||
memcpy(params + 6, &xpos_base, sizeof(float));
|
||||
memcpy(params + 7, &xpos_down, sizeof(bool));
|
||||
int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
|
||||
memcpy(params + 5, &freq_base, sizeof(float));
|
||||
memcpy(params + 6, &freq_scale, sizeof(float));
|
||||
memcpy(params + 7, &ext_factor, sizeof(float));
|
||||
memcpy(params + 8, &attn_factor, sizeof(float));
|
||||
memcpy(params + 9, &beta_fast, sizeof(float));
|
||||
memcpy(params + 10, &beta_slow, sizeof(float));
|
||||
memcpy(params + 11, &xpos_base, sizeof(float));
|
||||
memcpy(params + 12, &xpos_down, sizeof(bool));
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_ROPE;
|
||||
@@ -4884,7 +4894,9 @@ struct ggml_tensor * ggml_rope(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
|
||||
return ggml_rope_impl(
|
||||
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
|
||||
);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_inplace(
|
||||
@@ -4894,7 +4906,9 @@ struct ggml_tensor * ggml_rope_inplace(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
|
||||
return ggml_rope_impl(
|
||||
ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
|
||||
);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom(
|
||||
@@ -4904,9 +4918,17 @@ struct ggml_tensor * ggml_rope_custom(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
int n_orig_ctx,
|
||||
float freq_base,
|
||||
float freq_scale) {
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow) {
|
||||
return ggml_rope_impl(
|
||||
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
|
||||
);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
@@ -4916,9 +4938,17 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
int n_orig_ctx,
|
||||
float freq_base,
|
||||
float freq_scale) {
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow) {
|
||||
return ggml_rope_impl(
|
||||
ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
|
||||
);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_xpos_inplace(
|
||||
@@ -4928,7 +4958,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace(
|
||||
int n_dims,
|
||||
float base,
|
||||
bool down) {
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
|
||||
return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
|
||||
}
|
||||
|
||||
// ggml_rope_back
|
||||
@@ -10901,6 +10931,45 @@ static void ggml_compute_forward_clamp(
|
||||
|
||||
// ggml_compute_forward_rope
|
||||
|
||||
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
|
||||
return 1 - MIN(1, MAX(0, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
static void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
|
||||
float * cos_theta, float * sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
|
||||
}
|
||||
*cos_theta = cosf(theta) * mscale;
|
||||
*sin_theta = sinf(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
|
||||
return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
||||
}
|
||||
|
||||
void ggml_rope_yarn_corr_dims(
|
||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)));
|
||||
dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)));
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_rope_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
@@ -10910,21 +10979,26 @@ static void ggml_compute_forward_rope_f32(
|
||||
return;
|
||||
}
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
|
||||
// these two only relevant for xPos RoPE:
|
||||
float xpos_base;
|
||||
bool xpos_down;
|
||||
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
|
||||
memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
@@ -10952,6 +11026,9 @@ static void ggml_compute_forward_rope_f32(
|
||||
int ir = 0;
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float inv_ndims = -1.f/n_dims;
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
@@ -10965,18 +11042,18 @@ static void ggml_compute_forward_rope_f32(
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
|
||||
float theta = freq_scale * (float)p;
|
||||
float theta_base = (float)p;
|
||||
|
||||
if (is_glm) {
|
||||
theta = MIN(p, n_ctx - 2);
|
||||
theta_base = MIN(p, n_ctx - 2);
|
||||
float block_theta = MAX(p - (n_ctx - 2), 0);
|
||||
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta_base);
|
||||
const float sin_theta = sinf(theta_base);
|
||||
const float cos_block_theta = cosf(block_theta);
|
||||
const float sin_block_theta = sinf(block_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
block_theta *= theta_scale;
|
||||
|
||||
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
@@ -10994,13 +11071,16 @@ static void ggml_compute_forward_rope_f32(
|
||||
}
|
||||
} else if (!is_neox) {
|
||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(
|
||||
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
||||
);
|
||||
|
||||
// zeta scaling for xPos only:
|
||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
||||
if (xpos_down) zeta = 1.0f / zeta;
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
@@ -11014,12 +11094,19 @@ static void ggml_compute_forward_rope_f32(
|
||||
} else {
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
theta_base *= freq_scale;
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||
float cur_rot = inv_ndims * ic - ib;
|
||||
|
||||
theta *= theta_scale;
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(
|
||||
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
||||
&cos_theta, &sin_theta
|
||||
);
|
||||
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
@@ -11048,15 +11135,19 @@ static void ggml_compute_forward_rope_f16(
|
||||
return;
|
||||
}
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
||||
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
||||
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
@@ -11084,6 +11175,9 @@ static void ggml_compute_forward_rope_f16(
|
||||
int ir = 0;
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float inv_ndims = -1.f/n_dims;
|
||||
float corr_dims[2];
|
||||
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
@@ -11097,18 +11191,18 @@ static void ggml_compute_forward_rope_f16(
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
|
||||
float theta = freq_scale * (float)p;
|
||||
float theta_base = (float)p;
|
||||
|
||||
if (is_glm) {
|
||||
theta = MIN(p, n_ctx - 2);
|
||||
theta_base = MIN(p, n_ctx - 2);
|
||||
float block_theta = MAX(p - (n_ctx - 2), 0);
|
||||
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta_base);
|
||||
const float sin_theta = sinf(theta_base);
|
||||
const float cos_block_theta = cosf(block_theta);
|
||||
const float sin_block_theta = sinf(block_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
block_theta *= theta_scale;
|
||||
|
||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
@@ -11126,10 +11220,12 @@ static void ggml_compute_forward_rope_f16(
|
||||
}
|
||||
} else if (!is_neox) {
|
||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(
|
||||
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
||||
);
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
@@ -11143,12 +11239,19 @@ static void ggml_compute_forward_rope_f16(
|
||||
} else {
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
theta_base *= freq_scale;
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
||||
float cur_rot = inv_ndims * ic - ib;
|
||||
|
||||
theta *= theta_scale;
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(
|
||||
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
||||
&cos_theta, &sin_theta
|
||||
);
|
||||
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
@@ -11256,17 +11359,18 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
|
||||
float theta = freq_scale * (float)p;
|
||||
float theta_base = freq_scale * (float)p;
|
||||
|
||||
if (!is_neox) {
|
||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta_base);
|
||||
const float sin_theta = sinf(theta_base);
|
||||
|
||||
// zeta scaling for xPos only:
|
||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
||||
if (xpos_down) zeta = 1.0f / zeta;
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
@@ -11280,10 +11384,10 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
} else {
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta_base);
|
||||
const float sin_theta = sinf(theta_base);
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
@@ -11356,14 +11460,14 @@ static void ggml_compute_forward_rope_back_f16(
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
|
||||
float theta = (float)p;
|
||||
float theta_base = (float)p;
|
||||
|
||||
if (!is_neox) {
|
||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta_base);
|
||||
const float sin_theta = sinf(theta_base);
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||
ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
@@ -11377,10 +11481,10 @@ static void ggml_compute_forward_rope_back_f16(
|
||||
} else {
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta_base);
|
||||
const float sin_theta = sinf(theta_base);
|
||||
|
||||
theta *= theta_scale;
|
||||
theta_base *= theta_scale;
|
||||
|
||||
const int64_t i0 = ib*n_dims + ic/2;
|
||||
|
||||
@@ -15505,9 +15609,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
src1,
|
||||
n_dims,
|
||||
mode,
|
||||
0,
|
||||
n_ctx,
|
||||
freq_base,
|
||||
freq_scale,
|
||||
0.0f,
|
||||
1.0f,
|
||||
0.0f,
|
||||
0.0f,
|
||||
xpos_base,
|
||||
xpos_down,
|
||||
false),
|
||||
@@ -18702,8 +18811,7 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
|
||||
return n == size;
|
||||
}
|
||||
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
p->n = 0;
|
||||
p->data = NULL;
|
||||
|
||||
@@ -18715,19 +18823,6 @@ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset
|
||||
return ok;
|
||||
}
|
||||
|
||||
static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
|
||||
p->n = 0;
|
||||
p->data = NULL;
|
||||
|
||||
bool ok = true;
|
||||
|
||||
uint32_t n = 0;
|
||||
ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
|
||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
struct gguf_context * gguf_init_empty(void) {
|
||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||
|
||||
@@ -18786,20 +18881,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ctx->data = NULL;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t n_tensors = 0;
|
||||
uint32_t n_kv = 0;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset);
|
||||
|
||||
ctx->header.n_tensors = n_tensors;
|
||||
ctx->header.n_kv = n_kv;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
|
||||
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
|
||||
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
@@ -18810,12 +18899,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
|
||||
if (ctx->header.version == 1) {
|
||||
gguf_fread_str = gguf_fread_str_v1;
|
||||
}
|
||||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
@@ -18846,15 +18929,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
|
||||
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t n = 0;
|
||||
ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
|
||||
kv->value.arr.n = n;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
}
|
||||
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
|
||||
|
||||
switch (kv->value.arr.type) {
|
||||
case GGUF_TYPE_UINT8:
|
||||
@@ -18913,14 +18988,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
if (ctx->header.version == 1) {
|
||||
// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
|
||||
uint32_t t = 0;
|
||||
ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
|
||||
info->ne[j] = t;
|
||||
} else {
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
|
||||
20
ggml.h
20
ggml.h
@@ -219,7 +219,7 @@
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_SRC 6
|
||||
#define GGML_MAX_NAME 64
|
||||
#define GGML_MAX_OP_PARAMS 32
|
||||
#define GGML_MAX_OP_PARAMS 64
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||
@@ -1326,8 +1326,13 @@ extern "C" {
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
int n_orig_ctx,
|
||||
float freq_base,
|
||||
float freq_scale);
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
@@ -1337,8 +1342,17 @@ extern "C" {
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
int n_orig_ctx,
|
||||
float freq_base,
|
||||
float freq_scale);
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
// compute correction dims for YaRN RoPE scaling
|
||||
void ggml_rope_yarn_corr_dims(
|
||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||
|
||||
// xPos RoPE, in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
|
||||
|
||||
@@ -7,7 +7,7 @@ import shutil
|
||||
import struct
|
||||
import sys
|
||||
import tempfile
|
||||
from enum import IntEnum, auto
|
||||
from enum import Enum, IntEnum, auto
|
||||
from io import BufferedWriter
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, BinaryIO, Callable, Sequence
|
||||
@@ -53,9 +53,12 @@ KEY_ATTENTION_LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
||||
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||
|
||||
# RoPE
|
||||
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
|
||||
KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear"
|
||||
KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base"
|
||||
KEY_ROPE_SCALING_TYPE = "{arch}.rope.scaling.type"
|
||||
KEY_ROPE_SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
||||
KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
||||
KEY_ROPE_SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||
|
||||
# tokenization
|
||||
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
|
||||
@@ -390,6 +393,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln1", # yi
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -461,6 +465,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
@@ -577,6 +582,11 @@ class TokenType(IntEnum):
|
||||
UNUSED = 5
|
||||
BYTE = 6
|
||||
|
||||
class RopeScalingType(Enum):
|
||||
NONE = 'none'
|
||||
LINEAR = 'linear'
|
||||
YARN = 'yarn'
|
||||
|
||||
#
|
||||
# implementation
|
||||
#
|
||||
@@ -948,8 +958,17 @@ class GGUFWriter:
|
||||
def add_rope_freq_base(self, value: float):
|
||||
self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_scale_linear(self, value: float):
|
||||
self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
|
||||
def add_rope_scaling_type(self, value: RopeScalingType):
|
||||
self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value)
|
||||
|
||||
def add_rope_scaling_factor(self, value: float):
|
||||
self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_scaling_orig_ctx_len(self, value: int):
|
||||
self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_scaling_finetuned(self, value: bool):
|
||||
self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value)
|
||||
|
||||
def add_tokenizer_model(self, model: str):
|
||||
self.add_string(KEY_TOKENIZER_MODEL, model)
|
||||
|
||||
268
llama.cpp
268
llama.cpp
@@ -54,6 +54,7 @@
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
#include <cstdarg>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
@@ -235,6 +236,10 @@ enum llm_kv {
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
LLM_KV_ROPE_SCALE_LINEAR,
|
||||
LLM_KV_ROPE_SCALING_TYPE,
|
||||
LLM_KV_ROPE_SCALING_FACTOR,
|
||||
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
|
||||
LLM_KV_ROPE_SCALING_FINETUNED,
|
||||
|
||||
LLM_KV_TOKENIZER_MODEL,
|
||||
LLM_KV_TOKENIZER_LIST,
|
||||
@@ -276,9 +281,13 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
||||
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
||||
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
|
||||
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
|
||||
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
|
||||
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
|
||||
|
||||
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
||||
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
||||
@@ -552,6 +561,22 @@ do { \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
|
||||
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
||||
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
||||
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
||||
};
|
||||
|
||||
static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
||||
if (kv.second == name) {
|
||||
return kv.first;
|
||||
}
|
||||
}
|
||||
|
||||
return LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
}
|
||||
|
||||
//
|
||||
// ggml helpers
|
||||
//
|
||||
@@ -1035,8 +1060,11 @@ struct llama_hparams {
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
uint32_t n_yarn_orig_ctx;
|
||||
int8_t rope_scaling_type_train : 3;
|
||||
bool rope_finetuned : 1;
|
||||
|
||||
float f_clamp_kqv;
|
||||
float f_max_alibi_bias;
|
||||
@@ -1051,6 +1079,8 @@ struct llama_hparams {
|
||||
if (this->n_layer != other.n_layer) return true;
|
||||
if (this->n_rot != other.n_rot) return true;
|
||||
if (this->n_ff != other.n_ff) return true;
|
||||
if (this->rope_finetuned != other.rope_finetuned) return true;
|
||||
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
||||
|
||||
const float EPSILON = 1e-9;
|
||||
|
||||
@@ -1081,8 +1111,16 @@ struct llama_cparams {
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
|
||||
uint32_t n_yarn_orig_ctx;
|
||||
// These hyperparameters are not exposed in GGUF, because all
|
||||
// existing YaRN models use the same values for them.
|
||||
float yarn_ext_factor;
|
||||
float yarn_attn_factor;
|
||||
float yarn_beta_fast;
|
||||
float yarn_beta_slow;
|
||||
|
||||
bool mul_mat_q;
|
||||
};
|
||||
@@ -1799,6 +1837,12 @@ struct llama_model_loader {
|
||||
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
|
||||
}
|
||||
|
||||
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
||||
if (ne.size() == 1) {
|
||||
throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
bool is_ok = true;
|
||||
for (size_t i = 0; i < ne.size(); ++i) {
|
||||
@@ -2014,14 +2058,30 @@ static void llm_load_hparams(
|
||||
hparams.n_head_kv = hparams.n_head;
|
||||
GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
|
||||
|
||||
hparams.rope_finetuned = false;
|
||||
GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
|
||||
kv(LLM_KV_ROPE_SCALING_FINETUNED));
|
||||
|
||||
hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
|
||||
GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
|
||||
kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
|
||||
|
||||
// rope_freq_base (optional)
|
||||
hparams.rope_freq_base_train = 10000.0f;
|
||||
GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
|
||||
|
||||
std::string rope_scaling("linear");
|
||||
GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
|
||||
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
||||
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
|
||||
|
||||
// rope_freq_scale (inverse of the kv) is optional
|
||||
float ropescale = 1.0f;
|
||||
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
|
||||
hparams.rope_freq_scale_train = 1.0f/ropescale;
|
||||
float ropescale = 0.0f;
|
||||
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
|
||||
if (ropescale == 0.0f) { // try the old key name
|
||||
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
|
||||
}
|
||||
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
|
||||
|
||||
// sanity check for n_rot (optional)
|
||||
{
|
||||
@@ -2371,6 +2431,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & vocab = model.vocab;
|
||||
|
||||
const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
||||
|
||||
// hparams
|
||||
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
||||
@@ -2389,8 +2451,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
||||
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
||||
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||
LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
|
||||
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
|
||||
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
||||
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
|
||||
@@ -2758,8 +2823,8 @@ static void llm_load_tensors(
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
@@ -2818,13 +2883,13 @@ static void llm_load_tensors(
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
|
||||
@@ -2890,19 +2955,19 @@ static void llm_load_tensors(
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
@@ -3047,21 +3112,11 @@ static void llm_load_tensors(
|
||||
model.t_load_us = ggml_time_us() - model.t_start_us;
|
||||
}
|
||||
|
||||
static bool llama_model_load(
|
||||
const std::string & fname,
|
||||
llama_model & model,
|
||||
int n_gpu_layers,
|
||||
int main_gpu,
|
||||
const float * tensor_split,
|
||||
bool use_mmap,
|
||||
bool use_mlock,
|
||||
bool vocab_only,
|
||||
llama_progress_callback progress_callback,
|
||||
void *progress_callback_user_data) {
|
||||
static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
|
||||
try {
|
||||
llama_model_loader ml(fname, use_mmap);
|
||||
llama_model_loader ml(fname, params.use_mmap);
|
||||
|
||||
model.hparams.vocab_only = vocab_only;
|
||||
model.hparams.vocab_only = params.vocab_only;
|
||||
|
||||
llm_load_arch (ml, model);
|
||||
llm_load_hparams(ml, model);
|
||||
@@ -3073,15 +3128,15 @@ static bool llama_model_load(
|
||||
throw std::runtime_error("vocab size mismatch");
|
||||
}
|
||||
|
||||
if (vocab_only) {
|
||||
if (params.vocab_only) {
|
||||
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
|
||||
return true;
|
||||
}
|
||||
|
||||
llm_load_tensors(
|
||||
ml, model, n_gpu_layers,
|
||||
main_gpu, tensor_split,
|
||||
use_mlock, progress_callback, progress_callback_user_data);
|
||||
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
||||
return false;
|
||||
@@ -3150,6 +3205,7 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
static void llm_build_k_shift(
|
||||
struct ggml_context * ctx,
|
||||
const llama_hparams & hparams,
|
||||
const llama_cparams & cparams,
|
||||
const llama_kv_cache & kv,
|
||||
struct ggml_cgraph * graph,
|
||||
llm_rope_type type,
|
||||
@@ -3162,6 +3218,11 @@ static void llm_build_k_shift(
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
const int64_t n_embd_head = hparams.n_embd_head();
|
||||
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
|
||||
const float ext_factor = cparams.yarn_ext_factor;
|
||||
const float attn_factor = cparams.yarn_attn_factor;
|
||||
const float beta_fast = cparams.yarn_beta_fast;
|
||||
const float beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
GGML_ASSERT(n_embd_head % n_rot == 0);
|
||||
|
||||
@@ -3185,7 +3246,8 @@ static void llm_build_k_shift(
|
||||
ggml_element_size(kv.k)*n_embd_head,
|
||||
ggml_element_size(kv.k)*n_embd_gqa,
|
||||
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
|
||||
K_shift, n_rot, rope_type, 0, freq_base, freq_scale);
|
||||
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(tmp, "K_shifted", il);
|
||||
ggml_build_forward_expand(graph, tmp);
|
||||
}
|
||||
@@ -3345,7 +3407,6 @@ static struct ggml_tensor * llm_build_ffn(
|
||||
// if max_alibi_bias > 0 then apply ALiBi
|
||||
static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * cur,
|
||||
const llama_hparams & hparams,
|
||||
const llama_kv_cache & kv,
|
||||
struct ggml_tensor * wo,
|
||||
@@ -3411,7 +3472,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
cb(kqv_merged, "kqv_merged", il);
|
||||
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
|
||||
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
|
||||
cb(cur, "kqv_merged_cont", il);
|
||||
|
||||
cur = ggml_mul_mat(ctx, wo, cur);
|
||||
@@ -3443,12 +3504,17 @@ struct llm_build_context {
|
||||
|
||||
const float freq_base;
|
||||
const float freq_scale;
|
||||
const float ext_factor;
|
||||
const float attn_factor;
|
||||
const float beta_fast;
|
||||
const float beta_slow;
|
||||
const float norm_eps;
|
||||
const float norm_rms_eps;
|
||||
|
||||
const int32_t n_tokens;
|
||||
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
|
||||
const int32_t kv_head; // index of where we store new KV data in the cache
|
||||
const int32_t n_orig_ctx;
|
||||
|
||||
const bool do_rope_shift;
|
||||
|
||||
@@ -3478,11 +3544,16 @@ struct llm_build_context {
|
||||
n_embd_gqa (hparams.n_embd_gqa()),
|
||||
freq_base (cparams.rope_freq_base),
|
||||
freq_scale (cparams.rope_freq_scale),
|
||||
ext_factor (cparams.yarn_ext_factor),
|
||||
attn_factor (cparams.yarn_attn_factor),
|
||||
beta_fast (cparams.yarn_beta_fast),
|
||||
beta_slow (cparams.yarn_beta_slow),
|
||||
norm_eps (hparams.f_norm_eps),
|
||||
norm_rms_eps (hparams.f_norm_rms_eps),
|
||||
n_tokens (batch.n_tokens),
|
||||
n_kv (worst_case ? n_ctx : kv_self.n),
|
||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||
do_rope_shift (worst_case || kv_self.has_shift),
|
||||
cb (cb),
|
||||
buf_compute (lctx.buf_compute) {
|
||||
@@ -3533,7 +3604,7 @@ struct llm_build_context {
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -3557,15 +3628,23 @@ struct llm_build_context {
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, cur, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -3635,7 +3714,7 @@ struct llm_build_context {
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -3659,8 +3738,16 @@ struct llm_build_context {
|
||||
|
||||
switch (model.type) {
|
||||
case MODEL_7B:
|
||||
Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
||||
Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
break;
|
||||
case MODEL_13B:
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
|
||||
@@ -3677,7 +3764,7 @@ struct llm_build_context {
|
||||
// apply ALiBi for 13B model
|
||||
const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
|
||||
|
||||
cur = llm_build_kqv(ctx0, cur, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -3747,7 +3834,7 @@ struct llm_build_context {
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -3787,15 +3874,21 @@ struct llm_build_context {
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
// using mode = 2 for neox mode
|
||||
Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, attn_norm, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -3895,7 +3988,7 @@ struct llm_build_context {
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, cur, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -3961,7 +4054,7 @@ struct llm_build_context {
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4054,13 +4147,15 @@ struct llm_build_context {
|
||||
cb(kpass, "kpass", il);
|
||||
|
||||
struct ggml_tensor * qrotated = ggml_rope_custom(
|
||||
ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
|
||||
);
|
||||
ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(qrotated, "qrotated", il);
|
||||
|
||||
struct ggml_tensor * krotated = ggml_rope_custom(
|
||||
ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
|
||||
);
|
||||
ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
||||
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(krotated, "krotated", il);
|
||||
|
||||
// ggml currently only supports concatenation on dim=2
|
||||
@@ -4100,7 +4195,7 @@ struct llm_build_context {
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
// TODO: not tested, could be broken
|
||||
cur = llm_build_kqv(ctx0, Q, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -4191,7 +4286,7 @@ struct llm_build_context {
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -4288,7 +4383,7 @@ struct llm_build_context {
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -4382,7 +4477,7 @@ struct llm_build_context {
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self,
|
||||
cur = llm_build_kqv(ctx0, hparams, kv_self,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
@@ -7884,8 +7979,14 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.n_batch =*/ 512,
|
||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
||||
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
||||
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
|
||||
/*.rope_freq_base =*/ 0.0f,
|
||||
/*.rope_freq_scale =*/ 0.0f,
|
||||
/*.yarn_ext_factor =*/ -1.0f,
|
||||
/*.yarn_attn_factor =*/ 1.0f,
|
||||
/*.yarn_beta_fast =*/ 32.0f,
|
||||
/*.yarn_beta_slow =*/ 1.0f,
|
||||
/*.yarn_orig_ctx =*/ 0,
|
||||
/*.mul_mat_q =*/ true,
|
||||
/*.f16_kv =*/ true,
|
||||
/*.logits_all =*/ false,
|
||||
@@ -7972,10 +8073,7 @@ struct llama_model * llama_load_model_from_file(
|
||||
};
|
||||
}
|
||||
|
||||
if (!llama_model_load(path_model, *model, params.n_gpu_layers,
|
||||
params.main_gpu, params.tensor_split,
|
||||
params.use_mmap, params.use_mlock, params.vocab_only,
|
||||
params.progress_callback, params.progress_callback_user_data)) {
|
||||
if (!llama_model_load(path_model, *model, params)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
||||
delete model;
|
||||
return nullptr;
|
||||
@@ -8001,13 +8099,35 @@ struct llama_context * llama_new_context_with_model(
|
||||
const auto & hparams = model->hparams;
|
||||
auto & cparams = ctx->cparams;
|
||||
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
cparams.rope_freq_base = params.rope_freq_base == 0 ? hparams.rope_freq_base_train : params.rope_freq_base;
|
||||
cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch;
|
||||
cparams.mul_mat_q = params.mul_mat_q;
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch;
|
||||
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
||||
cparams.yarn_attn_factor = params.yarn_attn_factor;
|
||||
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||
cparams.mul_mat_q = params.mul_mat_q;
|
||||
|
||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
||||
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
||||
|
||||
cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
|
||||
hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
|
||||
hparams.n_ctx_train;
|
||||
|
||||
auto rope_scaling_type = params.rope_scaling_type;
|
||||
if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
|
||||
rope_scaling_type = hparams.rope_scaling_type_train;
|
||||
}
|
||||
|
||||
if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
|
||||
cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
|
||||
}
|
||||
|
||||
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
||||
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
|
||||
28
llama.h
28
llama.h
@@ -106,6 +106,14 @@ extern "C" {
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
enum llama_rope_scaling_type {
|
||||
LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
|
||||
LLAMA_ROPE_SCALING_NONE = 0,
|
||||
LLAMA_ROPE_SCALING_LINEAR = 1,
|
||||
LLAMA_ROPE_SCALING_YARN = 2,
|
||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
};
|
||||
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
float logit; // log-odds of the token
|
||||
@@ -167,15 +175,21 @@ extern "C" {
|
||||
};
|
||||
|
||||
struct llama_context_params {
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
uint32_t n_batch; // prompt processing maximum batch size
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
uint32_t n_batch; // prompt processing maximum batch size
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
||||
float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
|
||||
float yarn_attn_factor; // YaRN magnitude scaling factor
|
||||
float yarn_beta_fast; // YaRN low correction dim
|
||||
float yarn_beta_slow; // YaRN high correction dim
|
||||
uint32_t yarn_orig_ctx; // YaRN original context size
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
||||
|
||||
Binary file not shown.
@@ -1,5 +1,5 @@
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
|
||||
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||
set(BUILD_NUMBER 0)
|
||||
set(BUILD_COMMIT "unknown")
|
||||
set(BUILD_COMPILER "unknown")
|
||||
@@ -24,15 +24,21 @@ if(Git_FOUND)
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE HEAD
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE RES
|
||||
)
|
||||
if (RES EQUAL 0)
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
endif()
|
||||
execute_process(
|
||||
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
|
||||
OUTPUT_VARIABLE COUNT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE RES
|
||||
)
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
if (RES EQUAL 0)
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
@@ -53,22 +59,22 @@ else()
|
||||
set(BUILD_TARGET ${OUT})
|
||||
endif()
|
||||
|
||||
# Only write the header if it's changed to prevent unnecessary recompilation
|
||||
if(EXISTS ${HEADER_FILE})
|
||||
file(READ ${HEADER_FILE} CONTENTS)
|
||||
string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
|
||||
# Only write the build info if it changed
|
||||
if(EXISTS ${OUTPUT_FILE})
|
||||
file(READ ${OUTPUT_FILE} CONTENTS)
|
||||
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
|
||||
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
|
||||
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
else()
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
#ifndef BUILD_INFO_H
|
||||
#define BUILD_INFO_H
|
||||
|
||||
#define BUILD_NUMBER @BUILD_NUMBER@
|
||||
#define BUILD_COMMIT "@BUILD_COMMIT@"
|
||||
#define BUILD_COMPILER "@BUILD_COMPILER@"
|
||||
#define BUILD_TARGET "@BUILD_TARGET@"
|
||||
|
||||
#endif // BUILD_INFO_H
|
||||
@@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then
|
||||
build_target=$out
|
||||
fi
|
||||
|
||||
echo "#ifndef BUILD_INFO_H"
|
||||
echo "#define BUILD_INFO_H"
|
||||
echo
|
||||
echo "#define BUILD_NUMBER $build_number"
|
||||
echo "#define BUILD_COMMIT \"$build_commit\""
|
||||
echo "#define BUILD_COMPILER \"$build_compiler\""
|
||||
echo "#define BUILD_TARGET \"$build_target\""
|
||||
echo
|
||||
echo "#endif // BUILD_INFO_H"
|
||||
echo "int LLAMA_BUILD_NUMBER = ${build_number};"
|
||||
echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
|
||||
echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
|
||||
echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"
|
||||
|
||||
Reference in New Issue
Block a user