Compare commits

...

13 Commits

Author SHA1 Message Date
Georgi Gerganov
12aa74ba7d minor : spacing 2024-03-22 15:24:57 +02:00
fraxy-v
2605c139a6 Update build.yml 2024-03-20 08:59:38 +02:00
fraxy-v
3e9d3dbff9 Update build.yml 2024-03-20 08:50:46 +02:00
fraxy-v
6014a63125 Update build.yml 2024-03-19 16:52:01 +02:00
Y. Velkov
927be9b58e add test in build action 2024-03-19 16:25:23 +02:00
Y. Velkov
284800b1e3 convert-llama2c-to-ggml: enable conversion of multiqueries, #5608 2024-03-19 15:23:59 +02:00
Artem
c4d7f81786 readme : update ui list (#5731)
* Add LLMFarm (ui for iOS) to list
2024-02-26 16:15:28 +02:00
AidanBeltonS
e849078c6e [SYCL] Add support for soft_max ALiBi (#5639)
* Add support for bias

* Update pre-processor

* rm commented code

* fix format

* fix CI

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-02-26 19:32:11 +05:30
Georgi Gerganov
67fd33132f unicode : reuse iterator (#5726) 2024-02-26 14:02:12 +02:00
Pierrick Hymbert
4804215cb8 server: CI fix trailing space (#5728) 2024-02-26 12:41:34 +02:00
Pierrick Hymbert
8a533f0d90 server: CI tests reduce build matrix (#5725) 2024-02-26 09:56:10 +01:00
Georgi Gerganov
269de86ba0 llama : fix Gemma rope type (#5691) 2024-02-26 08:30:17 +02:00
github-actions[bot]
c393733988 flake.lock: Update
Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/5863c27340ba4de8f83e7e3c023b9599c3cb3c80' (2024-02-16)
  → 'github:NixOS/nixpkgs/cbc4211f0afffe6dfd2478a62615dd5175a13f9a' (2024-02-23)
2024-02-25 22:24:22 +00:00
9 changed files with 385 additions and 357 deletions

View File

@@ -76,6 +76,17 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest

View File

@@ -6,11 +6,10 @@ on:
push:
branches:
- master
- test/server-add-ci-test # FIXME remove
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
jobs:
server:
@@ -18,45 +17,21 @@ jobs:
strategy:
matrix:
build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan]
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
include:
- build: 'noavx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
image: ubuntu:latest
- build: 'avx2'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
image: ubuntu:latest
- build: 'avx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
image: ubuntu:latest
- build: 'avx512'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON'
image: ubuntu:latest
experimental: true
- build: 'cublas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON'
image: nvidia/cuda:12.3.1-devel-ubuntu22.04
arch_not_available: true # require nvidia docker engine
- build: 'clblast'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON'
image: ubuntu:latest
arch_not_available: true
- build: 'openblas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
image: ubuntu:latest
- build: 'kompute'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
image: ubuntu:latest
arch_not_available: true
- build: 'vulkan'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON'
image: ubuntu:latest
arch_not_available: true
- build_type: Release
sanitizer: ""
exclude:
- build_type: Release
sanitizer: ADDRESS
- build_type: Release
sanitizer: THREAD
- build_type: Release
sanitizer: UNDEFINED
container:
image: ${{ matrix.image }}
image: ubuntu:latest
ports:
- 8888
options: --cpus 4
@@ -72,40 +47,22 @@ jobs:
apt-get update
apt-get -y install \
build-essential \
pkg-config \
git \
cmake \
python3-pip \
wget \
psmisc
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast' }}
run: |
apt install -y libclblast-dev
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas' }}
run: |
apt-get -y install libopenblas-dev
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
run: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
apt-get update
apt-get -y install vulkan-sdk
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }}
cmake .. \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests dependencies
@@ -121,7 +78,6 @@ jobs:
- name: Tests
id: server_integration_test
continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }}
run: |
cd examples/server/tests
PORT=8888 ./tests.sh

View File

@@ -159,6 +159,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
---

View File

@@ -21,6 +21,8 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View File

@@ -1,6 +1,7 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "log.h"
#include <unordered_map>
#include <vector>
@@ -78,111 +79,101 @@ typedef struct {
struct TransformerWeights {
// token embedding table
float* token_embedding_table; // (vocab_size, dim)
std::vector<float> token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
float* rms_att_weight; // (layer, dim) rmsnorm weights
float* rms_ffn_weight; // (layer, dim)
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
std::vector<float> rms_ffn_weight; // (layer, dim)
// weights for matmuls
float* wq; // (layer, dim, dim)
float* wk; // (layer, dim, dim)
float* wv; // (layer, dim, dim)
float* wo; // (layer, dim, dim)
std::vector<float> wq; // (layer, dim, dim)
std::vector<float> wk; // (layer, dim, dim)
std::vector<float> wv; // (layer, dim, dim)
std::vector<float> wo; // (layer, dim, dim)
// weights for ffn
float* w1; // (layer, hidden_dim, dim)
float* w2; // (layer, dim, hidden_dim)
float* w3; // (layer, hidden_dim, dim)
std::vector<float> w1; // (layer, hidden_dim, dim)
std::vector<float> w2; // (layer, dim, hidden_dim)
std::vector<float> w3; // (layer, hidden_dim, dim)
// final rmsnorm
float* rms_final_weight; // (dim,)
std::vector<float> rms_final_weight; // (dim,)
// freq_cis for RoPE relatively positional embeddings
// float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2)
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
float* wcls;
~TransformerWeights() {
delete[] token_embedding_table;
delete[] rms_att_weight;
delete[] rms_ffn_weight;
delete[] wq;
delete[] wk;
delete[] wv;
delete[] wo;
delete[] w1;
delete[] w2;
delete[] w3;
delete[] rms_final_weight;
delete[] wcls;
}
std::vector<float> wcls;
};
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = NULL;
} else {
w->wcls = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
die("Invalid configuration. Failed to allocate memory for weights");
}
}
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
@@ -190,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
}
static void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]);
printf("%f\n", w->rms_ffn_weight[0]);
LOG("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]);
printf("%f\n", w->wq[0]);
printf("%f\n", w->wk[0]);
printf("%f\n", w->wv[0]);
printf("%f\n", w->wo[0]);
printf("%f\n", w->w1[0]);
printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]);
if (w->wcls) printf("%f\n", w->wcls[0]);
LOG("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -225,14 +216,16 @@ struct llama_vocab {
};
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_head_kv = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
@@ -325,14 +318,30 @@ struct train_params {
};
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_mult: %u\n", __func__, params->n_mult);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: n_rot: %u\n", __func__, params->n_rot);
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
if (i > 0) LOG("x ");
LOG("[%" PRId64 "] ", t->ne[i]);
total *= t->ne[i];
}
if (i > 1) LOG("= [%" PRId64 "] ", total);
LOG("float space for %s\n", ggml_get_name(t));
}
}
static void init_model(struct my_llama_model * model) {
@@ -342,6 +351,8 @@ static void init_model(struct my_llama_model * model) {
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
const uint32_t n_ff = hparams.n_ff;
struct ggml_context * ctx = model->ctx;
@@ -350,25 +361,8 @@ static void init_model(struct my_llama_model * model) {
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
// printing the per-layer allocations here so we dont print in the for loop.
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
@@ -383,8 +377,8 @@ static void init_model(struct my_llama_model * model) {
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -406,6 +400,8 @@ static void init_model(struct my_llama_model * model) {
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
print_tensor_info(ctx);
}
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@@ -421,9 +417,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %f", p);
LOG(" %f", p);
}
printf("\n");
LOG("\n");
}
static void print_matrix(struct ggml_tensor * probs) {
@@ -431,33 +427,12 @@ static void print_matrix(struct ggml_tensor * probs) {
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %.2f", p);
LOG(" %.2f", p);
}
printf("\n");
LOG("\n");
}
}
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
@@ -549,8 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
return out.str();
}
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@@ -578,6 +554,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
}
vocab->id_to_token.resize(n_vocab);
@@ -595,7 +574,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
@@ -638,38 +617,15 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
}
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct;
switch (ggml_n_dims(gg_weights)) {
case 1:
ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
*ptr = karpathy_weights[ct];
ct++;
}
break;
case 2:
ct = 0;
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
*ptr = karpathy_weights[ct];
ct++;
}
}
break;
case 3:
ct = 0;
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
*ptr = karpathy_weights[ct];
ct++;
}
}
}
break;
int size = 1;
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
size *= gg_weights->ne[dim];
}
for (int ct = 0; ct < size; ++ct) {
int64_t i0 = 0; int64_t i1 = 0;
int64_t i2 = 0; int64_t i3 = 0;
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
}
}
@@ -679,16 +635,18 @@ static void save_as_llama_model(
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
//print_row(model->norm, 0);
// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;
const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
@@ -697,9 +655,10 @@ static void save_as_llama_model(
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
@@ -736,8 +695,8 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
// n_head_kv is optional, default to n_head
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@@ -789,12 +748,12 @@ static void save_as_llama_model(
static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.seed = -1;
@@ -829,8 +788,8 @@ static struct train_params get_default_train_params() {
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;
@@ -916,19 +875,30 @@ int main(int argc, char ** argv) {
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE *file = fopen(params.fn_llama2c_model, "r");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights
malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
}
@@ -936,15 +906,18 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_head_kv = config.n_kv_heads;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);
struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
@@ -956,7 +929,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;

6
flake.lock generated
View File

@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1708118438,
"narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
"lastModified": 1708655239,
"narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
"rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a",
"type": "github"
},
"original": {

View File

@@ -8126,23 +8126,51 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
}
static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
const sycl::nd_item<3> &item_ct1, float *buf) {
template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
const int nrows_y, const float scale, const float max_bias, const float m0,
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const int tid = item_ct1.get_local_id(2);
const int rowx = item_ct1.get_group(2);
const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
const int block_size = item_ct1.get_local_range(2);
const int block_size = block_size_template == 0 ? item_ct1.get_local_range(2) : block_size_template;
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
float slope = 0.0f;
// ALiBi
if (max_bias > 0.0f) {
const uint32_t h = rowx/nrows_y; // head index
const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = sycl::pow(base, float(exp));
}
float * vals = vals_smem ? buf + WARP_SIZE : dst + rowx*ncols;
float max_val = -INFINITY;
for (int col = tid; col < ncols; col += block_size) {
for (int col0 = 0; col0 < ncols; col0 += block_size) {
const int col = col0 + tid;
if (ncols_template == 0 && col >= ncols) {
break;
}
const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col;
max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
vals[col] = val;
max_val = sycl::max(max_val, val);
}
// find the max value in the block
@@ -8151,30 +8179,12 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
if (warp_id == 0) {
buf[lane_id] = -INFINITY;
}
/*
DPCT1118:12: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/
/*
DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);
if (lane_id == 0) {
buf[warp_id] = max_val;
}
/*
DPCT1118:13: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/
/*
DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);
max_val = buf[lane_id];
max_val = warp_reduce_max(max_val, item_ct1);
@@ -8182,13 +8192,16 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
float tmp = 0.f;
for (int col = tid; col < ncols; col += block_size) {
const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col;
const float val =
sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
#pragma unroll
for (int col0 = 0; col0 < ncols; col0 += block_size) {
const int col = col0 + tid;
if (ncols_template == 0 && col >= ncols) {
break;
}
const float val = sycl::native::exp(vals[col] - max_val);
tmp += val;
dst[ix] = val;
vals[col] = val;
}
// find the sum of exps in the block
@@ -8197,40 +8210,29 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
if (warp_id == 0) {
buf[lane_id] = 0.f;
}
/*
DPCT1118:14: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/
/*
DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);
if (lane_id == 0) {
buf[warp_id] = tmp;
}
/*
DPCT1118:15: SYCL group functions and algorithms must be encountered in
converged control flow. You may need to adjust the code.
*/
/*
DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
better performance if there is no access to global memory.
*/
item_ct1.barrier();
item_ct1.barrier(sycl::access::fence_space::local_space);
tmp = buf[lane_id];
tmp = warp_reduce_sum(tmp, item_ct1);
}
const float inv_tmp = 1.f / tmp;
const float inv_sum = 1.f / tmp;
for (int col = tid; col < ncols; col += block_size) {
const int i = rowx*ncols + col;
dst[i] *= inv_tmp;
#pragma unroll
for (int col0 = 0; col0 < ncols; col0 += block_size) {
const int col = col0 + tid;
if (ncols_template == 0 && col >= ncols) {
return;
}
const int idst = rowx*ncols + col;
dst[idst] = vals[col] * inv_sum;
}
}
@@ -10867,35 +10869,96 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
});
}
static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
const int ncols_x, const int nrows_x,
const int nrows_y, const float scale,
template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
const int nrows_y, const float scale, const float max_bias, const float m0,
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
const size_t n_local_scratch, dpct::queue_ptr stream) {
stream->submit([&](sycl::handler &cgh) {
sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
nrows_y, scale, max_bias, m0,
m1, n_head_log2, item_ct1,
local_buf_acc.get_pointer());
});
});
}
static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
float * dst, const int ncols_x, const int nrows_x,
const int nrows_y, const float scale, const float max_bias,
dpct::queue_ptr stream) {
int nth = WARP_SIZE;
while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const sycl::range<3> block_dims(1, 1, nth);
const sycl::range<3> block_nums(1, 1, nrows_x);
/*
DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
limit. To get the device limit, query info::device::max_work_group_size.
Adjust the work-group size if needed.
*/
stream->submit([&](sycl::handler &cgh) {
/*
DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
replaced with a value. Modify the code to use the original expression,
provided in comments, if it is correct.
*/
sycl::local_accessor<float, 1> buf_acc_ct1(
sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
buf_acc_ct1.get_pointer());
});
});
const uint32_t n_head_kv = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
if (n_local_scratch*sizeof(float) < local_mem_size) {
switch (ncols_x) {
case 32:
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 64:
soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 128:
soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 256:
soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 512:
soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 1024:
soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 2048:
soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 4096:
soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
default:
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
}
} else {
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, WARP_SIZE, stream);
}
}
template <typename T>
@@ -12435,14 +12498,35 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0);
const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
const int64_t nrows_y = src0->ne[1];
float scale = 1.0f;
memcpy(&scale, dst->op_params, sizeof(float));
float max_bias = 0.0f;
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
memcpy(&scale, dst->op_params + 0, sizeof(float));
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
(void) dst;
// positions tensor
float * src2_dd = nullptr;
sycl_pool_alloc<float> src2_f;
ggml_tensor * src2 = dst->src[2];
const bool use_src2 = src2 != nullptr;
if (use_src2) {
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
if (src2_on_device) {
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
src2_dd = (float *) src2_extra->data_device[g_main_device];
} else {
src2_dd = src2_f.alloc(ggml_nelements(src2));
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
}
}
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
nrows_x, nrows_y, scale, max_bias, main_stream);
}
inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,

View File

@@ -4894,8 +4894,8 @@ static struct ggml_tensor * llm_build_kqv(
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
}
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
if (hparams.f_max_alibi_bias > 0.0f) {
@@ -12089,7 +12089,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_ORION:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_MINICPM:
case LLM_ARCH_GEMMA:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
@@ -12101,6 +12100,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2:
case LLM_ARCH_PHI2:
case LLM_ARCH_GEMMA:
return LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here

View File

@@ -404,7 +404,8 @@ static std::unordered_map<uint32_t, int> codepoint_type_map() {
static int codepoint_type(uint32_t cp) {
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
const auto it = codepoint_types.find(cp);
return it == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
}
static int codepoint_type(const std::string & utf8) {