ggml : rms_norm in chunks

ggml : process mul mat rows in chunks
2026-05-03 00:04:18 +02:00 · 2023-05-20 10:15:54 +03:00 · 2023-05-20 10:15:53 +03:00
12 changed files with 257 additions and 463 deletions
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,7 +1,6 @@
+#include <locale.h>
 #include "ggml.h"
 #include "build-info.h"
-
-#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -578,37 +578,6 @@ void console_set_color(console_state & con_st, console_color_t color) {
 }

 char32_t getchar32() {
-#if defined(_WIN32)
-    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
-    wchar_t high_surrogate = 0;
-
-    while (true) {
-        INPUT_RECORD record;
-        DWORD count;
-        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
-            return WEOF;
-        }
-
-        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
-            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
-            if (wc == 0) {
-                continue;
-            }
-
-            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-                high_surrogate = wc;
-                continue;
-            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
-                if (high_surrogate != 0) { // Check if we have a high surrogate
-                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
-                }
-            }
-
-            high_surrogate = 0; // Reset the high surrogate
-            return static_cast<char32_t>(wc);
-        }
-    }
-#else
    wchar_t wc = getwchar();
    if (static_cast<wint_t>(wc) == WEOF) {
        return WEOF;
@@ -627,7 +596,6 @@ char32_t getchar32() {
 #endif

    return static_cast<char32_t>(wc);
-#endif
 }

 void pop_cursor(console_state & con_st) {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -31,8 +31,6 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-    llama_init_backend();
-
    llama_context * ctx;

    // load the model
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -96,7 +96,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-    llama_init_backend();
+//    params.prompt = R"(// this function checks if the number n is prime
+//bool is_prime(int n) {)";

    llama_context * ctx;
    g_ctx = &ctx;
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -143,8 +143,6 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-    llama_init_backend();
-
    llama_context * ctx;

    // load the model and apply lora adapter, if any
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,6 +1,6 @@
-#include "build-info.h"
-
+#include "ggml.h"
 #include "llama.h"
+#include "build-info.h"

 #include <cstdio>
 #include <map>
@@ -42,6 +42,8 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 //  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
+    ggml_time_init();
+
    if (argc < 3) {
        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@@ -50,7 +52,12 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_init_backend();
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }

    // parse command line arguments
    const std::string fname_inp = argv[1];
@@ -109,25 +116,25 @@ int main(int argc, char ** argv) {
    }
    fprintf(stderr, "\n");

-    const int64_t t_main_start_us = llama_time_us();
+    const int64_t t_main_start_us = ggml_time_us();

    int64_t t_quantize_us = 0;

    // load the model
    {
-        const int64_t t_start_us = llama_time_us();
+        const int64_t t_start_us = ggml_time_us();

        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

-        t_quantize_us = llama_time_us() - t_start_us;
+        t_quantize_us = ggml_time_us() - t_start_us;
    }

    // report timing
    {
-        const int64_t t_main_end_us = llama_time_us();
+        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -83,19 +83,9 @@ typedef struct {
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

-#define CUDA_MUL_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 #define CUDA_DMMV_BLOCK_SIZE 32 // dmmv = dequantize_mul_mat_vec

-static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= kx) {
-        return;
-    }
-    dst[i] = x[i] * y[i%ky];
-}
-
 static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
    const block_q4_0 * x = (const block_q4_0 *) vx;

@@ -238,11 +228,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
    }
 }

-static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
-    const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
-    mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
-}
-
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -482,67 +467,6 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
    }
 }

-static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[2];
-    const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-    size_t x_size, d_size;
-
-    float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
-    float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            const int i0 = i03*ne02 + i02;
-            float * c_X2 = d_X + i0*ne01*ne00;
-            float * c_D2 = d_D + i0*ne01*ne00;
-
-            cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
-            cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
-            cudaEvent_t  cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
-
-            // copy src0 to device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
-            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
-
-            // wait for data
-            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
-
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                const int64_t i13 = i03%ne13;
-                const int64_t i12 = i02%ne12;
-                const int64_t i11 = i01%ne11;
-                const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
-                float * c_X1 = c_X2 + i01*ne00;
-                float * c_Y = d_Y + i1*ne10;
-                float * c_D1 = c_D2 + i01*ne00;
-
-                // compute
-                mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
-                CUDA_CHECK(cudaGetLastError());
-            }
-
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
-        }
-    }
-    CUDA_CHECK(cudaDeviceSynchronize());
-    ggml_cuda_pool_free(d_X, x_size);
-    ggml_cuda_pool_free(d_D, d_size);
-}
-
 static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
@@ -800,11 +724,6 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
    ggml_cuda_pool_free(d_Q, q_size);
 }

-void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cuda_mul_f32(src0, src1, dst);
-}
-
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    const int64_t ne10 = src1->ne[0];

@@ -878,48 +797,14 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);

    size_t q_size;
-    char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+    char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);

    cudaStream_t cudaStream2 = g_cudaStreams2[0];

    // copy tensor to device
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            int i = i3*ne2 + i2;
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
-        }
-    }
+    CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
+    CUDA_CHECK(cudaDeviceSynchronize());

-    tensor->data = dst;
+    tensor->data = d_Q;
    tensor->backend = GGML_BACKEND_CUDA;
 }
-
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
-    FILE * fp = fopen(fname, "rb");
-
-    const size_t size = ggml_nbytes(tensor);
-
-    void * buf;
-    CUDA_CHECK(cudaMalloc(&buf, size));
-    void * buf_host = malloc(size);
-
-#ifdef _WIN32
-    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
-#else
-    int ret = fseek(fp, (long) offset, SEEK_SET);
-#endif
-    GGML_ASSERT(ret == 0); // same
-
-    size_t ret2 = fread(buf_host, size, 1, fp);
-    if (ret2 != 1) {
-        fprintf(stderr, "unexpectedly reached end of file");
-        exit(1);
-    }
-
-    cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
-    cudaDeviceSynchronize();
-
-    tensor->data = buf;
-    free(buf_host);
-    fclose(fp);
-}
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -6,7 +6,6 @@ extern "C" {

 void   ggml_init_cublas(void);

-void   ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -16,7 +15,6 @@ void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);

 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);

 #ifdef  __cplusplus
 }
--- a/ggml.c
+++ b/ggml.c
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }

-#if defined(__AVX2__) || defined(__AVX512F__)
+#if __AVX2__ || __AVX512F__
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
    uint32_t x32;
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

-#if defined(__ARM_NEON)
+#if __ARM_NEON

 #if !defined(__aarch64__)

@@ -3590,6 +3590,9 @@ struct ggml_compute_params {
    // work buffer for all threads
    size_t wsize;
    void * wdata;
+
+    // atomic counter used to distribute chunks of work
+    atomic_int * aic;
 };

 //
@@ -3776,12 +3779,6 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g
        (t1->ne[3]%t0->ne[3] == 0);
 }

-static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
-}
-
 static inline int ggml_up32(int n) {
    return (n + 31) & ~31;
 }
@@ -4664,15 +4661,11 @@ struct ggml_tensor * ggml_mul_impl(
        struct ggml_tensor * a,
        struct ggml_tensor * b,
        bool inplace) {
-    // TODO: support less-strict constraint
-    //       GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_are_same_shape(a, b));

    bool is_node = false;

    if (!inplace && (a->grad || b->grad)) {
-        // TODO: support backward pass for broadcasting
-        GGML_ASSERT(ggml_are_same_shape(a, b));
        is_node = true;
    }

@@ -7970,7 +7963,7 @@ static void ggml_compute_forward_mul_f32(
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
@@ -7978,25 +7971,10 @@ static void ggml_compute_forward_mul_f32(
    const int ith = params->ith;
    const int nth = params->nth;

-#ifdef GGML_USE_CUBLAS
-    if (src1->backend == GGML_BACKEND_CUDA) {
-        if (ith == 0) {
-            ggml_cuda_mul(src0, src1, dst);
-        }
-        return;
-    }
-#endif
-
-    const int64_t nr = ggml_nrows(src0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    const int nr  = ggml_nrows(src0);
+    const int64_t ne0 = src0->ne[0];
+    const int64_t ne1 = src0->ne[1];
+    const int64_t ne2 = src0->ne[2];

    const size_t nb00 = src0->nb[0];
    const size_t nb01 = src0->nb[1];
@@ -8015,51 +7993,44 @@ static void ggml_compute_forward_mul_f32(

    GGML_ASSERT( nb0 == sizeof(float));
    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(ne00 == ne10);

    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+        for (int ir = ith; ir < nr; ir += nth) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

 #ifdef GGML_USE_ACCELERATE
            UNUSED(ggml_vec_mul_f32);

-            vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr,  1, ne00);
+            vDSP_vmul(
+                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
+                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
+                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
+                    ne0);
 #else
-            ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+            ggml_vec_mul_f32(ne0,
+                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
+                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
+                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
 #endif
                // }
            // }
        }
    } else {
        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
-            // src0 and dst are same shape => same indices
-            // src1 is broadcastable across src0 and dst in i1, i2, i3
-            const int64_t i03 = ir/(ne02*ne01);
-            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
-            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+        for (int ir = ith; ir < nr; ir += nth) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

-            const int64_t i13 = i03 % ne13;
-            const int64_t i12 = i02 % ne12;
-            const int64_t i11 = i01 % ne11;
-
-            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-
-            for (int64_t i0 = 0; i0 < ne00; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            for (int i0 = 0; i0 < ne0; i0++) {
+                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);

                dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
            }
@@ -9062,18 +9033,20 @@ static void ggml_compute_forward_rms_norm_f32(
    GGML_ASSERT(ggml_are_same_shape(src0, dst));

    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        atomic_store(params->aic, 0);
+
        return;
    }

    GGML_ASSERT(src0->nb[0] == sizeof(float));

-    const int ith = params->ith;
+    const int ith = params->ith; UNUSED(ith);
    const int nth = params->nth;

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    const int64_t ne03 = src0->ne[3]; UNUSED(ne03);

    const size_t nb01 = src0->nb[1];
    const size_t nb02 = src0->nb[2];
@@ -9085,30 +9058,45 @@ static void ggml_compute_forward_rms_norm_f32(

    const float eps = 1e-6f; // TODO: make this a parameter

-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+    const int nr = ggml_nrows(src0);
+    const int dr = (nr + 8*nth - 1)/(8*nth);

-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)(x[i00] * x[i00]);
-                }
+    while (true) {
+        const int ir0 = atomic_fetch_add(params->aic, dr);

-                float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                memcpy(y, x, ne00 * sizeof(float));
-                // for (int i00 = 0; i00 < ne00; i00++) {
-                //     y[i00] = x[i00];
-                // }
-
-                const float scale = 1.0f/sqrtf(mean + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
+        for (int ir = ir0; ir < ir0 + dr; ++ir) {
+            if (ir >= nr) {
+                break;
            }
+
+            // src0 indices
+            const int i03 = ir/(ne02*ne01);
+            const int i02 = (ir - i03*ne02*ne01)/ne01;
+            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
+
+            ggml_float sum = 0.0;
+            for (int64_t i00 = 0; i00 < ne00; i00++) {
+                sum += (ggml_float)(x[i00] * x[i00]);
+            }
+
+            float mean = sum/ne00;
+
+            float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
+
+            memcpy(y, x, ne00 * sizeof(float));
+            // for (int i00 = 0; i00 < ne00; i00++) {
+            //     y[i00] = x[i00];
+            // }
+
+            const float scale = 1.0f/sqrtf(mean + eps);
+
+            ggml_vec_scale_f32(ne00, y, scale);
+        }
+
+        if (ir0 + dr >= nr) {
+            break;
        }
    }
 }
@@ -9783,7 +9771,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];

-    const int ith = params->ith;
+    const int ith = params->ith; UNUSED(ith);
    const int nth = params->nth;

    GGML_ASSERT(ne02 == ne12);
@@ -9899,6 +9887,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
            }
        }

+        atomic_store(params->aic, 0);
+
        return;
    }

@@ -9906,43 +9896,48 @@ static void ggml_compute_forward_mul_mat_q_f32(
        return;
    }

-    // parallelize by src0 rows using ggml_vec_dot_q
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
    void * wdata = params->wdata;
    const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];

-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = ir/(ne02*ne01);
-        const int i02 = (ir - i03*ne02*ne01)/ne01;
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
+    // parallelize by src0 rows using ggml_vec_dot_q

-        const int i13 = i03;
-        const int i12 = i02;
+    const int nr = ggml_nrows(src0);
+    const int dr = (nr + 8*nth - 1)/(8*nth);

-        const int i0 = i01;
-        const int i2 = i02;
-        const int i3 = i03;
+    while (true) {
+        const int ir0 = atomic_fetch_add(params->aic, dr);

-        void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        char * src1_col =          ((char *)      wdata + (      (0 + i12*ne11 + i13*ne12*ne11)*row_size));
+        for (int ir = ir0; ir < ir0 + dr; ++ir) {
+            if (ir >= nr) {
+                break;
+            }

-        float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
+            // src0 indices
+            const int i03 = ir/(ne02*ne01);
+            const int i02 = (ir - i03*ne02*ne01)/ne01;
+            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);

-        assert(ne00 % 32 == 0);
+            const int i13 = i03;
+            const int i12 = i02;

-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+            const int i0 = i01;
+            const int i2 = i02;
+            const int i3 = i03;
+
+            void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
+            char * src1_col =          ((char *)      wdata + (      (0 + i12*ne11 + i13*ne12*ne11)*row_size));
+
+            float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
+
+            assert(ne00 % 32 == 0);
+
+            for (int64_t ic = 0; ic < ne11; ++ic) {
+                vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
+            }
+        }
+
+        if (ir0 + dr >= nr) {
+            break;
        }
    }

@@ -13781,6 +13776,7 @@ struct ggml_compute_state_shared {

    // synchronization primitives
    atomic_int  n_ready;
+    atomic_int  aic;
    atomic_bool has_work;
    atomic_bool stop; // stop all threads
 };
@@ -13849,6 +13845,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
        /*.spin      =*/ GGML_LOCK_INITIALIZER,
        /*.n_threads =*/ n_threads,
        /*.n_ready   =*/ 0,
+        /*.aic       =*/ 0,
        /*.has_work  =*/ false,
        /*.stop      =*/ false,
    };
@@ -13869,6 +13866,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                    .nth   = n_threads,
                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                    .wdata = cgraph->work ? cgraph->work->data : NULL,
+                    .aic   = &state_shared.aic,
                },
                .node   = NULL,
                .shared = &state_shared,
@@ -14158,6 +14156,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
            /*.nth   =*/ node->n_tasks,
            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            /*.aic   =*/ &state_shared.aic,
        };

        ggml_compute_forward(&params, node);
@@ -14181,6 +14180,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                    .nth   = node->n_tasks,
                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                    .wdata = cgraph->work ? cgraph->work->data : NULL,
+                    .aic   = &state_shared.aic,
                };
                workers[j].node = node;
            }
@@ -14196,6 +14196,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
        }

        params.type = GGML_TASK_COMPUTE;
+        params.aic  = &state_shared.aic;
        ggml_compute_forward(&params, node);

        // wait for thread pool
@@ -14236,6 +14237,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                    .nth   = node->n_tasks,
                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
                    .wdata = cgraph->work ? cgraph->work->data : NULL,
+                    .aic   = &state_shared.aic,
                };
                workers[j].node = node;
            }
--- a/llama-util.h
+++ b/llama-util.h
@@ -101,12 +101,12 @@ struct llama_file {
        LLAMA_ASSERT(ret == 0); // same
    }

-    void read_raw(void * ptr, size_t len) const {
-        if (len == 0) {
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
            return;
        }
        errno = 0;
-        std::size_t ret = std::fread(ptr, len, 1, fp);
+        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
@@ -127,12 +127,12 @@ struct llama_file {
        return std::string(chars.data(), len);
    }

-    void write_raw(const void * ptr, size_t len) const {
-        if (len == 0) {
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
            return;
        }
        errno = 0;
-        size_t ret = std::fwrite(ptr, len, 1, fp);
+        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }

-        if (prefetch > 0) {
+        if (prefetch) {
            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
@@ -267,9 +267,9 @@ struct llama_mlock {
        }
    }

-    void init(void * ptr) {
-        LLAMA_ASSERT(addr == NULL && size == 0);
-        addr = ptr;
+    void init(void * addr) {
+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
+        this->addr = addr;
    }

    void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
        return (size_t) si.dwPageSize;
    }

-    bool raw_lock(void * ptr, size_t len) {
+    bool raw_lock(void * addr, size_t size) {
        for (int tries = 1; ; tries++) {
-            if (VirtualLock(ptr, len)) {
+            if (VirtualLock(addr, size)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    len, size, llama_format_win_err(GetLastError()).c_str());
+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

@@ -363,7 +363,7 @@ struct llama_mlock {
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
-            size_t increment = len + 1048576;
+            size_t increment = size + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += increment;
            max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
        }
    }

-    void raw_unlock(void * ptr, size_t len) {
-        if (!VirtualUnlock(ptr, len)) {
+    void raw_unlock(void * addr, size_t size) {
+        if (!VirtualUnlock(addr, size)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
@@ -388,12 +388,12 @@ struct llama_mlock {
        return (size_t) 65536;
    }

-    bool raw_lock(const void * addr, size_t len) {
+    bool raw_lock(const void * addr, size_t size) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
        return false;
    }

-    void raw_unlock(const void * addr, size_t len) {}
+    void raw_unlock(const void * addr, size_t size) {}
 #endif
 };

@@ -404,10 +404,10 @@ struct llama_buffer {

    llama_buffer() = default;

-    void resize(size_t len) {
+    void resize(size_t size) {
        delete[] addr;
-        addr = new uint8_t[len];
-        size = len;
+        addr = new uint8_t[size];
+        this->size = size;
    }

    ~llama_buffer() {
--- a/llama.cpp
+++ b/llama.cpp
@@ -1,7 +1,6 @@
 // Defines fileno on msys:
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
-#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #endif
@@ -46,7 +45,6 @@ enum e_model {
    MODEL_65B,
 };

-
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -112,7 +110,7 @@ struct llama_hparams {
    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

    bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
+        return memcmp(this, &other, sizeof(llama_hparams));
    }
 };

@@ -504,7 +502,7 @@ struct llama_file_loader {

            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
                // skip to the next multiple of 32 bytes
-                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+                file.seek(-file.tell() & 31, SEEK_CUR);
            }
            shard.file_idx = file_idx;
            shard.file_off = file.tell();
@@ -579,7 +577,7 @@ struct llama_file_saver {
        file.write_u32(new_type);
        file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
        file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+        file.seek(-file.tell() & 31, SEEK_CUR);
        LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
        file.write_raw(new_data, new_size);
    }
@@ -646,7 +644,7 @@ struct llama_model_loader {
        }
    }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
        auto it = tensors_map.name_to_idx.find(name);
        if (it == tensors_map.name_to_idx.end()) {
            throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -657,10 +655,10 @@ struct llama_model_loader {
                         name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
        }

-        return get_tensor_for(lt, backend);
+        return get_tensor_for(lt);
    }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
        struct ggml_tensor * tensor;
        if (lt.ne.size() == 2) {
            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -670,7 +668,6 @@ struct llama_model_loader {
        }
        ggml_set_name(tensor, lt.name.c_str());
        LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
-        tensor->backend = backend;
        lt.ggml_tensor = tensor;
        num_ggml_tensors_created++;
        return tensor;
@@ -684,16 +681,12 @@ struct llama_model_loader {

    void load_all_data(llama_progress_callback progress_callback, void *  progress_callback_user_data, llama_mlock * lmlock) {
        size_t data_size = 0;
-        size_t prefetch_size = 0;
        for (const llama_load_tensor & lt : tensors_map.tensors) {
            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                prefetch_size += lt.size;
-            }
        }

        if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
            if (!lmlock) {
                // Don't call the callback since the actual loading will be lazy
                // and we can't measure it.
@@ -706,9 +699,6 @@ struct llama_model_loader {

        size_t done_size = 0;
        for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
            if (progress_callback) {
                progress_callback((float) done_size / data_size, progress_callback_user_data);
            }
@@ -721,6 +711,9 @@ struct llama_model_loader {
                lmlock->grow_to(done_size);
            }
        }
+        if (progress_callback) {
+            progress_callback(1.0f, progress_callback_user_data);
+        }
    }

    void load_data_for(llama_load_tensor & lt) {
@@ -845,21 +838,6 @@ bool llama_mlock_supported() {
    return llama_mlock::SUPPORTED;
 }

-void llama_init_backend() {
-    ggml_time_init();
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-}
-
-int64_t llama_time_us() {
-    return ggml_time_us();
-}
-
 //
 // model loading
 //
@@ -975,7 +953,27 @@ static void llama_model_load_internal(
    size_t ctx_size;
    size_t mmapped_size;
    ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size +
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+    }

    // create the ggml context
    {
@@ -997,14 +995,7 @@ static void llama_model_load_internal(
        }
    }

-#ifdef GGML_USE_CUBLAS
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#endif
-
    // prepare memory for the weights
-    size_t vram_total = 0;
    {
        const uint32_t n_embd  = hparams.n_embd;
        const uint32_t n_layer = hparams.n_layer;
@@ -1012,87 +1003,33 @@ static void llama_model_load_internal(

        ml->ggml_ctx = ctx;

-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
-
-        // "output" tensor
-        {
-            ggml_backend backend_output;
-            if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_output = LLAMA_BACKEND_OFFLOAD;
-            } else {
-                backend_output = GGML_BACKEND_CPU;
-            }
-
-            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
-        }
-
-        const int i_gpu_start = n_layer - n_gpu_layers;
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
+        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});

        model.layers.resize(n_layer);
        for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-
            auto & layer = model.layers[i];

            std::string layers_i = "layers." + std::to_string(i);

-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});

-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});

-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});

-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff},   backend);
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd}, backend);
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff},   backend);
-
-            if (backend == GGML_BACKEND_CUDA) {
-                vram_total +=
-                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)             +
-                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
-                    ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
-            }
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
        }
    }

    ml->done_getting_tensors();

-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size - vram_total + // weights in VRAM not in memory
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-
-#ifdef GGML_USE_CUBLAS
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-        }
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-#else
-        (void) n_gpu_layers;
-#endif
-    }
-
    // populate `tensors_by_name`
    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
@@ -1100,34 +1037,36 @@ static void llama_model_load_internal(

    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

+    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#endif // GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

-    if (progress_callback) {
-        progress_callback(1.0f, progress_callback_user_data);
-    }
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);

-    model.mapping = std::move(ml->mapping);
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#else
+    (void) n_gpu_layers;
+#endif

    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
@@ -1226,8 +1165,10 @@ static bool llama_eval_internal(
        {
            cur = ggml_rms_norm(ctx0, inpL);

-            // cur = cur*attention_norm(broadcasted)
-            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            // cur = attention_norm*cur
+            cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
+                        cur);
        }

        // self-attention
@@ -1334,8 +1275,10 @@ static bool llama_eval_internal(
            {
                cur = ggml_rms_norm(ctx0, inpFF);

-                // cur = cur*ffn_norm(broadcasted)
-                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                // cur = ffn_norm*cur
+                cur = ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
+                        cur);
            }

            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1372,8 +1315,10 @@ static bool llama_eval_internal(

        inpL = ggml_rms_norm(ctx0, inpL);

-        // inpL = inpL*norm(broadcasted)
-        inpL = ggml_mul(ctx0, inpL, model.norm);
+        // inpL = norm*inpL
+        inpL = ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model.norm, inpL),
+                    inpL);

        embeddings = inpL;
    }
@@ -2197,7 +2142,7 @@ struct llama_context * llama_init_from_file(
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
-                *cur_percentage_p = percentage;
+                ++*cur_percentage_p;
                fprintf(stderr, ".");
                fflush(stderr);
                if (percentage >= 100) {
@@ -2354,7 +2299,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

        // maybe this should in llama_model_loader
        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
        }
    }

@@ -2447,7 +2392,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                }
                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
                llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
                lt.data = (uint8_t *) lt.ggml_tensor->data;
                model_loader->load_data_for(lt);
                lt.ggml_tensor->data = lt.data;
--- a/llama.h
+++ b/llama.h
@@ -40,9 +40,9 @@ extern "C" {
    typedef int llama_token;

    typedef struct llama_token_data {
-        llama_token id; // token id
-        float logit;    // log-odds of the token
-        float p;        // probability of the token
+        llama_token id;  // token id
+        float logit; // log-odds of the token
+        float p;     // probability of the token
    } llama_token_data;

    typedef struct llama_token_data_array {
@@ -73,16 +73,16 @@ extern "C" {

    // model file types
    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
    };

    LLAMA_API struct llama_context_params llama_context_default_params();
@@ -90,13 +90,6 @@ extern "C" {
    LLAMA_API bool llama_mmap_supported();
    LLAMA_API bool llama_mlock_supported();

-    // TODO: not great API - very likely to change
-    // Initialize the llama + ggml backend
-    // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
-
-    LLAMA_API int64_t llama_time_us();
-
    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
Author	SHA1	Message	Date
Georgi Gerganov	a1cdd29cd2	ggml : rms_norm in chunks	2023-05-20 10:15:54 +03:00
Georgi Gerganov	5a317898e8	ggml : process mul mat rows in chunks	2023-05-20 10:15:53 +03:00