cleanup, try to add version tagging

Concedo
2024-11-23 12:59:06 +08:00
parent 1dd37933e3
commit afc575fbd8
141 changed files with 123 additions and 14439 deletions


@@ -24,7 +24,9 @@ jobs:
id: make_build
run: |
make LLAMA_METAL=1 LLAMA_PORTABLE=1
pyinstaller --noconfirm --onefile --collect-all customtkinter --collect-all psutil --add-data './koboldcpp_default.so:.' --add-data './ggml-metal-merged.metal:.' --add-data './kcpp_adapters:./kcpp_adapters' --add-data './klite.embd:.' --add-data './kcpp_docs.embd:.' --add-data './kcpp_sdui.embd:.' --add-data './taesd.embd:.' --add-data './taesd_xl.embd:.' --add-data './rwkv_vocab.embd:.' --add-data './rwkv_world_vocab.embd:.' --clean --console koboldcpp.py -n "koboldcpp-mac-arm64"
chmod +x './create_ver_file.sh'
. create_ver_file.sh
pyinstaller --noconfirm --onefile --collect-all customtkinter --collect-all psutil --add-data './koboldcpp_default.so:.' --add-data './ggml-metal-merged.metal:.' --add-data './kcpp_adapters:./kcpp_adapters' --add-data './klite.embd:.' --add-data './kcpp_docs.embd:.' --add-data './kcpp_sdui.embd:.' --add-data './taesd.embd:.' --add-data './taesd_xl.embd:.' --add-data './rwkv_vocab.embd:.' --add-data './rwkv_world_vocab.embd:.' --version-file './version.txt' --clean --console koboldcpp.py -n "koboldcpp-mac-arm64"
- name: Test
id: test
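
The updated step above runs `create_ver_file.sh` before PyInstaller so that the generated `version.txt` can be passed through `--version-file`. A minimal sketch of the same flow, trimmed to the version-related flags (a hypothetical local run from the repository root; the full CI command keeps all the `--add-data` entries shown above):

```sh
# Hedged sketch of the new packaging flow (version-related parts only).
make LLAMA_METAL=1 LLAMA_PORTABLE=1
chmod +x './create_ver_file.sh'
. create_ver_file.sh                 # writes version.txt from version_template.txt
pyinstaller --noconfirm --onefile \
  --version-file './version.txt' \
  --console koboldcpp.py -n "koboldcpp-mac-arm64"
```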


@@ -1,16 +0,0 @@
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-apple-darwin-macho )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

create_ver_file.bat Normal file

@@ -0,0 +1,33 @@
@echo off
setlocal enabledelayedexpansion
echo Create Version File
:: Read the version string from koboldcpp.py
for /f "tokens=2 delims== " %%A in ('findstr "KcppVersion" koboldcpp.py') do (
set "version=%%~A"
goto :done
)
:done
:: Display the extracted version (optional, for debugging)
echo Extracted Version: %version%
for /f "tokens=1,2 delims=." %%a in ("%version%") do (
set version_major=%%a
set version_minor=%%b
)
echo Major Version: %version_major%
echo Minor Version: %version_minor%
:: Replace the MYVER_MAJOR and MYVER_MINOR placeholders in version_template.txt with the extracted version
(
for /f "delims=" %%i in (version_template.txt) do (
set "line=%%i"
set "line=!line:MYVER_MAJOR=%version_major%!"
set "line=!line:MYVER_MINOR=%version_minor%!"
echo !line!
)
) > "version.txt"
endlocal

create_ver_file.sh Normal file

@@ -0,0 +1,11 @@
#!/bin/bash
echo "Create Version File"
extracted_ver=$(cat koboldcpp.py | grep 'KcppVersion = ' | cut -d '"' -f2)
echo "Extracted Version: $extracted_ver"
vmajor=$(echo $extracted_ver | cut -d '.' -f1)
vminor=$(echo $extracted_ver | cut -d '.' -f2)
echo "Major Version: $vmajor"
echo "Minor Version: $vminor"
cp version_template.txt version.txt
sed -i.bak "s/MYVER_MAJOR/$vmajor/g;s/MYVER_MINOR/$vminor/g" version.txt  # -i.bak works with both GNU and BSD sed
rm -f version.txt.bak
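
Both scripts read the `KcppVersion` string out of `koboldcpp.py`, split it into major and minor parts, and substitute the `MYVER_MAJOR`/`MYVER_MINOR` placeholders in `version_template.txt`. A self-contained way to sanity-check the extraction logic outside the build (the sample version string below is hypothetical):

```sh
# Hedged sketch: exercise the grep/cut extraction against a throwaway sample file.
printf 'KcppVersion = "1.79"\n' > /tmp/koboldcpp_sample.py
ver=$(grep 'KcppVersion = ' /tmp/koboldcpp_sample.py | cut -d '"' -f2)
echo "extracted: $ver"                          # -> extracted: 1.79
echo "major: $(echo "$ver" | cut -d '.' -f1)"   # -> major: 1
echo "minor: $(echo "$ver" | cut -d '.' -f2)"   # -> minor: 79
```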


@@ -1,259 +0,0 @@
# llama.cpp for CANN
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Model Supports](#model-supports)
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [TODO](#todo)
## Background
**Ascend NPU** is a range of AI processors built around a Neural Processing Unit, designed to efficiently handle matrix-matrix multiplication, dot products and scalar operations.
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
**Llama.cpp + CANN**
The llama.cpp CANN backend is designed to support Ascend NPUs. It uses the AscendC and ACLNN capabilities that are integrated into the CANN Toolkit and kernels to drive the Ascend NPU directly.
## News
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
- Create CANN backend for Ascend NPU.
## OS
| OS | Status | Verified |
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 22.04, OpenEuler22.03 |
## Hardware
### Ascend NPU
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
*Notes:*
- If you have trouble with your Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the table above.
## Model Supports
| Model Name | FP16 | Q8_0 | Q4_0 |
|:----------------------------|:-----:|:----:|:----:|
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | √ | x | √ |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | √ | √ | √ |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | √ | √ | √ |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | √ | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | √ | √ | √ |
| pythia-70M | x | x | x |
| Qwen-7B | √ | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | √ | √ | √ |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | √ | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | √ | √ | √ |
## DataType Supports
| DataType | Status |
|:----------------------:|:-------:|
| FP16 | Support |
| Q8_0 | Support |
| Q4_0 | Support |
## Docker
### Build Images
You can build an image with llama.cpp in one command.
```sh
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
```
### Run container
```sh
# Find all cards.
npu-smi info
# Select the cards that you want to use and make sure they are not in use by someone else.
# The following uses the card of device 0.
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
```
*Notes:*
- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
## Linux
### I. Setup Environment
1. **Install Ascend Driver and firmware**
```sh
# create driver running user.
sudo groupadd -g HwHiAiUser
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
sudo usermod -aG HwHiAiUser $USER
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
```
Once installed, run `npu-smi info` to check whether the driver was installed successfully.
```sh
+-------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
+----------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+======================+===============+====================================================+
| 2 xxx | OK | 64.4 51 15 / 15 |
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
+======================+===============+====================================================+
| 5 xxx | OK | 64.0 52 15 / 15 |
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
+======================+===============+====================================================+
| No running processes found in NPU 2 |
+======================+===============+====================================================+
| No running processes found in NPU 5 |
+======================+===============+====================================================+
```
2. **Install Ascend Firmware**
```sh
# download firmware from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install firmware.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
```
If the following message appears, the firmware was installed successfully.
```sh
Firmware package installed successfully!
```
3. **Install CANN toolkit and kernels**
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
Please download the version that matches your system. The minimum required version is 8.0.RC2.alpha002; the install commands are shown below.
```sh
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
```
Set Ascend Variables:
```sh
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
source ~/.bashrc
```
Upon a successful installation, CANN is enabled for the available Ascend devices.
### II. Build llama.cpp
```sh
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
### III. Run the inference
1. **Retrieve and prepare model**
You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model preparation.
**Notes**:
- The CANN backend currently supports only FP16/Q4_0/Q8_0 models (a hedged quantization example follows the launch examples below).
2. **Launch inference**
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically choose the devices with the same backend.
| Device selection | Parameter |
|:----------------:|:--------------------------------------:|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
Examples:
- Use device 0:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
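Since the CANN backend only accepts FP16/Q4_0/Q8_0 weights at the moment, an FP16 GGUF may first need to be quantized. A hedged example using llama.cpp's quantize tool (file names are illustrative):
```sh
./build/bin/llama-quantize ./models/model-f16.gguf ./models/model-q4_0.gguf Q4_0
```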
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag to issue/PR titles to help the CANN team check and address them without delay.
## TODO
- Support more models and data types.


@@ -1,50 +0,0 @@
#!/bin/bash
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.6
--mirostat 2)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}.
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"


@@ -1,61 +0,0 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
#
if [ $# -lt 2 ]; then
echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
exit 1
fi
eargs=""
if [ $# -gt 2 ]; then
eargs="${@:3}"
fi
ftmp="__llama.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT
echo "Translate from English to French:
===
sea otter, peppermint, plush girafe:
sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche
===
violin
violin => violon
===
phone, computer, mouse, keyboard:
phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier
===
" > $ftmp
echo "$2
" >> $ftmp
model=$1
# generate the most likely continuation until the string "===" is found
./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs


@@ -1,204 +0,0 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
return 1;
}
common_init();
int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = params.n_pp;
std::vector<int> n_tg = params.n_tg;
std::vector<int> n_pl = params.n_pl;
// init LLM
llama_backend_init();
llama_numa_init(params.numa);
// initialize the model
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context_params ctx_params = common_context_params_to_llama(params);
// ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
const int32_t n_kv_max = llama_n_ctx(ctx);
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
};
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
llama_synchronize(ctx);
}
return true;
};
// warm up
{
for (int i = 0; i < 16; ++i) {
common_batch_add(batch, 0, i, { 0 }, false);
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
if (!params.batched_bench_output_jsonl) {
LOG("\n");
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG("\n");
LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
}
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
const int pp = n_pp[i_pp];
const int tg = n_tg[i_tg];
const int pl = n_pl[i_pl];
const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
if (n_ctx_req > n_kv_max) {
continue;
}
common_batch_clear(batch);
for (int i = 0; i < pp; ++i) {
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
common_batch_add(batch, 0, i, { j }, false);
}
}
batch.logits[batch.n_tokens - 1] = true;
const auto t_pp_start = ggml_time_us();
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}
const auto t_pp_end = ggml_time_us();
const auto t_tg_start = ggml_time_us();
for (int i = 0; i < tg; ++i) {
common_batch_clear(batch);
for (int j = 0; j < pl; ++j) {
common_batch_add(batch, 0, pp + i, { j }, true);
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
const auto t_tg_end = ggml_time_us();
const int32_t n_kv = n_ctx_req;
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
const float t = t_pp + t_tg;
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
const float speed_tg = pl*tg / t_tg;
const float speed = n_kv / t;
if(params.batched_bench_output_jsonl) {
LOG(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
);
} else {
LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
}
}
}
LOG("\n");
llama_perf_context_print(ctx);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
LOG("\n\n");
return 0;
}
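
For reference, the usage string printed by `print_usage` above corresponds to an invocation along these lines (the binary name is assumed to be the batched-bench build target; the model path is illustrative):

```sh
./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 \
  -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```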


@@ -1,9 +0,0 @@
.DS_Store
/.build
/Packages
xcuserdata/
DerivedData/
.swiftpm/configuration/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
batched_swift


@@ -1,6 +0,0 @@
.PHONY: build
build:
xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
rm -f ./llama-batched-swift
ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift


@@ -1,22 +0,0 @@
// swift-tools-version: 5.5
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
let package = Package(
name: "llama-batched-swift",
platforms: [.macOS(.v12)],
dependencies: [
.package(name: "llama", path: "../../"),
],
targets: [
// Targets are the basic building blocks of a package, defining a module or a test suite.
// Targets can depend on other targets in this package and products from dependencies.
.executableTarget(
name: "llama-batched-swift",
dependencies: ["llama"],
path: "Sources",
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
),
]
)


@@ -1,251 +0,0 @@
import Foundation
import llama
let arguments = CommandLine.arguments
// Check that we have at least one argument (the model path)
guard arguments.count > 1 else {
print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
exit(1)
}
let modelPath: String = arguments[1]
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
// total length of the sequences including the prompt
let n_len: Int = 32
// init LLM
llama_backend_init()
defer {
llama_backend_free()
}
let model_params = llama_model_default_params()
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
print("Failed to load model")
exit(1)
}
defer {
llama_free_model(model)
}
var tokens = tokenize(text: prompt, add_bos: true)
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
var context_params = llama_context_default_params()
context_params.n_ctx = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
let context = llama_new_context_with_model(model, context_params)
guard context != nil else {
print("Failed to initialize context")
exit(1)
}
defer {
llama_free(context)
}
var sparams = llama_sampler_chain_default_params()
let smpl = llama_sampler_chain_init(sparams)
guard smpl != nil else {
print("Failed to initialize sampling")
exit(1)
}
defer {
llama_sampler_free(smpl)
}
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
let n_ctx = llama_n_ctx(context)
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}
var buffer: [CChar] = []
for id: llama_token in tokens {
print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
}
print("\n")
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
defer {
llama_batch_free(batch)
}
// evaluate the initial prompt
batch.n_tokens = Int32(tokens.count)
for (i, token) in tokens.enumerated() {
batch.token[i] = token
batch.pos[i] = Int32(i)
batch.n_seq_id[i] = 1
// batch.seq_id[i][0] = 0
// TODO: is this the proper way to do this?
if let seq_id = batch.seq_id[i] {
seq_id[0] = 0
}
batch.logits[i] = 0
}
// llama_decode will output logits only for the last token of the prompt
batch.logits[Int(batch.n_tokens) - 1] = 1
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
exit(1)
}
for i in 1 ..< n_parallel {
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}
if n_parallel > 1 {
print("generating \(n_parallel) sequences ...\n")
}
var streams: [String] = .init(repeating: "", count: n_parallel)
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
var n_cur = batch.n_tokens
var n_decode = 0
let t_main_start = ggml_time_us()
while n_cur <= n_len {
// prepare the next batch
batch.n_tokens = 0
// sample the next token for each parallel sequence / stream
for i in 0 ..< n_parallel {
if i_batch[i] < 0 {
// the stream has already finished
continue
}
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
// is it an end of stream? -> mark the stream as finished
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
print("stream \(i) finished at n_cur = \(n_cur)")
}
continue
}
let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
// if there is only one stream, we print immediately to stdout
if n_parallel == 1 {
print(nextStringPiece, terminator: "")
}
streams[i] += nextStringPiece
// push this new token for next evaluation
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.n_seq_id[Int(batch.n_tokens)] = 1
if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
seq_id[0] = Int32(i)
}
batch.logits[Int(batch.n_tokens)] = 1
i_batch[i] = batch.n_tokens
batch.n_tokens += 1
n_decode += 1
}
// all streams are finished
if batch.n_tokens == 0 {
break
}
n_cur += 1
// evaluate the current batch with the transformer model
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
exit(1)
}
}
if n_parallel > 1 {
print("\n")
for (i, stream) in streams.enumerated() {
print("sequence \(i):\n\n\(prompt)\(stream)\n")
}
}
let t_main_end = ggml_time_us()
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
llama_perf_sampler_print(smpl)
llama_perf_context_print(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)])
}
tokens.deallocate()
return swiftTokens
}
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
var result = [CChar](repeating: 0, count: 8)
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
if nTokens < 0 {
let actualTokensCount = -Int(nTokens)
result = .init(repeating: 0, count: actualTokensCount)
let check = llama_token_to_piece(
model,
token,
&result,
Int32(result.count),
0,
false
)
assert(check == actualTokensCount)
} else {
result.removeLast(result.count - Int(nTokens))
}
if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
return utfString
} else {
buffer.append(contentsOf: result)
let data = Data(buffer.map { UInt8(bitPattern: $0) })
if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
buffer = []
}
guard let bufferString = String(data: data, encoding: .utf8) else {
return nil
}
buffer = []
return bufferString
}
}
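
Assuming the Makefile above has produced the `./llama-batched-swift` symlink, the usage guard at the top of this file suggests an invocation like the following (model path, prompt, and parallel count are illustrative):

```sh
./llama-batched-swift ./models/model.gguf "Hello my name is" 4
```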


@@ -1,57 +0,0 @@
@setlocal disabledelayedexpansion enableextensions
@echo off
cd /d "%~dp0.."
if not "%errorlevel%"=="0" (
echo Unable to change directory.
pause
exit /b 1
)
if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
if not defined USER_NAME set "USER_NAME=User"
if not defined AI_NAME set "AI_NAME=ChatLLaMa"
rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
rem Get main script path from command line arguments
set "MAIN_SCRIPT_PATH=%~1"
rem If the main script path was not specified, try the default paths
if not defined MAIN_SCRIPT_PATH (
for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
)
)
rem If the main script path was not found, tell the user how to specify it
if not defined MAIN_SCRIPT_PATH (
echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
echo %DEFAULT_MAIN_SCRIPT_PATHS%
pause
exit /b 1
)
rem Default context, feel free to edit it
set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
rem Set a temporary variable if N_THREAD is set
if defined N_THREAD (
set "_N_THREAD=--threads %N_THREAD%"
) else (
set "_N_THREAD="
)
rem Run the script
echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
--model "%MODEL%" ^
--n_predict %N_PREDICTS% ^
--color --interactive ^
--reverse-prompt "%USER_NAME%:" ^
--prompt "%PROMPT_TEXT%"


@@ -1,41 +0,0 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="${USER_NAME:-USER}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "${USER_NAME}:" \
--in-prefix ' ' \
"$@"


@@ -1,149 +0,0 @@
#!/bin/bash
set -euo pipefail
cd "$(dirname "$0")/.." || exit
if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
exit 1
fi
MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
DATE_TIME="$(date +%H:%M)"
DATE_YEAR="$(date +%Y)"
LOG="${CHAT_SAVE_DIR}/main.log"
LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
'|'\
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
# An unbuffered `tail -c+N`
skip_bytes() {
LANG=C IFS= read -r -n "$1" -d '' c
while LANG=C IFS= read -r -n 1 -d '' c; do
printf '%s' "$c"
done
}
mkdir -p "$CHAT_SAVE_DIR"
echo >"$LOG"
trap "tail -n100 ${LOG}" EXIT
if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
-e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
-e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
-e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
"$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
fi
if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
fi
if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
echo '...' >>"$NEXT_PROMPT_FILE"
fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
--file "$CUR_PROMPT_FILE" \
--n_predict 1
echo
echo 'Done!'
fi
if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
fi
if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
fi
printf '%s ' "$(< "$CUR_PROMPT_FILE")"
n_tokens=0
while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then
wait # for background main (below) to finish with next prompt
mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
echo '...' >>"$NEXT_PROMPT_FILE"
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
n_tokens=0
n_predict=$((CTX_SIZE / 2))
fi
echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi
n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
# if we hit n_predict instead of reverse-prompt, we need to add the prompt
if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
printf '\n%s:' "$USER_NAME"
printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
fi
printf ' '
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi
# Update cache for next prompt in background, ideally during user input
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &
done


@@ -1,41 +0,0 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="### Human"
AI_NAME="### Assistant"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "### Human:" \
--in-prefix ' ' \
"$@"


@@ -1,16 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
# Important:
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt


@@ -1,939 +0,0 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "log.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
#include <sstream>
#include <algorithm>
#include <string>
// GGUF keys & tensor names.
#define KV_GENERAL_ARCHITECTURE "general.architecture"
#define KV_GENERAL_NAME "general.name"
#define KV_TOKENIZER_MODEL "tokenizer.ggml.model"
#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens"
#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type"
#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores"
#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id"
#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id"
#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id"
#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id"
#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id"
#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json"
#define KV_CONTEXT_LENGTH "llama.context_length"
#define KV_EMBEDDING_LENGTH "llama.embedding_length"
#define KV_BLOCK_COUNT "llama.block_count"
#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length"
#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count"
#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv"
#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon"
#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count"
#define TN_TOKEN_EMBD "token_embd.weight"
#define TN_OUTPUT_NORM "output_norm.weight"
#define TN_OUTPUT "output.weight"
#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
#define TN_ATTN_Q "blk.%d.attn_q.weight"
#define TN_ATTN_K "blk.%d.attn_k.weight"
#define TN_ATTN_V "blk.%d.attn_v.weight"
#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
#define TN_FFN_UP "blk.%d.ffn_up.weight"
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3
#define TOKENIZER_NAME "llama"
#define UNKNOWN_TOKEN_ID 0
#define BOS_TOKEN_ID 1
#define EOS_TOKEN_ID 2
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
int dim; // transformer dimension
int hidden_dim; // for ffn layers
int n_layers; // number of layers
int n_heads; // number of query heads
int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
int vocab_size; // vocabulary size, usually 256 (byte-level)
int seq_len; // max sequence length
} Config;
struct TransformerWeights {
// token embedding table
std::vector<float> token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
std::vector<float> rms_ffn_weight; // (layer, dim)
// weights for matmuls
std::vector<float> wq; // (layer, dim, dim)
std::vector<float> wk; // (layer, dim, dim)
std::vector<float> wv; // (layer, dim, dim)
std::vector<float> wo; // (layer, dim, dim)
// weights for ffn
std::vector<float> w1; // (layer, hidden_dim, dim)
std::vector<float> w2; // (layer, dim, hidden_dim)
std::vector<float> w3; // (layer, hidden_dim, dim)
// final rmsnorm
std::vector<float> rms_final_weight; // (dim,)
// freq_cis for RoPE relatively positional embeddings
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
std::vector<float> wcls;
};
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim);
LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
die("Invalid configuration. Failed to allocate memory for weights");
}
}
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
return 0;
}
static void print_sample_weights(TransformerWeights *w){
LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
LOG_INF("%f\n", w->token_embedding_table[0]);
LOG_INF("%f\n", w->rms_att_weight[0]);
LOG_INF("%f\n", w->rms_ffn_weight[0]);
LOG_INF("%f\n", w->wq[0]);
LOG_INF("%f\n", w->wk[0]);
LOG_INF("%f\n", w->wv[0]);
LOG_INF("%f\n", w->wo[0]);
LOG_INF("%f\n", w->w1[0]);
LOG_INF("%f\n", w->w2[0]);
LOG_INF("%f\n", w->w3[0]);
LOG_INF("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
struct my_llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_data {
token text;
float score;
ttype type;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_head_kv = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
};
struct my_llama_layer {
// normalization
struct ggml_tensor * attention_norm;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
// normalization
struct ggml_tensor * ffn_norm;
// ff
struct ggml_tensor * w1;
struct ggml_tensor * w2;
struct ggml_tensor * w3;
};
struct my_llama_model {
struct ggml_context * ctx = NULL;
std::string name;
my_llama_hparams hparams;
struct ggml_tensor * tok_embeddings;
struct ggml_tensor * norm;
struct ggml_tensor * output;
std::vector<my_llama_layer> layers;
uint32_t train_its = 0;
uint32_t train_samples = 0;
uint32_t train_tokens = 0;
};
struct train_params {
const char * fn_vocab_model;
const char * fn_llama2c_model;
const char * fn_llama2c_output_model;
const char * fn_train_data;
const char * fn_checkpoint_in;
const char * fn_checkpoint_out;
const char * fn_model_out;
uint32_t seed;
int n_ctx;
int n_embd;
int n_mult;
int n_head;
int n_layer;
int n_rotmax;
int n_threads;
int n_batch;
int n_examples;
int n_predict;
int print_info_interval;
int print_details_interval;
bool samples_start_after_nl;
bool use_adam;
bool use_flash;
bool use_scratch;
// only adam
int warmup;
int cos_decay_steps;
float cos_decay_restart;
float cos_decay_alpha;
int lbfgs_n_iter;
int adam_n_iter;
float adam_alpha;
float adam_decay;
int mem_model_gb;
int mem_compute_gb;
int mem_compute0_gb;
int mem_compute1_gb;
};
static void print_params(struct my_llama_hparams * params) {
LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG_INF("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
if (i > 0) LOG("x ");
LOG("[%" PRId64 "] ", t->ne[i]);
total *= t->ne[i];
}
if (i > 1) LOG("= [%" PRId64 "] ", total);
LOG("float space for %s\n", ggml_get_name(t));
}
}
static void init_model(struct my_llama_model * model) {
const auto & hparams = model->hparams;
const uint32_t n_embd = hparams.n_embd;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
const uint32_t n_ff = hparams.n_ff;
struct ggml_context * ctx = model->ctx;
model->train_its = 0;
model->train_samples = 0;
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
ggml_set_name(model->output, "output.weight");
model->layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = model->layers[i];
std::string layers_i = "layers." + std::to_string(i);
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
print_tensor_info(ctx);
}
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
return *ptr;
}
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %f", p);
}
LOG("\n");
}
static void print_matrix(struct ggml_tensor * probs) {
assert(ggml_is_matrix(probs));
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
LOG(" %.2f", p);
}
LOG("\n");
}
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
} else {
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
die_fmt("fread failed: %s", strerror(errno));
}
if (ret != 1) {
die("unexpectedly reached end of file");
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::float_t read_f32() {
std::float_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
static bool is_ggml_file(const char * filename) {
llama_file file(filename, "rb");
if (file.size < 4) {
return false;
}
std::string magic = file.read_string(4);
return magic == GGUF_MAGIC;
}
static std::string llama_escape_whitespaces(const std::string & text) {
std::ostringstream out;
for (char c : text) {
if (c == ' ') out << "\xe2\x96\x81";
else out << c;
}
return out.str();
}
static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
if (is_ggml_file(filename)) {
LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(filename, params);
GGML_ASSERT(ctx != NULL);
const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
GGML_ASSERT(model_idx >= 0);
std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
GGML_ASSERT(token_idx >= 0);
const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
GGML_ASSERT(score_idx >= 0);
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
GGML_ASSERT(toktype_idx >= 0);
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
}
vocab->id_to_token.resize(n_vocab);
for (uint32_t i = 0; i < n_vocab; i++) {
std::string word = gguf_get_arr_str(ctx, token_idx, i);
vocab->token_to_id[word] = i;
auto & token_data = vocab->id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores[i];
token_data.type = (llama_token_type) toktypes[i];
}
ggml_free(ctx_data);
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string text = file.read_string(len);
unsigned char byte_val;
my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
if (id == UNKNOWN_TOKEN_ID) {
text = "<unk>";
type = LLAMA_TOKEN_TYPE_UNKNOWN;
} else if (id == BOS_TOKEN_ID) {
text = "<s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (id == EOS_TOKEN_ID) {
text = "</s>";
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (text.empty()) {
type = LLAMA_TOKEN_TYPE_CONTROL;
} else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
// Text of byte tokens is already in the expected format.
type = LLAMA_TOKEN_TYPE_BYTE;
} else {
type = LLAMA_TOKEN_TYPE_NORMAL;
}
text = llama_escape_whitespaces(text);
vocab->id_to_token[id].text = text;
vocab->id_to_token[id].score = score;
vocab->id_to_token[id].type = type;
vocab->token_to_id.emplace(text, id);
}
}
}
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int size = 1;
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
size *= gg_weights->ne[dim];
}
for (int ct = 0; ct < size; ++ct) {
int64_t i0 = 0; int64_t i1 = 0;
int64_t i2 = 0; int64_t i3 = 0;
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
}
}
static void save_as_llama_model(
struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
) {
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
//print_row(model->norm, 0);
// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;
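// ratio of query heads per KV head (grouped-query attention); falls back to 1 when n_head_kv is unset or not smaller than n_head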
const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
}
struct gguf_context * ctx = gguf_init_empty();
std::vector<const char*> tokens;
std::vector<float> scores;
std::vector<llama_token_type> token_types;
for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
tokens.push_back(token_data.text.c_str());
scores.push_back(token_data.score);
token_types.push_back(token_data.type);
}
gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
// special tokens
gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
// write tensors
ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
gguf_add_tensor(ctx, model->tok_embeddings);
ggml_set_name(model->norm, TN_OUTPUT_NORM);
gguf_add_tensor(ctx, model->norm);
ggml_set_name(model->output, TN_OUTPUT);
gguf_add_tensor(ctx, model->output);
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
auto & layer = model->layers[i];
ggml_format_name(layer.wq, TN_ATTN_Q, i);
gguf_add_tensor(ctx, layer.wq);
ggml_format_name(layer.wk, TN_ATTN_K, i);
gguf_add_tensor(ctx, layer.wk);
ggml_format_name(layer.wv, TN_ATTN_V, i);
gguf_add_tensor(ctx, layer.wv);
ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
gguf_add_tensor(ctx, layer.wo);
ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
gguf_add_tensor(ctx, layer.attention_norm);
ggml_format_name(layer.w1, TN_FFN_GATE, i);
gguf_add_tensor(ctx, layer.w1);
ggml_format_name(layer.w2, TN_FFN_DOWN, i);
gguf_add_tensor(ctx, layer.w2);
ggml_format_name(layer.w3, TN_FFN_UP, i);
gguf_add_tensor(ctx, layer.w3);
ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
gguf_add_tensor(ctx, layer.ffn_norm);
}
gguf_write_to_file(ctx, filename, false);
gguf_free(ctx);
}
static struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
params.fn_checkpoint_out = "checkpoint.bin";
params.fn_model_out = "ggml-checkpoint-f32.bin";
params.seed = -1;
params.n_ctx = 128;
params.n_embd = 256;
params.n_mult = 256;
params.n_head = 8;
params.n_layer = 16;
params.n_rotmax = 64;
params.n_threads = 6;
params.n_batch = 8;
params.n_examples = 8;
params.n_predict = 1024;
params.print_info_interval = 1;
params.print_details_interval = 2;
params.samples_start_after_nl = false;
params.use_adam = true;
params.use_flash = false;
params.use_scratch = true;
// only adam
params.warmup = 100;
params.cos_decay_steps = 1000;
params.cos_decay_restart = 1.1f;
params.cos_decay_alpha = 0.0f;
params.lbfgs_n_iter = 16;
params.adam_n_iter = 16;
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
params.mem_compute0_gb = 8;
params.mem_compute1_gb = 2;
return params;
}
static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n");
}
static bool params_parse(int argc, char ** argv, struct train_params * params) {
bool invalid_param = false;
bool reqd_param_found = false;
std::string arg;
struct train_params default_params = get_default_train_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "--copy-vocab-from-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_vocab_model = argv[i];
} else if (arg == "--llama2c-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
reqd_param_found = true;
params->fn_llama2c_model = argv[i];
} else if (arg == "--llama2c-output-model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_llama2c_output_model = argv[i];
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, &default_params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv, &default_params);
exit(1);
}
if (!reqd_param_found){
fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
print_usage(argc, argv, &default_params);
exit(1);
}
return true;
}
static std::string basename(const std::string &path) {
size_t pos = path.find_last_of("/\\");
if (pos == std::string::npos) {
return path;
}
return path.substr(pos + 1);
}
int main(int argc, char ** argv) {
common_init();
struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) {
return 1;
}
Config config;
TransformerWeights weights = {};
{
LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
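// llama2.c signals unshared classifier weights by storing a negative vocab_size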
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
}
struct my_llama_vocab vocab;
load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_head_kv = config.n_kv_heads;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);
struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
lcparams.no_alloc = false;
model.ctx = ggml_init(lcparams);
init_model(&model);
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;
}

View File

@@ -1,582 +0,0 @@
That game
I can see
Hmm, this
I can relate to
Who is
I understand the
Ugh,
What the hell was
Hey, did anyone
Although
Thank you for choosing
What are you
Oh w
How dare you open
It was my pleasure
I'm hon
I appreciate that you
Are you k
Whoever left this
It's always
Ew,
Hey, I l
Hello? Is someone
I understand that
That poem
Aww, poor
Hey, it
Alright, who
I didn't
Well, life
The document
Oh no, this
I'm concerned
Hello, this is
This art
Hmm, this drink
Hi there!
It seems
Is
Good
I can't
Ex
Who are
I can see that
Wow,
Today is a
Hey friend
Sometimes friends
Oh, this old
The weather outside
This place is sur
I appreciate your input
Thank you for the
Look at
I'm disappoint
To my
How dare you
That's an
This piece of art
Eww
This park is
This is incredible
Oh no, someone
Exc
Well, it'
I warned
Hey, I understand
Hey, I saw
How dare you go
What the he
Hey
It's
Hello? Hello?
It
Oh no!
This is the perfect
Good morning,
Oh no, there
It's so
Yeah
Uh,
Hello everyone
Who turned off
The weather
Who'
Hey, this
Wait,
Eww, gross
Excuse
It seems like you
Thank you so
What happened?
Oh my g
I am deeply sad
I war
Okay, let'
Hey, that
That was a beautiful
Oh no! That
What happened
Hey there
The artist'
What?!
Hey, it'
I am disappoint
It seems like
Oh no! The
This park is a
If you
Yes! I did
It sounds
What
Who is it
Hmm, that
That's strange
Yeah, that was
That's interesting
This park
What the hell
Who is that
I feel like my
Oh well
What the hell is
Hello? Hello
To my dearest
Bless you!\"
Thank you for
Oh, looks like
Can you please
This place is
Eww, what
Bless you
Is everything
Hey, I just
Whoever left these
Well, that'
I feel
Hey, do you
It's sad
Oh no, it
Hey, that'
Oh my god,
Thank you,
Hello little one,
I apolog
Hey team, I
How dare you read
Who is this and
Whoever left
Hi there! W
A
If you have
I was
U
Bless
Well, this
Oh, I'
It's a
Eww,
Is everything okay?
Oh, I
Hello, can you
Al
That was a great
What are
I understand that not
Oh no, not
Who is it?\"
Hey, can we
Whoever is taking
I would love to
Hey, I noticed
Hey, could
I understand that there
Hello?
D
Oh man, I
Thank you so much
Oh no, my
Dear [Name
Uh
I remember
Hey, who
Well, it
Are you
I understand that it
Hey, is
I would
Who is this
Excuse me
Alright
I am thrilled
Sometimes friends have
Who the
It's interesting
I would love
E
Hello? Is anyone
Well, this is
This place
Well,
I warned you
Hey, watch where
Oh my
That'
Sometimes friends have different
I understand that everyone
What?
What do these notes
I can relate
I'm not
I understand
To my dear
Guys
Well
Hey, I appreciate
Wow, what
Dear
That melody
Who the hell
Today is
Hello little
Wow, look
That's great
Love is never wrong
I'm having
Whoa, did
Ugh
Can you please provide
I miss you,
I feel uncom
I know
Ugh, this
Hey, watch
Oh great, a
I didn
Okay
That game of char
Oh
I appreciate
Who's there
I am so
Oh great, someone
Hey, could you
I remember wondering
Wait, what?
What do
Hello? Can
Hey there,
That game of
This is incred
Oh my gosh
Oh great, f
I appreciate your
It sounds like
What the heck
Okay, I understand
Ew
I understand that this
Uh, hi
Hi everyone!
What the hell?
Thank you for your
Oh no, the
Wow, I
Who turned
Dear [
Whoever
This is a
Whoa, he
What in the world
Although the physical
Hello, who is
That's amaz
Hey, I know
Okay, that
Hi everyone
Hey, is everything
I understand your fr
Oh no, poor
Oh, look
Good morning
Ew, gross
Oh no, did
Look at the family
Hey team
Yes!
Hey, can I
Okay, that'
It's great
Love is
Hey, what
Good morning, world
Who is it?
That poem really reson
I
That's
I understand the task
Gu
Hello? Who'
This postcard is
Whoa,
Oh, that
I understand that I
Whoever is
Hello? Who is
I'm really
Wow, this
Can
This artwork really
This is a shame
I miss you too
Who are you?
Today is a difficult
Hey, just
Are you okay
I am
Hi,
Wow, that
Hey there! Can
Okay, stay
Oh great, just
Yeah,
Hello? Can you
Oh, looks
Thank you for sharing
I'm glad
Hey, is that
Hmm
It was my
It sounds like you
Wow, your
I was promised certain
That was such a
Thank
Excuse you
That was
Hey team,
I feel un
It was
What'
Hey friend, I
How
Saying goodbye
That
It's heart
How dare
Oh,
Hello, may
What's this
Thank you for recogn
Aww, that
Oh, I remember
Hmm, that'
I miss
I know this
Wait
Is everything okay
Who is that person
Wow, you
Oh great
I'm sad
Wow, the
I am very disappoint
Who turned off the
I understand that things
I'm very
Hi
That's very
Okay, I
Oh no,
Wow, there
What's wrong
I apologize for
Hey, I
Can I help you
Oh, I didn
Alright,
Oh wow,
Oh my goodness
I know this event
What in the
Saying
Yeah, that
Guys, I
Hey, this v
This post
Are
Hey, can
Hello? Is
I can only imagine
Oh, that sounds
Hey, is anyone
I am disappointed
Hello,
Hey everyone, I
That was such
It's okay
The artist
Whoa
I understand that mistakes
Can I help
Who
Hi everyone! I
Hey, can you
Wow, how
Today
Oh no, I
Oh well, I
Well, that
This is the
Yes! I finally
Hey there little
Hello everyone!
Love is never
Look at the
This postcard
Oh great,
Can I
Hmm, this is
I understand your
Oh, look at
B
I'm so
Whoa, this
W
Oh, this
Sometimes
This piece of
What the
That was a
Hey, do
Oh no
Whoa, what
I feel like I
The documentary
Hello
Hello little one
I understand that my
Eww, that
Wow, an
Yes! Finally,
Although the physical location
Whoever is watching
That movie
I remember wondering about
Hey there, little
Who's
Hello, who
Hello everyone! Thank
Hello, can
That's too
Hey, just wanted
Hey there, I
Saying good
Hey there!
Who is there?
Oh my good
I am very
Oh no, what
Wow, thank
I was promised
Hi, is
Hey, I'
Guys, the
Oh no, that
Who is there
Hello, this
That movie really touched
If you have something
The documentary was
I'm starting
Are you kidd
That movie really
Hey everyone,
Thank you for considering
I didn'
Yes! I
Can you
Oh my god
Hey, whoever
That melody really
Thank you, little
Hello, may I
Look
Wow, we
It looks
What do these
Oh wow
I apologize
What are you all
It's such
It's clear
Hey, I was
Hey friend,
I can only
The weather outside is
Eww, this
I miss you
Wow
Aww,
Hi, is there
This artwork
Okay,
Oh well,
This
I'
Say
Hey there little gu
Hmm,
Whoa, who
I am thr
Oh man
Okay, stay calm
I'm happy
Oh, this cur
Oh man,
I'm sorry
Hello? Who
What?! That
This piece
Hey everyone
That's so
Are you okay?
What happened? Where
Hi there
The
Who the hell entered
I can
Guys,
What's
What in
It's important
I'm
I'm coming
It'
Yes! Finally
Wait, what
Wow, reading
I'm surprised
Hey, did
Hey,
Okay, let
I understand that you
Who the hell threw
Eww, who
Thank you for thinking
Who is this?\"
I am deeply
Thank you for including
Oh no, an
It looks like you
Aww
I'm confused
Wow, it
That poem really
Yes
Hey there, is
Hey, what'
Thank you for remember
To
This is
Thank you for making
I can'
That mel
Wow, they
I feel like
Although the
Who are you
Love
If
What the hell are
I am so sad
Oh, I found
Thank you
It looks like
Well, life is
I appreciate that
The artist's
Whoa, that
It's never

View File

@@ -1,503 +0,0 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"
#include "mean.hpp"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>
//////////////////////////////////////////////////
// utils
template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
ret += common_token_to_piece(ctx, *begin);
}
return ret;
}
static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
printf("\n");
}
//////////////////////////////////////////////////
// cb_eval is reused for each pair of positive - negative prompt
struct callback_data {
ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
int n_layers = 0;
int n_tokens = 0;
bool is_eval_pos = true;
// each element of the vector corresponds to one layer
std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
void save_tensor_for_layer(struct ggml_tensor * t) {
GGML_ASSERT(t->type == GGML_TYPE_F32);
if (ctx_ggml == nullptr) {
// alloc a new ctx_ggml if needed
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
}
// copy tensor data
auto n_bytes = ggml_nbytes(t);
struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
ggml_set_name(t_layer, ggml_get_name(t));
//print_debug_tensor(t_layer);
if (is_eval_pos) {
v_pos.push_back(t_layer);
} else {
v_neg.push_back(t_layer);
}
}
// calculate diff (v_pos - v_neg) and place the result back to v_pos
// all zero rows in the diff tensor will also be removed
// NOTE: final layer is ignored. we only have (n_layers - 1) to process
std::vector<struct ggml_tensor *> calc_diff() {
for (size_t il = 0; il < v_pos.size(); il++) {
float * a = (float *) v_pos[il]->data;
float * b = (float *) v_neg[il]->data;
size_t n_elem = ggml_nelements(v_pos[il]);
for (size_t j = 0; j < n_elem; j++) {
a[j] -= b[j];
}
//print_debug_tensor(v_pos[i]);
auto diff_filtered = filter_nonzero_rows(v_pos[il]);
v_diff_filtered.push_back(diff_filtered);
}
return v_diff_filtered; // for convenience, return the result vector
}
// delete zero rows from a given 2D tensor
struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
//printf("filter_nonzero_rows\n");
auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
// check whether the given row contains only zero elements
int n_cols = t->ne[0]; // hint: should be equal to n_embd
for (int col = 0; col < n_cols; ++col) {
if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
return false;
}
}
return true;
};
std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
for (int i_row = 0; i_row < a->ne[1]; i_row++) {
if (!is_row_all_zeros(a, i_row, 1e-6)) {
rows_to_copy.push_back(i_row);
}
}
// get "n_nonzero_rows" for the output "diff_filtered"
int n_nonzero_rows = rows_to_copy.size();
//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
int n_embd = a->ne[0];
GGML_ASSERT(n_nonzero_rows > 0);
// diff_filtered: [n_embd, n_nonzero_rows]
struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
// copy non-zero rows
for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
int src_row = rows_to_copy[dest_row];
for (int i = 0; i < n_embd; i++) {
float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
}
}
//print_debug_tensor(diff_filtered);
return diff_filtered;
}
// we don't implement a destructor because we want to reuse callback_data; reset() just frees the tensor data
void reset() {
for (auto ptr : v_pos) free(ptr->data);
for (auto ptr : v_neg) free(ptr->data);
for (auto ptr : v_diff_filtered) free(ptr->data);
v_pos.clear();
v_neg.clear();
v_diff_filtered.clear();
if (ctx_ggml) {
ggml_free(ctx_ggml);
}
ctx_ggml = nullptr;
}
};
/**
* train_context is used to store the ggml context for pre/post-processing the diff vectors
* in short, input => v_diff and output => v_final
*/
struct train_context {
ggml_context * ctx_ggml;
int n_embd;
int n_layers;
/* pair of prompts to be used for generating final vector */
std::vector<std::string> positive_entries;
std::vector<std::string> negative_entries;
// each element of the vector corresponds to one layer
// NOTE: the last layer is discarded; therefore, we will have (n_layers - 1) elements here
// NOTE (2): v_diff is transposed from v_diff_tmp
std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
// to easily re-alloc when concatenating v_diff, we temporarily store v_diff in a vector instead of a tensor
// v_diff_tmp will get converted into v_diff later on
std::vector<std::vector<uint8_t>> v_diff_tmp;
train_context(int n_embd_, int n_layers_) {
n_embd = n_embd_;
n_layers = n_layers_;
struct ggml_init_params params_ggml = {
/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_ggml = ggml_init(params_ggml);
for (int il = 0; il < n_layers - 1; il++) {
std::vector<uint8_t> empty;
v_diff_tmp.push_back(empty);
auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
v_final.push_back(t);
}
}
// add new rows into existing tensor in v_diff_tmp
void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
for (int il = 0; il < n_layers - 1; il++) {
auto t = diff_filtered[il];
auto & diff_tmp = v_diff_tmp[il];
size_t curr_size = diff_tmp.size();
diff_tmp.resize(curr_size + ggml_nbytes(t));
memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
}
}
// build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
void build_v_diff(bool transpose) {
printf("build_v_diff\n");
for (int il = 0; il < n_layers - 1; il++) {
auto & diff_tmp = v_diff_tmp[il];
int n_elem = diff_tmp.size() / sizeof(float);
GGML_ASSERT(n_elem % n_embd == 0);
int n_rows = n_elem / n_embd;
struct ggml_tensor * diff = transpose
? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
if (transpose) {
// copy data & transpose
float * arr = (float *) diff_tmp.data();
for (int ir = 0; ir < n_rows; ++ir) {
for (int ic = 0; ic < n_embd; ++ic) {
float f = arr[ir*n_embd + ic];
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
}
}
} else {
// only copy
memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
}
v_diff.push_back(diff);
print_debug_tensor(diff);
// free memory of diff_tmp
diff_tmp.resize(0);
}
}
~train_context() {
for (auto ptr : v_final) free(ptr->data);
for (auto ptr : v_diff) free(ptr->data);
// no need to free v_diff_tmp, since we didn't use malloc
ggml_free(ctx_ggml);
}
};
struct tokenized_prompt {
std::vector<llama_token> tokens_pos;
std::vector<llama_token> tokens_neg;
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
tokens_pos = common_tokenize(ctx, pos, add_bos, true);
tokens_neg = common_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len);
}
void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
// TODO: customize padding token
std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
llama_token pad_tok = pad_tokens.back();
while (tokens.size() < len) {
tokens.push_back(pad_tok);
}
}
};
//////////////////////////////////////////////////
template <typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
std::vector<std::string> output;
std::ifstream file(path);
if (!file.is_open()) {
fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
exit(1);
}
std::string line;
while (std::getline(file, line)) {
bool is_skip = skip_empty_lines && line.empty();
if (!is_skip) {
string_process_escapes(line);
output.push_back(line);
}
}
file.close();
return output;
}
//////////////////////////////////////////////////
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
static const char * l_out_name = "l_out";
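// the per-layer output tensors in the graph are named with an "l_out" prefix, so a prefix match selects them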
const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
if (ask) {
return is_l_out;
}
if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
return true;
}
// save the tensor to current context
cb_data->save_tensor_for_layer(t);
return true;
}
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_kv_cache_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
return true;
}
static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
struct gguf_context * ctx = gguf_init_empty();
const std::string arch = "controlvector";
gguf_set_val_str(ctx, "general.architecture", arch.c_str());
gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
for (size_t i = 0; i < v_ctrl.size(); ++i) {
gguf_add_tensor(ctx, v_ctrl[i]);
print_debug_tensor(v_ctrl[i]);
printf("Added tensor: %s\n", v_ctrl[i]->name);
}
printf("%s: writing file...\n", __func__);
gguf_write_to_file(ctx, fname.c_str(), false);
printf("%s: wrote file '%s'\n", __func__, fname.c_str());
gguf_free(ctx);
}
/**
* Load prompt files and completion file.
* Then format each pair of prompt + completion to make an entry.
*/
static int prepare_entries(common_params & params, train_context & ctx_train) {
// load prompts
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
if (positive_prompts.size() != negative_prompts.size()) {
fprintf(stderr, "number of positive and negative prompts must be equal\n");
return 1;
}
if (positive_prompts.empty()) {
fprintf(stderr, "must provide at least one prompt pair\n");
return 1;
}
ctx_train.positive_entries = positive_prompts;
ctx_train.negative_entries = negative_prompts;
return 0;
}
int main(int argc, char ** argv) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}
if (params.n_pca_iterations % params.n_pca_batch != 0) {
fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
return 1;
}
callback_data cb_data;
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = cb_eval;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
print_build_info();
llama_backend_init();
llama_numa_init(params.numa);
// load the model to get hparams
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model);
int n_embd = llama_n_embd(model);
// get model hint param (a.k.a model arch name)
char model_hint[128];
llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
// init train_context
train_context ctx_train(n_embd, n_layers);
// load and prepare entries for training
prepare_entries(params, ctx_train);
// we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
std::vector<tokenized_prompt> tokenized_prompts;
size_t n_total_tokens = 0;
for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
n_total_tokens += 2 * t.max_seq_len;
tokenized_prompts.push_back(std::move(t));
}
std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
bool success = false;
tokenized_prompt t = tokenized_prompts[i];
cb_data.n_layers = n_layers;
cb_data.n_tokens = t.max_seq_len;
printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
(int) i+1, (int) ctx_train.positive_entries.size(),
tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
(int) t.max_seq_len);
cb_data.is_eval_pos = true;
success = get_hidden_layers(ctx, t.tokens_pos);
if (!success) break;
cb_data.is_eval_pos = false;
success = get_hidden_layers(ctx, t.tokens_neg);
if (!success) break;
// calculate diff and remove all zero rows
auto v_diff_filtered = cb_data.calc_diff();
// save & concat the filtered v_diff to ctx_train
ctx_train.concat_diff_tmp(v_diff_filtered);
// reset for next iteration
cb_data.reset();
}
// done with the model, we can now free it to reclaim some memory
printf("Done evaluating prompts, unloading model...\n");
llama_free(ctx);
llama_free_model(model);
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
// prepare ctx_train for PCA
ctx_train.build_v_diff(use_pca);
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.cpuparams.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {
// run mean
mean::run(ctx_train.v_diff, ctx_train.v_final);
}
// write output vectors to gguf
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
llama_backend_free();
return 0;
}

View File

@@ -1,48 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include <string>
#include <vector>
#include <math.h>
namespace mean {
static void run(
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running mean...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// calculate mean vector
struct ggml_tensor * t_layer = v_input[il];
GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
for (int ic = 0; ic < t_layer->ne[0]; ic++) {
float f = 0.0;
for (int ir = 0; ir < t_layer->ne[1]; ir++) {
f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
}
f /= t_layer->ne[1];
ggml_set_f32_1d(ctrl_out, ic, f);
}
// normalize output vector
float norm = 0.0;
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
norm += f*f;
}
norm = sqrt(norm);
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
float f = ggml_get_f32_1d(ctrl_out, i);
ggml_set_f32_1d(ctrl_out, i, f / norm);
}
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}

View File

@@ -1,4 +0,0 @@
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow

View File

@@ -1,315 +0,0 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cstdio>
#include <ctime>
#include <random>
#include <string>
#include <vector>
#define DEBUG_POS 5
static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
if (!with_data) return;
printf("%s: %s[0] = [", __func__, t->name);
for (size_t i = 0; i <= DEBUG_POS; i++) {
printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
}
printf(" ... ]\n");
}
namespace PCA {
// input params for PCA computations
struct pca_params {
int n_threads = 1;
int n_batch = 20; // number of iterations to do in one batch; the larger the batch, the more memory is used
int n_iterations = 1000;
float tolerance = 1e-7;
// for debugging
int i_layer = 0;
int n_layers = 0;
};
// result from each iteration
struct pca_result {
struct ggml_tensor * calculated_square = NULL;
std::vector<struct ggml_tensor *> eigenvectors;
std::vector<float> distances;
};
struct pca_model {
ggml_backend_t backend = NULL;
ggml_backend_buffer_t buffer;
struct ggml_context * ctx; // context to compute graph on target device
struct ggml_context * ctx_host; // host context to store results
// tensors on target device
struct ggml_tensor * dev_input;
struct ggml_tensor * dev_square;
struct ggml_tensor * dev_eigenvector;
pca_model(struct ggml_tensor * t_input) {
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
// TODO: enable Metal support when support for GGML_OP_SQRT is added
// #ifdef GGML_USE_METAL
// fprintf(stderr, "%s: using Metal backend\n", __func__);
// backend = ggml_backend_metal_init();
// if (!backend) {
// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
// }
// #endif
// if no GPU backend is available, fall back to the CPU backend
if (!backend) {
backend = ggml_backend_cpu_init();
}
const int num_tensors = 4;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx = ggml_init(params);
auto n_samples = t_input->ne[0];
auto n_embd = t_input->ne[1];
dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
ggml_set_name(dev_input, "dev_input");
ggml_set_name(dev_square, "dev_square");
ggml_set_name(dev_eigenvector, "dev_eigenvector");
buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
// initialize eigenvector to random normalized vector
{
std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
std::uniform_real_distribution<float> distribution(0.0, 1.0);
float sum_sqr = 0.0; // for normalizing random_vec
for (size_t i = 0; i < random_vec.size(); ++i) {
float f = distribution(generator);
sum_sqr += f * f;
random_vec[i] = f;
}
// normalize it
float random_vec_norm = std::sqrt(sum_sqr);
for (size_t i = 0; i < random_vec.size(); ++i) {
random_vec[i] /= random_vec_norm;
}
ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
}
}
~pca_model() {
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
ggml_backend_free(backend);
}
};
static struct ggml_cgraph * build_graph_piter(
const struct pca_params & params,
const pca_model & model,
bool calc_square = false) {
GGML_ASSERT(params.n_batch > 0);
// TODO: buf_size must be able to scale with params.n_batch
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// create a temporary context to build the graph
struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// turn v_diff_original into square matrix if needed
struct ggml_tensor * tmp_square;
if (calc_square) {
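// dev_input holds the diff rows ([n_samples, n_embd]); multiplying it with itself yields an
// [n_embd, n_embd] matrix (effectively an uncentered covariance of the diffs) whose dominant
// eigenvector is the direction we are after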
tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
ggml_set_name(tmp_square, "tmp_square");
}
struct ggml_tensor * b_tensor;
struct ggml_tensor * distance;
struct ggml_tensor * old_eigen = model.dev_eigenvector;
struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
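// classic power iteration: b_{k+1} = A * b_k / ||A * b_k||, which converges to the dominant
// eigenvector of A; the loop below unrolls n_batch such updates into a single graph and also
// records the distance between consecutive eigenvector estimates for convergence checks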
for (int i = 0; i < params.n_batch; ++i) {
// b_tensor = square * eigenvector^T
b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
ggml_set_name(b_tensor, "b_tensor");
// normalize
b_tensor = ggml_div_inplace(ctx0,
b_tensor,
ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
);
ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
// calculate distance(new eigenvector - old eigenvector)
// we don't use ggml_sub because it may not be implemented on GPU backend
struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
distance = ggml_sqrt_inplace(ctx0,
ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
ggml_format_name(distance, "distance_%d", i);
old_eigen = b_tensor;
// build operations nodes
ggml_build_forward_expand(gf, distance);
}
// delete the temporary context used to build the graph
ggml_free(ctx0);
return gf;
}
static ggml_status compute_piter(
const struct pca_params & params,
const pca_model & model,
struct ggml_cgraph * gf,
ggml_gallocr_t allocr,
struct pca_result & result) {
// allocate tensors
ggml_gallocr_alloc_graph(allocr, gf);
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
}
ggml_status res = ggml_backend_graph_compute(model.backend, gf);
if (res == GGML_STATUS_SUCCESS) {
auto extract_i = [](std::string prefix, std::string str) -> int {
int i = -1;
if (str.rfind(prefix, 0) == 0) {
sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
}
return i;
};
result.calculated_square = NULL;
result.eigenvectors.clear();
result.distances.clear();
result.eigenvectors.resize(params.n_batch);
result.distances.resize(params.n_batch);
// get output nodes
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
auto node = ggml_graph_node(gf, i);
int iter = -1;
// find b_tensor (without copying data from device)
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
result.eigenvectors[iter] = node;
}
// find distances, then copy data from device
if ((iter = extract_i("distance_", node->name)) > -1) {
float d;
ggml_backend_tensor_get(node, &d, 0, sizeof(float));
result.distances[iter] = d;
// std::cout << node->name << " = " << d << "\n";
}
// find tmp_square if it exists (without copying data from device)
if (std::string(node->name) == "tmp_square") {
result.calculated_square = node;
}
}
}
return res;
}
static void power_iteration(
const struct pca_params & params,
struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
struct ggml_tensor * output) {
//printf("in power iteration\n");
struct pca_model model(input);
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
struct pca_result result;
struct ggml_tensor * last_eigenvector = NULL;
int n_iters = params.n_iterations / params.n_batch; // larger batches mean fewer iterations
for (int iter = 0; iter < n_iters; ++iter) {
bool calc_square = (iter == 0); // only need to calculate square for first iteration
struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
// ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
compute_piter(params, model, gf, allocr, result);
for (size_t k = 0; k < result.distances.size(); ++k) {
last_eigenvector = result.eigenvectors[k];
if (result.distances[k] < params.tolerance) {
break; // done
}
}
if (calc_square) {
// copy and store the square matrix if needed
GGML_ASSERT(result.calculated_square != NULL);
ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
}
{
// copy the last eigenvector and store it as input for the next iteration
GGML_ASSERT(last_eigenvector != NULL);
ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
}
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
__func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
}
// get output tensor
GGML_ASSERT(last_eigenvector);
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
//print_debug_tensor(output);
ggml_gallocr_free(allocr);
// TODO @ngxson : The output vector is randomly inverted
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
}
static void run_pca(
struct pca_params & params,
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
const std::vector<struct ggml_tensor *> & v_output) {
printf("%s: Running PCA...\n", __func__);
for (size_t il = 0; il < v_input.size(); ++il) {
// prepare output vector
struct ggml_tensor * ctrl_out = v_output[il];
ggml_format_name(ctrl_out, "direction.%ld", il+1);
// run power_iteration
params.i_layer = il;
params.n_layers = v_input.size();
power_iteration(params, v_input[il], ctrl_out);
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
}
}
}

View File

@@ -1,4 +0,0 @@
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!

View File

@@ -1,35 +0,0 @@
// Warns users that this filename was deprecated, and provides a link for more information.
#include <cstdio>
#include <string>
#include <unordered_map>
// Main
int main(int argc, char** argv) {
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];
}
// Get only the program name from the full path
auto pos = filename.find_last_of('/');
if (pos != std::string::npos) {
filename = filename.substr(pos+1);
}
// Append "llama-" to the beginning of filename to get the replacemnt filename
auto replacement_filename = "llama-" + filename;
// The exception is if the filename is "main", then our replacement filename is "llama-cli"
if (filename == "main") {
replacement_filename = "llama-cli";
}
fprintf(stdout, "\n");
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, "\n");
return EXIT_FAILURE;
}

View File

@@ -1,193 +0,0 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <string>
#include <vector>
/**
* This is the arbitrary data which will be passed to each callback.
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
*/
struct callback_data {
std::vector<uint8_t> data;
};
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
LOG(" ..., \n");
i2 = ne[2] - n;
}
LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
LOG(" ..., \n");
i1 = ne[1] - n;
}
LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
LOG("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ABORT("fatal error");
}
LOG("%12.4f", v);
sum += v;
if (i0 < ne[0] - 1) LOG(", ");
}
LOG("],\n");
}
LOG(" ],\n");
}
LOG(" ]\n");
LOG(" sum = %f\n", sum);
}
}
/**
* GGML operations callback during the graph execution.
*
* @param t current tensor
* @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
* if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
* see ggml_backend_sched_eval_callback
* @param user_data user data to pass at each call back
* @return true to receive data or continue the graph, false otherwise
*/
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (callback_data *) user_data;
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
if (ask) {
return true; // Always retrieve data
}
char src1_str[128] = {0};
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, ggml_ne_string(src0).c_str(),
src1 ? src1_str : "",
ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
const bool is_host = ggml_backend_buffer_is_host(t->buffer);
if (!is_host) {
auto n_bytes = ggml_nbytes(t);
cb_data->data.resize(n_bytes);
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
}
if (!ggml_is_quantized(t->type)) {
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
}
return true;
}
static bool run(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
return true;
}
int main(int argc, char ** argv) {
callback_data cb_data;
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = ggml_debug;
params.cb_eval_user_data = &cb_data;
params.warmup = false;
// init
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
// print system information
{
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
}
bool OK = run(ctx, params);
if (!OK) {
return 1;
}
LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}

View File

@@ -1,226 +0,0 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include <string>
#include <vector>
// #define GRIT_DEBUG
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
std::vector<std::vector<float>> result;
const llama_model * model = llama_get_model(ctx);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
for (uint64_t i = 0; i < sentences.size(); i++) {
common_batch_clear(batch);
const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_token_eos(model));
// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
#ifdef GRIT_DEBUG
// debug tokens - should match those referenced in the GritLM sample
std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
});
std::printf("\n");
#endif
// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, true);
llama_set_causal_attn(ctx, false);
// run model
llama_decode(ctx, batch);
// get embedding dimensions
uint64_t n_embd = llama_n_embd(model);
// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
// sum up all token embeddings
for (int32_t k = n_inst; k < n_toks; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] += emb[j];
}
}
// divide by number of tokens (mean pooling)
{
const uint64_t n_sent = n_toks - n_inst;
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] /= n_sent;
}
}
std::vector<float> emb_norm(emb_unorm.size());
common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
result.push_back(emb_norm);
#ifdef GRIT_DEBUG
// print out emb_norm
std::printf("embedding %ld: ", i);
for (uint64_t j = 0; j < n_embd; j++) {
std::printf("%.5f ", emb_norm[j]);
}
std::printf("\n\n");
#endif
}
llama_batch_free(batch);
return result;
}
static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
std::string result;
const llama_model * model = llama_get_model(ctx);
llama_token eos_token = llama_token_eos(model);
llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
int32_t i_current_token = 0;
while (true) {
common_batch_clear(bat);
{
const int32_t n_inputs = inputs.size();
for (int32_t i = 0; i < n_inputs; i++) {
common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
}
inputs.clear();
llama_decode(ctx, bat);
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
if (token == eos_token) {
break;
}
std::string piece = common_token_to_piece(ctx, token);
if (stream) {
std::printf("%s", piece.c_str());
std::fflush(stdout);
}
inputs.push_back(token);
result += piece;
}
if (stream) {
std::printf("\n");
}
llama_batch_free(bat);
return result;
}
static std::string gritlm_instruction(const std::string & instruction) {
return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
}
int main(int argc, char * argv[]) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_model_params mparams = common_model_params_to_llama(params);
llama_context_params cparams = common_context_params_to_llama(params);
llama_backend_init();
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
// create generation context
llama_context * ctx = llama_new_context_with_model(model, cparams);
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// ### Embedding/Representation ###
// samples taken from: https://github.com/ContextualAI/gritlm#basic
{
const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
const std::vector<std::string> queries = {
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
};
const std::vector<std::string> documents = {
"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
};
// No need to add instruction for retrieval documents
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_n_embd(model);
const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
}
// ### Generation ###
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
std::string response = generate(ctx, smpl, prompt, true);
}
llama_sampler_free(smpl);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}
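For orientation: the embedding path in encode() above is instruction-masked mean pooling followed by L2 normalization, and main() then scores query/document pairs with cosine similarity, which is a plain dot product once the vectors are unit length. A minimal Python sketch of that math, assuming per-token embeddings are already available as lists of floats (the helper names here are illustrative, not part of the example):
import math
def mean_pool(token_embeddings, n_inst):
    # average only the sentence tokens, skipping the first n_inst instruction tokens
    n_sent = len(token_embeddings) - n_inst
    pooled = [sum(col) / n_sent for col in zip(*token_embeddings[n_inst:])]
    # L2-normalize so that cosine similarity reduces to a dot product
    norm = math.sqrt(sum(x * x for x in pooled)) or 1.0
    return [x / norm for x in pooled]
def cosine(a, b):
    return sum(x * y for x, y in zip(a, b))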

View File

@@ -1,58 +0,0 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import os
import csv
labels = []
numbers = []
numEntries = 1
rows = []
def bar_chart(numbers, labels, pos):
plt.bar(pos, numbers, color='blue')
plt.xticks(ticks=pos, labels=labels)
plt.title("Jeopardy Results by Model")
plt.xlabel("Model")
plt.ylabel("Questions Correct")
plt.show()
def calculatecorrect():
directory = os.fsencode("./examples/jeopardy/results/")
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
for row in csv_reader:
global rows
rows.append(row)
for listing in os.listdir(directory):
filename = os.fsdecode(listing)
if filename.endswith(".txt"):
file = open("./examples/jeopardy/results/" + filename, "rt")
global labels
global numEntries
global numbers
labels.append(filename[:-4])
numEntries += 1
i = 1
totalcorrect = 0
for line in file.readlines():
if line.strip() != "------":
print(line)
else:
print("Correct answer: " + rows[i][2] + "\n")
i += 1
print("Did the AI get the question right? (y/n)")
if input() == "y":
totalcorrect += 1
numbers.append(totalcorrect)
if __name__ == '__main__':
calculatecorrect()
pos = list(range(numEntries))
labels.append("Human")
numbers.append(48.11)
bar_chart(numbers, labels, pos)
print(labels)
print(numbers)

View File

@@ -1,30 +0,0 @@
#!/bin/bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
MODEL_NAME=Vicuna
# exec options
prefix="Human: " # Ex. Vicuna uses "Human: "
opts="--temp 0 -n 80" # additional flags
nl='
'
introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
# file options
question_file=./examples/jeopardy/questions.txt
touch ./examples/jeopardy/results/$MODEL_NAME.txt
output_file=./examples/jeopardy/results/$MODEL_NAME.txt
counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
echo -e "\n------" >> $output_file
counter=$((counter+1))
done < "$question_file"

View File

@@ -1,103 +0,0 @@
Index,Original Category,Original Correct Question,Model Prompt
1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
3,Writers' Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
10,Movies of the 80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
28,Geographic Name's the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
33,Laws in U.S. History,What is the Civil Rights Act?,"A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?"
34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?"
38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory's largest ethnic group?"
39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
44,Art & Science,What is Halley's Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
51,World War II,What is Schindler's list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
52,Mythology,What is the Golden Fleece?,"Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?"
53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
54,U.S. State Names,What are Oregon & Nevada?,"5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?"
55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
70,USA,What is Jack Daniel's?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
79,Children's Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
83,Children's Lit,What is Charlotte's Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
92,Name's The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
100,Children's Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
,,,
TOTALS,,,

View File

@@ -1,100 +0,0 @@
Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory's largest ethnic group?
In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
In 2010 who introduced the 4-point shot, 35 feet from the basket?
Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
Like Sir Thomas More, 3 16th century English queens are buried at what British location?
In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?
The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
What was first sold in 1908, at a price equivalent to about $27,000 today?
The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
Until a 1903 secession, what country's contiguous territory spanned 2 continents?
Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

View File

@@ -1,82 +0,0 @@
# Usage:
#! ./llama-server -m some-model.gguf &
#! pip install pydantic
#! python json_schema_pydantic_example.py
from pydantic import BaseModel, Field, TypeAdapter
from annotated_types import MinLen
from typing import Annotated, List, Optional
import json, requests
if True:
def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs):
'''
Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
(llama.cpp server, llama-cpp-python, Anyscale / Together...)
The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
'''
response_format = None
type_adapter = None
if response_model:
type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema()
messages = [{
"role": "system",
"content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}"
}] + messages
response_format={"type": "json_object", "schema": schema}
data = requests.post(endpoint, headers={"Content-Type": "application/json"},
json=dict(messages=messages, response_format=response_format, **kwargs)).json()
if 'error' in data:
raise Exception(data['error']['message'])
content = data["choices"][0]["message"]["content"]
return type_adapter.validate_json(content) if type_adapter else content
else:
# This alternative branch uses Instructor + OpenAI client lib.
# Instructor support streamed iterable responses, retry & more.
# (see https://python.useinstructor.com/)
#! pip install instructor openai
import instructor, openai
client = instructor.patch(
openai.OpenAI(api_key="123", base_url="http://localhost:8080"),
mode=instructor.Mode.JSON_SCHEMA)
create_completion = client.chat.completions.create
if __name__ == '__main__':
class QAPair(BaseModel):
class Config:
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
question: str
concise_answer: str
justification: str
stars: Annotated[int, Field(ge=1, le=5)]
class PyramidalSummary(BaseModel):
class Config:
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
title: str
summary: str
question_answers: Annotated[List[QAPair], MinLen(2)]
sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]]
print("# Summary\n", create_completion(
model="...",
response_model=PyramidalSummary,
messages=[{
"role": "user",
"content": f"""
You are a highly efficient corporate document summarizer.
Create a pyramidal summary of an imaginary internal document about our company processes
(starting high-level, going down to each sub sections).
Keep questions short, and answers even shorter (trivia / quizz style).
"""
}]))

View File

@@ -1,811 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import itertools
import json
import re
import sys
from typing import Any, List, Optional, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
if not separator_rule:
if min_items == 1 and max_items is None:
return f'{item_rule}+'
elif min_items == 0 and max_items is None:
return f'{item_rule}*'
else:
return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
return f'({result})?' if min_items == 0 else result
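# As I read the branches above, _build_repetition turns (min_items, max_items,
# separator_rule) into the GBNF repetition snippets used throughout this script.
# A few hand-traced examples (illustrative; worth re-checking against the code):
#   _build_repetition('[0-9]', 0, 1)                           -> '[0-9]?'
#   _build_repetition('[0-9]', 1, None)                        -> '[0-9]+'
#   _build_repetition('[0-9]', 2, 4)                           -> '[0-9]{2,4}'
#   _build_repetition('item', 0, None, separator_rule='","')   -> '(item ("," item)*)?'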
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
has_min = min_value != None
has_max = max_value != None
def digit_range(from_char: str, to_char: str):
out.append("[")
if from_char == to_char:
out.append(from_char)
else:
out.append(from_char)
out.append("-")
out.append(to_char)
out.append("]")
def more_digits(min_digits: int, max_digits: int):
out.append("[0-9]")
if min_digits == max_digits and min_digits == 1:
return
out.append("{")
out.append(str(min_digits))
if max_digits != min_digits:
out.append(",")
if max_digits != sys.maxsize:
out.append(str(max_digits))
out.append("}")
def uniform_range(from_str: str, to_str: str):
i = 0
while i < len(from_str) and from_str[i] == to_str[i]:
i += 1
if i > 0:
out.append("\"")
out.append(from_str[:i])
out.append("\"")
if i < len(from_str):
if i > 0:
out.append(" ")
sub_len = len(from_str) - i - 1
if sub_len > 0:
from_sub = from_str[i+1:]
to_sub = to_str[i+1:]
sub_zeros = "0" * sub_len
sub_nines = "9" * sub_len
to_reached = False
out.append("(")
if from_sub == sub_zeros:
digit_range(from_str[i], chr(ord(to_str[i]) - 1))
out.append(" ")
more_digits(sub_len, sub_len)
else:
out.append("[")
out.append(from_str[i])
out.append("] ")
out.append("(")
uniform_range(from_sub, sub_nines)
out.append(")")
if ord(from_str[i]) < ord(to_str[i]) - 1:
out.append(" | ")
if to_sub == sub_nines:
digit_range(chr(ord(from_str[i]) + 1), to_str[i])
to_reached = True
else:
digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
out.append(" ")
more_digits(sub_len, sub_len)
if not to_reached:
out.append(" | ")
digit_range(to_str[i], to_str[i])
out.append(" ")
uniform_range(sub_zeros, to_sub)
out.append(")")
else:
out.append("[")
out.append(from_str[i])
out.append("-")
out.append(to_str[i])
out.append("]")
if has_min and has_max:
if min_value < 0 and max_value < 0:
out.append("\"-\" (")
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
out.append(")")
return
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
out.append(") | ")
min_value = 0
min_s = str(min_value)
max_s = str(max_value)
min_digits = len(min_s)
max_digits = len(max_s)
for digits in range(min_digits, max_digits):
uniform_range(min_s, "9" * digits)
min_s = "1" + "0" * digits
out.append(" | ")
uniform_range(min_s, max_s)
return
less_decimals = max(decimals_left - 1, 1)
if has_min:
if min_value < 0:
out.append("\"-\" (")
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
out.append(") | [0] | [1-9] ")
more_digits(0, decimals_left - 1)
elif min_value == 0:
if top_level:
out.append("[0] | [1-9] ")
more_digits(0, less_decimals)
else:
more_digits(1, decimals_left)
elif min_value <= 9:
c = str(min_value)
range_start = '1' if top_level else '0'
if c > range_start:
digit_range(range_start, chr(ord(c) - 1))
out.append(" ")
more_digits(1, less_decimals)
out.append(" | ")
digit_range(c, "9")
out.append(" ")
more_digits(0, less_decimals)
else:
min_s = str(min_value)
length = len(min_s)
c = min_s[0]
if c > "1":
digit_range("1" if top_level else "0", chr(ord(c) - 1))
out.append(" ")
more_digits(length, less_decimals)
out.append(" | ")
digit_range(c, c)
out.append(" (")
_generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
out.append(")")
if c < "9":
out.append(" | ")
digit_range(chr(ord(c) + 1), "9")
out.append(" ")
more_digits(length - 1, less_decimals)
return
if has_max:
if max_value >= 0:
if top_level:
out.append("\"-\" [1-9] ")
more_digits(0, less_decimals)
out.append(" | ")
_generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
else:
out.append("\"-\" (")
_generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
out.append(")")
return
raise RuntimeError("At least one of min_value or max_value must be set")
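# Illustrative examples (added for clarity; not part of the original script) of the
# character-level GBNF fragments that _generate_min_max_int writes into `out`:
#   out = []; _generate_min_max_int(0, None, out); ''.join(out)  -> '[0] | [1-9] [0-9]{0,15}'
#   out = []; _generate_min_max_int(1, 3, out);    ''.join(out)  -> '[1-3]'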
class BuiltinRule:
def __init__(self, content: str, deps: list | None = None):
self.content = content
self.deps = deps or []
# Constraining spaces to prevent model "running away".
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
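# For reference (added; not part of the original script): once one of these primitives is
# registered via _add_primitive, format_grammar() below emits it as a plain GBNF line, e.g.
#   boolean ::= ("true" | "false") space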
# TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
}
DOTALL = '[\\U00000000-\\U0010FFFF]'
DOT = '[^\\x0A\\x0D]'
RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
class SchemaConverter:
def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
self._prop_order = prop_order
self._allow_fetch = allow_fetch
self._dotall = dotall
self._raw_pattern = raw_pattern
self._rules = {
'space': SPACE_RULE,
}
self._refs = {}
self._refs_being_resolved = set()
def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
)
return f'"{escaped}"'
def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
'''
not_literal('a') -> '[^a]'
not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
'''
assert len(literal) > 0, 'Empty literal not supported'
def recurse(i: int):
c = literal[i]
if maybe_escaped_underscores and c == '_':
yield f'[^{c}\\\\]'
yield ' | '
yield f'"\\\\"? "{c}"'
else:
yield f'[^{c}]'
if i < len(literal) - 1:
yield ' | '
yield self._format_literal(c)
yield ' ('
yield from recurse(i + 1)
yield ')?'
return ''.join(('(', *recurse(0), ')'))
def _not_strings(self, strings):
class TrieNode:
def __init__(self):
self.children = {}
self.is_end_of_string = False
def insert(self, string):
node = self
for c in string:
node = node.children.setdefault(c, TrieNode())
node.is_end_of_string = True
trie = TrieNode()
for s in strings:
trie.insert(s)
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
out = ['["] ( ']
def visit(node):
rejects = []
first = True
for c in sorted(node.children.keys()):
child = node.children[c]
rejects.append(c)
if first:
first = False
else:
out.append(' | ')
out.append(f'[{c}]')
if child.children:
out.append(f' (')
visit(child)
out.append(')')
elif child.is_end_of_string:
out.append(f' {char_rule}+')
if node.children:
if not first:
out.append(' | ')
out.append(f'[^"{"".join(rejects)}] {char_rule}*')
visit(trie)
out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
return ''.join(out)
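# Illustrative example (added for clarity; not part of the original script): for the declared
# property names ['a'], _not_strings yields a rule matching any JSON string except "a":
#   self._not_strings(['a'])  -> '["] ( [a] char+ | [^"a] char* )? ["] space'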
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
key = esc_name
else:
i = 0
while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule:
i += 1
key = f'{esc_name}{i}'
self._rules[key] = rule
return key
def resolve_refs(self, schema: dict, url: str):
'''
Resolves all $ref fields in the given schema, fetching any remote schemas,
replacing $ref with absolute reference URL and populating self._refs with the
respective referenced (sub)schema dictionaries.
'''
def visit(n: dict):
if isinstance(n, list):
return [visit(x) for x in n]
elif isinstance(n, dict):
ref = n.get('$ref')
if ref is not None and ref not in self._refs:
if ref.startswith('https://'):
assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch to force)'
import requests
frag_split = ref.split('#')
base_url = frag_split[0]
target = self._refs.get(base_url)
if target is None:
target = self.resolve_refs(requests.get(ref).json(), base_url)
self._refs[base_url] = target
if len(frag_split) == 1 or frag_split[-1] == '':
return target
elif ref.startswith('#/'):
target = schema
ref = f'{url}{ref}'
n['$ref'] = ref
else:
raise ValueError(f'Unsupported ref {ref}')
for sel in ref.split('#')[-1].split('/')[1:]:
assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
target = target[sel]
self._refs[ref] = target
else:
for v in n.values():
visit(v)
return n
return visit(schema)
def _generate_union_rule(self, name, alt_schemas):
return ' | '.join((
self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}')
for i, alt_schema in enumerate(alt_schemas)
))
def _visit_pattern(self, pattern, name):
'''
Transforms a regular expression pattern into a GBNF rule.
Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which
we define sub-rules to keep the output lean.
'''
assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"'
pattern = pattern[1:-1]
sub_rule_ids = {}
i = 0
length = len(pattern)
def to_rule(s: tuple[str, bool]) -> str:
(txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt
def transform() -> tuple[str, bool]:
'''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
'''
nonlocal i
nonlocal pattern
nonlocal sub_rule_ids
start = i
# For each component of this sequence, store its string representation and whether it's a literal.
# We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the end. We parse grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!)
seq: list[tuple[str, bool]] = []
def get_dot():
if self._dotall:
rule = DOTALL
else:
# Accept any character... except \n and \r line break chars (\x0A and \x0D)
rule = DOT
return self._add_rule(f'dot', rule)
def join_seq():
nonlocal seq
ret = []
for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
if is_literal:
ret.append((''.join(x[0] for x in g), True))
else:
ret.extend(g)
if len(ret) == 1:
return ret[0]
return (' '.join(to_rule(x) for x in seq), False)
while i < length:
c = pattern[i]
if c == '.':
seq.append((get_dot(), False))
i += 1
elif c == '(':
i += 1
if i < length:
assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
seq.append((f'({to_rule(transform())})', False))
elif c == ')':
i += 1
assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
return join_seq()
elif c == '[':
square_brackets = c
i += 1
while i < length and pattern[i] != ']':
if pattern[i] == '\\':
square_brackets += pattern[i:i+2]
i += 2
else:
square_brackets += pattern[i]
i += 1
assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}'
square_brackets += ']'
i += 1
seq.append((square_brackets, False))
elif c == '|':
seq.append(('|', False))
i += 1
elif c in ('*', '+', '?'):
seq[-1] = (to_rule(seq[-1]) + c, False)
i += 1
elif c == '{':
curly_brackets = c
i += 1
while i < length and pattern[i] != '}':
curly_brackets += pattern[i]
i += 1
assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}'
curly_brackets += '}'
i += 1
nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
min_times = 0
max_times = None
try:
if len(nums) == 1:
min_times = int(nums[0])
max_times = min_times
else:
assert len(nums) == 2
min_times = int(nums[0]) if nums[0] else 0
max_times = int(nums[1]) if nums[1] else None
except ValueError:
raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')
(sub, sub_is_literal) = seq[-1]
if not sub_is_literal:
id = sub_rule_ids.get(sub)
if id is None:
id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
sub_rule_ids[sub] = id
sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else:
literal = ''
while i < length:
if pattern[i] == '\\' and i < length - 1:
next = pattern[i + 1]
if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
i += 1
literal += pattern[i]
i += 1
else:
literal += pattern[i:i+2]
i += 2
elif pattern[i] == '"' and not self._raw_pattern:
literal += '\\"'
i += 1
elif pattern[i] not in NON_LITERAL_SET and \
(i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
literal += pattern[i]
i += 1
else:
break
if literal:
seq.append((literal, True))
return join_seq()
return self._add_rule(
name,
to_rule(transform()) if self._raw_pattern \
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
def _resolve_ref(self, ref):
ref_name = ref.split('/')[-1]
if ref_name not in self._rules and ref not in self._refs_being_resolved:
self._refs_being_resolved.add(ref)
resolved = self._refs[ref]
ref_name = self.visit(resolved, ref_name)
self._refs_being_resolved.remove(ref)
return ref_name
def _generate_constant_rule(self, value):
return self._format_literal(json.dumps(value))
def visit(self, schema, name):
schema_type = schema.get('type')
schema_format = schema.get('format')
rule_name = name + '-' if name in RESERVED_NAMES else name or 'root'
if (ref := schema.get('$ref')) is not None:
return self._add_rule(rule_name, self._resolve_ref(ref))
elif 'oneOf' in schema or 'anyOf' in schema:
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
elif isinstance(schema_type, list):
return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
elif 'const' in schema:
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
elif 'enum' in schema:
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
return self._add_rule(rule_name, rule)
elif schema_type in (None, 'object') and \
('properties' in schema or \
('additionalProperties' in schema and schema['additionalProperties'] is not True)):
required = set(schema.get('required', []))
properties = list(schema.get('properties', {}).items())
return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
elif schema_type in (None, 'object') and 'allOf' in schema:
required = set()
properties = []
hybrid_name = name
def add_component(comp_schema, is_required):
if (ref := comp_schema.get('$ref')) is not None:
comp_schema = self._refs[ref]
if 'properties' in comp_schema:
for prop_name, prop_schema in comp_schema['properties'].items():
properties.append((prop_name, prop_schema))
if is_required:
required.add(prop_name)
for t in schema['allOf']:
if 'anyOf' in t:
for tt in t['anyOf']:
add_component(tt, is_required=False)
else:
add_component(t, is_required=True)
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
items = schema.get('items') or schema['prefixItems']
if isinstance(items, list):
return self._add_rule(
rule_name,
'"[" space ' +
' "," space '.join(
self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
for i, item in enumerate(items)) +
' "]" space')
else:
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
min_items = schema.get("minItems", 0)
max_items = schema.get("maxItems")
return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
elif schema_type in (None, 'string') and 'pattern' in schema:
return self._visit_pattern(schema['pattern'], rule_name)
elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
return self._add_primitive(
'root' if rule_name == 'root' else schema_format,
PRIMITIVE_RULES['uuid']
)
elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
prim_name = f'{schema_format}-string'
return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
min_len = schema.get('minLength', 0)
max_len = schema.get('maxLength')
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
elif schema_type in (None, 'integer') and \
('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
min_value = None
max_value = None
if 'minimum' in schema:
min_value = schema['minimum']
elif 'exclusiveMinimum' in schema:
min_value = schema['exclusiveMinimum'] + 1
if 'maximum' in schema:
max_value = schema['maximum']
elif 'exclusiveMaximum' in schema:
max_value = schema['exclusiveMaximum'] - 1
out = ["("]
_generate_min_max_int(min_value, max_value, out)
out.append(") space")
return self._add_rule(rule_name, ''.join(out))
elif (schema_type == 'object') or (len(schema) == 0):
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
# TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
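# Illustrative examples (added for clarity; not part of the original script) of rule bodies
# produced by visit() for two small schemas:
#   {"enum": ["red", "green"]}                       -> ("\"red\"" | "\"green\"") space
#   {"type": "integer", "minimum": 1, "maximum": 3}  -> ([1-3]) space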
def _add_primitive(self, name: str, rule: BuiltinRule):
n = self._add_rule(name, rule.content)
for dep in rule.deps:
dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
assert dep_rule, f'Rule {dep} not known'
if dep not in self._rules:
self._add_primitive(dep, dep_rule)
return n
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
prop_order = self._prop_order
# sort by position in prop_order (if specified) then by original order
sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
prop_kv_rule_names = {}
for prop_name, prop_schema in properties:
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
prop_kv_rule_names[prop_name] = self._add_rule(
f'{name}{"-" if name else ""}{prop_name}-kv',
fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
)
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]
if additional_properties is not None and additional_properties != False:
sub_name = f'{name}{"-" if name else ""}additional'
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
self._add_primitive('value', PRIMITIVE_RULES['value'])
key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
prop_kv_rule_names["*"] = self._add_rule(
f'{sub_name}-kv',
f'{key_rule} ":" space {value_rule}'
)
optional_props.append("*")
rule = '"{" space '
rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props)
if optional_props:
rule += ' ('
if required_props:
rule += ' "," space ( '
def get_recursive_refs(ks, first_is_optional):
[k, *rest] = ks
kv_rule_name = prop_kv_rule_names[k]
comma_ref = f'( "," space {kv_rule_name} )'
if first_is_optional:
res = comma_ref + ('*' if k == '*' else '?')
else:
res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
if len(rest) > 0:
res += ' ' + self._add_rule(
f'{name}{"-" if name else ""}{k}-rest',
get_recursive_refs(rest, first_is_optional=True)
)
return res
rule += ' | '.join(
get_recursive_refs(optional_props[i:], first_is_optional=False)
for i in range(len(optional_props))
)
if required_props:
rule += ' )'
rule += ' )?'
rule += ' "}" space'
return rule
def format_grammar(self):
return '\n'.join(
f'{name} ::= {rule}'
for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0])
)
def main(args_in = None):
parser = argparse.ArgumentParser(
description='''
Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future.
''',
)
parser.add_argument(
'--prop-order',
default=[],
type=lambda s: s.split(','),
help='''
comma-separated property names defining the order of precedence for object properties;
properties not specified here are given lower precedence than those that are, and
are kept in their original order from the schema. Required properties are always
given precedence over optional properties.
'''
)
parser.add_argument(
'--allow-fetch',
action='store_true',
default=False,
help='Whether to allow fetching referenced schemas over HTTPS')
parser.add_argument(
'--dotall',
action='store_true',
default=False,
help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
parser.add_argument(
'--raw-pattern',
action='store_true',
default=False,
help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
args = parser.parse_args(args_in)
if args.schema.startswith('https://'):
url = args.schema
import requests
schema = requests.get(url).json()
elif args.schema == '-':
url = 'stdin'
schema = json.load(sys.stdin)
else:
url = f'file://{args.schema}'
with open(args.schema) as f:
schema = json.load(f)
converter = SchemaConverter(
prop_order={name: idx for idx, name in enumerate(args.prop_order)},
allow_fetch=args.allow_fetch,
dotall=args.dotall,
raw_pattern=args.raw_pattern)
schema = converter.resolve_refs(schema, url)
converter.visit(schema, '')
print(converter.format_grammar())
if __name__ == '__main__':
main()
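# Example invocation (added for clarity; the script's file name is not shown in this diff, so
# 'json_schema_to_grammar.py' is assumed here):
#   python json_schema_to_grammar.py schema.json --prop-order name,age > schema.gbnf
#   echo '{"type": "string", "maxLength": 10}' | python json_schema_to_grammar.py -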

View File

@@ -1,33 +0,0 @@
# Gradle files
.gradle/
build/
# Local configuration file (sdk path, etc)
local.properties
# Log/OS Files
*.log
# Android Studio generated files and folders
captures/
.externalNativeBuild/
.cxx/
*.apk
output.json
# IntelliJ
*.iml
.idea/
misc.xml
deploymentTargetDropDown.xml
render.experimental.xml
# Keystore files
*.jks
*.keystore
# Google Services (e.g. APIs or Firebase)
google-services.json
# Android Profiling
*.hprof

View File

@@ -1 +0,0 @@
/build

View File

@@ -1,65 +0,0 @@
plugins {
id("com.android.application")
id("org.jetbrains.kotlin.android")
}
android {
namespace = "com.example.llama"
compileSdk = 34
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
targetSdk = 34
versionCode = 1
versionName = "1.0"
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
vectorDrawables {
useSupportLibrary = true
}
}
buildTypes {
release {
isMinifyEnabled = false
proguardFiles(
getDefaultProguardFile("proguard-android-optimize.txt"),
"proguard-rules.pro"
)
}
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
buildFeatures {
compose = true
}
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
}
dependencies {
implementation("androidx.core:core-ktx:1.12.0")
implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
implementation("androidx.activity:activity-compose:1.8.2")
implementation(platform("androidx.compose:compose-bom:2023.08.00"))
implementation("androidx.compose.ui:ui")
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
androidTestImplementation("androidx.compose.ui:ui-test-junit4")
debugImplementation("androidx.compose.ui:ui-tooling")
debugImplementation("androidx.compose.ui:ui-test-manifest")
}

View File

@@ -1,21 +0,0 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View File

@@ -1,30 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:tools="http://schemas.android.com/tools">
<uses-permission android:name="android.permission.INTERNET" />
<application
android:allowBackup="true"
android:dataExtractionRules="@xml/data_extraction_rules"
android:fullBackupContent="@xml/backup_rules"
android:icon="@mipmap/ic_launcher"
android:label="@string/app_name"
android:roundIcon="@mipmap/ic_launcher_round"
android:supportsRtl="true"
android:theme="@style/Theme.LlamaAndroid"
>
<activity
android:name=".MainActivity"
android:exported="true"
android:theme="@style/Theme.LlamaAndroid">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>

View File

@@ -1,119 +0,0 @@
package com.example.llama
import android.app.DownloadManager
import android.net.Uri
import android.util.Log
import androidx.compose.material3.Button
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableDoubleStateOf
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.remember
import androidx.compose.runtime.rememberCoroutineScope
import androidx.compose.runtime.setValue
import androidx.core.database.getLongOrNull
import androidx.core.net.toUri
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch
import java.io.File
data class Downloadable(val name: String, val source: Uri, val destination: File) {
companion object {
@JvmStatic
private val tag: String? = this::class.qualifiedName
sealed interface State
data object Ready: State
data class Downloading(val id: Long): State
data class Downloaded(val downloadable: Downloadable): State
data class Error(val message: String): State
@JvmStatic
@Composable
fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
var status: State by remember {
mutableStateOf(
if (item.destination.exists()) Downloaded(item)
else Ready
)
}
var progress by remember { mutableDoubleStateOf(0.0) }
val coroutineScope = rememberCoroutineScope()
suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
while (true) {
val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
if (cursor == null) {
Log.e(tag, "dm.query() returned null")
return Error("dm.query() returned null")
}
if (!cursor.moveToFirst() || cursor.count < 1) {
cursor.close()
Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
return Ready
}
val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
val sofar = cursor.getLongOrNull(pix) ?: 0
val total = cursor.getLongOrNull(tix) ?: 1
cursor.close()
if (sofar == total) {
return Downloaded(item)
}
progress = (sofar * 1.0) / total
delay(1000L)
}
}
fun onClick() {
when (val s = status) {
is Downloaded -> {
viewModel.load(item.destination.path)
}
is Downloading -> {
coroutineScope.launch {
status = waitForDownload(s, item)
}
}
else -> {
item.destination.delete()
val request = DownloadManager.Request(item.source).apply {
setTitle("Downloading model")
setDescription("Downloading model: ${item.name}")
setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
setDestinationUri(item.destination.toUri())
}
viewModel.log("Saving ${item.name} to ${item.destination.path}")
Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
val id = dm.enqueue(request)
status = Downloading(id)
onClick()
}
}
}
Button(onClick = { onClick() }, enabled = status !is Downloading) {
when (status) {
is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
is Downloaded -> Text("Load ${item.name}")
is Ready -> Text("Download ${item.name}")
is Error -> Text("Download ${item.name}")
}
}
}
}
}

View File

@@ -1,154 +0,0 @@
package com.example.llama
import android.app.ActivityManager
import android.app.DownloadManager
import android.content.ClipData
import android.content.ClipboardManager
import android.net.Uri
import android.os.Bundle
import android.os.StrictMode
import android.os.StrictMode.VmPolicy
import android.text.format.Formatter
import androidx.activity.ComponentActivity
import androidx.activity.compose.setContent
import androidx.activity.viewModels
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.Row
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.foundation.layout.padding
import androidx.compose.foundation.lazy.LazyColumn
import androidx.compose.foundation.lazy.items
import androidx.compose.foundation.lazy.rememberLazyListState
import androidx.compose.material3.Button
import androidx.compose.material3.LocalContentColor
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.OutlinedTextField
import androidx.compose.material3.Surface
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.ui.Modifier
import androidx.compose.ui.unit.dp
import androidx.core.content.getSystemService
import com.example.llama.ui.theme.LlamaAndroidTheme
import java.io.File
class MainActivity(
activityManager: ActivityManager? = null,
downloadManager: DownloadManager? = null,
clipboardManager: ClipboardManager? = null,
): ComponentActivity() {
private val tag: String? = this::class.simpleName
private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
private val viewModel: MainViewModel by viewModels()
// Get a MemoryInfo object for the device's current memory status.
private fun availableMemory(): ActivityManager.MemoryInfo {
return ActivityManager.MemoryInfo().also { memoryInfo ->
activityManager.getMemoryInfo(memoryInfo)
}
}
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
StrictMode.setVmPolicy(
VmPolicy.Builder(StrictMode.getVmPolicy())
.detectLeakedClosableObjects()
.build()
)
val free = Formatter.formatFileSize(this, availableMemory().availMem)
val total = Formatter.formatFileSize(this, availableMemory().totalMem)
viewModel.log("Current memory: $free / $total")
viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
val extFilesDir = getExternalFilesDir(null)
val models = listOf(
Downloadable(
"Phi-2 7B (Q4_0, 1.6 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
File(extFilesDir, "phi-2-q4_0.gguf"),
),
Downloadable(
"TinyLlama 1.1B (f16, 2.2 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
),
Downloadable(
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
),
)
setContent {
LlamaAndroidTheme {
// A surface container using the 'background' color from the theme
Surface(
modifier = Modifier.fillMaxSize(),
color = MaterialTheme.colorScheme.background
) {
MainCompose(
viewModel,
clipboardManager,
downloadManager,
models,
)
}
}
}
}
}
@Composable
fun MainCompose(
viewModel: MainViewModel,
clipboard: ClipboardManager,
dm: DownloadManager,
models: List<Downloadable>
) {
Column {
val scrollState = rememberLazyListState()
Box(modifier = Modifier.weight(1f)) {
LazyColumn(state = scrollState) {
items(viewModel.messages) {
Text(
it,
style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
modifier = Modifier.padding(16.dp)
)
}
}
}
OutlinedTextField(
value = viewModel.message,
onValueChange = { viewModel.updateMessage(it) },
label = { Text("Message") },
)
Row {
Button({ viewModel.send() }) { Text("Send") }
Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
Button({ viewModel.clear() }) { Text("Clear") }
Button({
viewModel.messages.joinToString("\n").let {
clipboard.setPrimaryClip(ClipData.newPlainText("", it))
}
}) { Text("Copy") }
}
Column {
for (model in models) {
Downloadable.Button(viewModel, dm, model)
}
}
}
}

View File

@@ -1,105 +0,0 @@
package com.example.llama
import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
import androidx.compose.runtime.setValue
import androidx.lifecycle.ViewModel
import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
}
private val tag: String? = this::class.simpleName
var messages by mutableStateOf(listOf("Initializing..."))
private set
var message by mutableStateOf("")
private set
override fun onCleared() {
super.onCleared()
viewModelScope.launch {
try {
llamaAndroid.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
}
}
fun send() {
val text = message
message = ""
// Add to messages console.
messages += text
messages += ""
viewModelScope.launch {
llamaAndroid.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
}
.collect { messages = messages.dropLast(1) + (messages.last() + it) }
}
}
fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
viewModelScope.launch {
try {
val start = System.nanoTime()
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
val warmup = (end - start).toDouble() / NanosPerSecond
messages += "Warm up time: $warmup seconds, please wait..."
if (warmup > 5.0) {
messages += "Warm up took too long, aborting benchmark"
return@launch
}
messages += llamaAndroid.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
}
}
}
fun load(pathToModel: String) {
viewModelScope.launch {
try {
llamaAndroid.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)
messages += exc.message!!
}
}
}
fun updateMessage(newMessage: String) {
message = newMessage
}
fun clear() {
messages = listOf()
}
fun log(message: String) {
messages += message
}
}

View File

@@ -1,11 +0,0 @@
package com.example.llama.ui.theme
import androidx.compose.ui.graphics.Color
val Purple80 = Color(0xFFD0BCFF)
val PurpleGrey80 = Color(0xFFCCC2DC)
val Pink80 = Color(0xFFEFB8C8)
val Purple40 = Color(0xFF6650a4)
val PurpleGrey40 = Color(0xFF625b71)
val Pink40 = Color(0xFF7D5260)

View File

@@ -1,70 +0,0 @@
package com.example.llama.ui.theme
import android.app.Activity
import android.os.Build
import androidx.compose.foundation.isSystemInDarkTheme
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.darkColorScheme
import androidx.compose.material3.dynamicDarkColorScheme
import androidx.compose.material3.dynamicLightColorScheme
import androidx.compose.material3.lightColorScheme
import androidx.compose.runtime.Composable
import androidx.compose.runtime.SideEffect
import androidx.compose.ui.graphics.toArgb
import androidx.compose.ui.platform.LocalContext
import androidx.compose.ui.platform.LocalView
import androidx.core.view.WindowCompat
private val DarkColorScheme = darkColorScheme(
primary = Purple80,
secondary = PurpleGrey80,
tertiary = Pink80
)
private val LightColorScheme = lightColorScheme(
primary = Purple40,
secondary = PurpleGrey40,
tertiary = Pink40
/* Other default colors to override
background = Color(0xFFFFFBFE),
surface = Color(0xFFFFFBFE),
onPrimary = Color.White,
onSecondary = Color.White,
onTertiary = Color.White,
onBackground = Color(0xFF1C1B1F),
onSurface = Color(0xFF1C1B1F),
*/
)
@Composable
fun LlamaAndroidTheme(
darkTheme: Boolean = isSystemInDarkTheme(),
// Dynamic color is available on Android 12+
dynamicColor: Boolean = true,
content: @Composable () -> Unit
) {
val colorScheme = when {
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
val context = LocalContext.current
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
}
darkTheme -> DarkColorScheme
else -> LightColorScheme
}
val view = LocalView.current
if (!view.isInEditMode) {
SideEffect {
val window = (view.context as Activity).window
window.statusBarColor = colorScheme.primary.toArgb()
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
}
}
MaterialTheme(
colorScheme = colorScheme,
typography = Typography,
content = content
)
}

View File

@@ -1,34 +0,0 @@
package com.example.llama.ui.theme
import androidx.compose.material3.Typography
import androidx.compose.ui.text.TextStyle
import androidx.compose.ui.text.font.FontFamily
import androidx.compose.ui.text.font.FontWeight
import androidx.compose.ui.unit.sp
// Set of Material typography styles to start with
val Typography = Typography(
bodyLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 16.sp,
lineHeight = 24.sp,
letterSpacing = 0.5.sp
)
/* Other default text styles to override
titleLarge = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Normal,
fontSize = 22.sp,
lineHeight = 28.sp,
letterSpacing = 0.sp
),
labelSmall = TextStyle(
fontFamily = FontFamily.Default,
fontWeight = FontWeight.Medium,
fontSize = 11.sp,
lineHeight = 16.sp,
letterSpacing = 0.5.sp
)
*/
)

View File

@@ -1,170 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<vector xmlns:android="http://schemas.android.com/apk/res/android"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path
android:fillColor="#3DDC84"
android:pathData="M0,0h108v108h-108z" />
<path
android:fillColor="#00000000"
android:pathData="M9,0L9,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,0L19,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,0L29,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,0L39,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,0L49,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,0L59,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,0L69,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,0L79,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M89,0L89,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M99,0L99,108"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,9L108,9"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,19L108,19"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,29L108,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,39L108,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,49L108,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,59L108,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,69L108,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,79L108,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,89L108,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M0,99L108,99"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,29L89,29"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,39L89,39"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,49L89,49"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,59L89,59"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,69L89,69"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M19,79L89,79"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M29,19L29,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M39,19L39,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M49,19L49,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M59,19L59,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M69,19L69,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
<path
android:fillColor="#00000000"
android:pathData="M79,19L79,89"
android:strokeWidth="0.8"
android:strokeColor="#33FFFFFF" />
</vector>

View File

@@ -1,30 +0,0 @@
<vector xmlns:android="http://schemas.android.com/apk/res/android"
xmlns:aapt="http://schemas.android.com/aapt"
android:width="108dp"
android:height="108dp"
android:viewportWidth="108"
android:viewportHeight="108">
<path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
<aapt:attr name="android:fillColor">
<gradient
android:endX="85.84757"
android:endY="92.4963"
android:startX="42.9492"
android:startY="49.59793"
android:type="linear">
<item
android:color="#44000000"
android:offset="0.0" />
<item
android:color="#00000000"
android:offset="1.0" />
</gradient>
</aapt:attr>
</path>
<path
android:fillColor="#FFFFFF"
android:fillType="nonZero"
android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
android:strokeWidth="1"
android:strokeColor="#00000000" />
</vector>

View File

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

View File

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
<background android:drawable="@drawable/ic_launcher_background" />
<foreground android:drawable="@drawable/ic_launcher_foreground" />
<monochrome android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>

Binary files not shown (10 deleted image assets, ranging from 982 B to 7.6 KiB).

View File

@@ -1,10 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="purple_200">#FFBB86FC</color>
<color name="purple_500">#FF6200EE</color>
<color name="purple_700">#FF3700B3</color>
<color name="teal_200">#FF03DAC5</color>
<color name="teal_700">#FF018786</color>
<color name="black">#FF000000</color>
<color name="white">#FFFFFFFF</color>
</resources>

View File

@@ -1,3 +0,0 @@
<resources>
<string name="app_name">LlamaAndroid</string>
</resources>

View File

@@ -1,5 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
</resources>

View File

@@ -1,13 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample backup rules file; uncomment and customize as necessary.
See https://developer.android.com/guide/topics/data/autobackup
for details.
Note: This file is ignored for devices older than API 31
See https://developer.android.com/about/versions/12/backup-restore
-->
<full-backup-content>
<!--
<include domain="sharedpref" path="."/>
<exclude domain="sharedpref" path="device.xml"/>
-->
</full-backup-content>

View File

@@ -1,19 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><!--
Sample data extraction rules file; uncomment and customize as necessary.
See https://developer.android.com/about/versions/12/backup-restore#xml-changes
for details.
-->
<data-extraction-rules>
<cloud-backup>
<!-- TODO: Use <include> and <exclude> to control what is backed up.
<include .../>
<exclude .../>
-->
</cloud-backup>
<!--
<device-transfer>
<include .../>
<exclude .../>
</device-transfer>
-->
</data-extraction-rules>

View File

@@ -1,6 +0,0 @@
// Top-level build file where you can add configuration options common to all sub-projects/modules.
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
id("com.android.library") version "8.2.0" apply false
}

View File

@@ -1,23 +0,0 @@
# Project-wide Gradle settings.
# IDE (e.g. Android Studio) users:
# Gradle settings configured through the IDE *will override*
# any settings specified in this file.
# For more details on how to configure your build environment visit
# http://www.gradle.org/docs/current/userguide/build_environment.html
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
# org.gradle.parallel=true
# AndroidX package structure to make it clearer which packages are bundled with the
# Android operating system, and which are packaged with your app's APK
# https://developer.android.com/topic/libraries/support-library/androidx-rn
android.useAndroidX=true
# Kotlin code style for this project: "official" or "obsolete":
kotlin.code.style=official
# Enables namespacing of each library's R class so that its R class includes only the
# resources declared in the library itself and none from the library's dependencies,
# thereby reducing the size of the R class for that library
android.nonTransitiveRClass=true

View File

@@ -1,6 +0,0 @@
#Thu Dec 21 14:31:09 AEDT 2023
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

View File

@@ -1,185 +0,0 @@
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"

View File

@@ -1 +0,0 @@
/build

View File

@@ -1,69 +0,0 @@
plugins {
id("com.android.library")
id("org.jetbrains.kotlin.android")
}
android {
namespace = "android.llama.cpp"
compileSdk = 34
defaultConfig {
minSdk = 33
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
consumerProguardFiles("consumer-rules.pro")
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
cppFlags("")
}
}
}
buildTypes {
release {
isMinifyEnabled = false
proguardFiles(
getDefaultProguardFile("proguard-android-optimize.txt"),
"proguard-rules.pro"
)
}
}
externalNativeBuild {
cmake {
path("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
}
dependencies {
implementation("androidx.core:core-ktx:1.12.0")
implementation("androidx.appcompat:appcompat:1.6.1")
implementation("com.google.android.material:material:1.11.0")
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
}

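For orientation, a consumer module would pull this deleted library module in with an ordinary project dependency. A minimal Gradle Kotlin DSL sketch (the :app module is declared in settings.gradle.kts further down, but its build script is not shown here, so these exact contents are an assumption):

// app/build.gradle.kts (illustrative only)
plugins {
    id("com.android.application")
    id("org.jetbrains.kotlin.android")
}

dependencies {
    // Depends on the :llama wrapper module above, including its CMake/JNI build
    implementation(project(":llama"))
}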
View File

@@ -1,21 +0,0 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View File

@@ -1,24 +0,0 @@
package android.llama.cpp
import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.ext.junit.runners.AndroidJUnit4
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.Assert.*
/**
* Instrumented test, which will execute on an Android device.
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
@Test
fun useAppContext() {
// Context of the app under test.
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
assertEquals("android.llama.cpp.test", appContext.packageName)
}
}

View File

@@ -1,4 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View File

@@ -1,447 +0,0 @@
#include <android/log.h>
#include <jni.h>
#include <iomanip>
#include <math.h>
#include <string>
#include <unistd.h>
#include "llama.h"
#include "common.h"
// Write C++ code here.
//
// Do not forget to dynamically load the C++ library into your application.
//
// For instance,
//
// In MainActivity.java:
// static {
// System.loadLibrary("llama-android");
// }
//
// Or, in MainActivity.kt:
// companion object {
// init {
// System.loadLibrary("llama-android")
// }
// }
#define TAG "llama-android.cpp"
#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
jclass la_int_var;
jmethodID la_int_var_value;
jmethodID la_int_var_inc;
std::string cached_token_chars;
bool is_valid_utf8(const char * string) {
if (!string) {
return true;
}
const unsigned char * bytes = (const unsigned char *)string;
int num;
while (*bytes != 0x00) {
if ((*bytes & 0x80) == 0x00) {
// U+0000 to U+007F
num = 1;
} else if ((*bytes & 0xE0) == 0xC0) {
// U+0080 to U+07FF
num = 2;
} else if ((*bytes & 0xF0) == 0xE0) {
// U+0800 to U+FFFF
num = 3;
} else if ((*bytes & 0xF8) == 0xF0) {
// U+10000 to U+10FFFF
num = 4;
} else {
return false;
}
bytes += 1;
for (int i = 1; i < num; ++i) {
if ((*bytes & 0xC0) != 0x80) {
return false;
}
bytes += 1;
}
}
return true;
}
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
LOGi("Loading model from %s", path_to_model);
auto model = llama_load_model_from_file(path_to_model, model_params);
env->ReleaseStringUTFChars(filename, path_to_model);
if (!model) {
LOGe("load_model() failed");
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
return 0;
}
return reinterpret_cast<jlong>(model);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
LOGe("new_context(): model cannot be null");
env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
return 0;
}
int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
LOGi("Using %d threads", n_threads);
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 2048;
ctx_params.n_threads = n_threads;
ctx_params.n_threads_batch = n_threads;
llama_context * context = llama_new_context_with_model(model, ctx_params);
if (!context) {
LOGe("llama_new_context_with_model() returned null)");
env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
"llama_new_context_with_model() returned null)");
return 0;
}
return reinterpret_cast<jlong>(context);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
jlong model_pointer,
jlong batch_pointer,
jint pp,
jint tg,
jint pl,
jint nr
) {
auto pp_avg = 0.0;
auto tg_avg = 0.0;
auto pp_std = 0.0;
auto tg_std = 0.0;
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto model = reinterpret_cast<llama_model *>(model_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const int n_ctx = llama_n_ctx(context);
LOGi("n_ctx = %d", n_ctx);
int i, j;
int nri;
for (nri = 0; nri < nr; nri++) {
LOGi("Benchmark prompt processing (pp)");
common_batch_clear(*batch);
const int n_tokens = pp;
for (i = 0; i < n_tokens; i++) {
common_batch_add(*batch, 0, i, { 0 }, false);
}
batch->logits[batch->n_tokens - 1] = true;
llama_kv_cache_clear(context);
const auto t_pp_start = ggml_time_us();
if (llama_decode(context, *batch) != 0) {
LOGi("llama_decode() failed during prompt processing");
}
const auto t_pp_end = ggml_time_us();
// bench text generation
LOGi("Benchmark text generation (tg)");
llama_kv_cache_clear(context);
const auto t_tg_start = ggml_time_us();
for (i = 0; i < tg; i++) {
common_batch_clear(*batch);
for (j = 0; j < pl; j++) {
common_batch_add(*batch, 0, i, { j }, true);
}
LOGi("llama_decode() text generation: %d", i);
if (llama_decode(context, *batch) != 0) {
LOGi("llama_decode() failed during text generation");
}
}
const auto t_tg_end = ggml_time_us();
llama_kv_cache_clear(context);
const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
const auto speed_pp = double(pp) / t_pp;
const auto speed_tg = double(pl * tg) / t_tg;
pp_avg += speed_pp;
tg_avg += speed_tg;
pp_std += speed_pp * speed_pp;
tg_std += speed_tg * speed_tg;
LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
}
pp_avg /= double(nr);
tg_avg /= double(nr);
if (nr > 1) {
pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
} else {
pp_std = 0;
tg_std = 0;
}
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
const auto backend = "(Android)"; // TODO: What should this be?
std::stringstream result;
result << std::setprecision(2);
result << "| model | size | params | backend | test | t/s |\n";
result << "| --- | --- | --- | --- | --- | --- |\n";
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
return env->NewStringUTF(result.str().c_str());
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
llama_batch *batch = new llama_batch {
0,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
};
if (embd) {
batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
} else {
batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
}
batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
for (int i = 0; i < n_tokens; ++i) {
batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
}
batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
return reinterpret_cast<jlong>(batch);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = true;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
return reinterpret_cast<jlong>(smpl);
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
llama_backend_init();
}
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
Java_android_llama_cpp_LLamaAndroid_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
jlong batch_pointer,
jstring jtext,
jint n_len
) {
cached_token_chars.clear();
const auto text = env->GetStringUTFChars(jtext, 0);
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto tokens_list = common_tokenize(context, text, 1);
auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
if (n_kv_req > n_ctx) {
LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
}
for (auto id : tokens_list) {
LOGi("%s", common_token_to_piece(context, id).c_str());
}
common_batch_clear(*batch);
// evaluate the initial prompt
for (auto i = 0; i < tokens_list.size(); i++) {
common_batch_add(*batch, tokens_list[i], i, { 0 }, false);
}
// llama_decode will output logits only for the last token of the prompt
batch->logits[batch->n_tokens - 1] = true;
if (llama_decode(context, *batch) != 0) {
LOGe("llama_decode() failed");
}
env->ReleaseStringUTFChars(jtext, text);
return batch->n_tokens;
}
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
jlong batch_pointer,
jlong sampler_pointer,
jint n_len,
jobject intvar_ncur
) {
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
const auto model = llama_get_model(context);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
// sample the most likely token
const auto new_token_id = llama_sampler_sample(sampler, context, -1);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return nullptr;
}
auto new_token_chars = common_token_to_piece(context, new_token_id);
cached_token_chars += new_token_chars;
jstring new_token = nullptr;
if (is_valid_utf8(cached_token_chars.c_str())) {
new_token = env->NewStringUTF(cached_token_chars.c_str());
LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id);
cached_token_chars.clear();
} else {
new_token = env->NewStringUTF("");
}
common_batch_clear(*batch);
common_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
env->CallVoidMethod(intvar_ncur, la_int_var_inc);
if (llama_decode(context, *batch) != 0) {
LOGe("llama_decode() returned null");
}
return new_token;
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}

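A note on the exported symbol names above: JNI mangles native methods as Java_<package>_<class>_<method>, turning dots into underscores and escaping a literal underscore in the method name as "_1", so Java_android_llama_cpp_LLamaAndroid_load_1model binds load_model() on android.llama.cpp.LLamaAndroid. The shape of one such Kotlin-side binding, as a sketch only (the full set of declarations is in LLamaAndroid.kt below):

package android.llama.cpp

// Sketch of a single binding; the real declarations live in LLamaAndroid.kt.
class LLamaAndroid {
    // Resolves against Java_android_llama_cpp_LLamaAndroid_load_1model in libllama-android.so
    private external fun load_model(filename: String): Long
}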
View File

@@ -1,179 +0,0 @@
package android.llama.cpp
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
import kotlinx.coroutines.asCoroutineDispatcher
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.flowOn
import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
class LLamaAndroid {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
thread(start = false, name = "Llm-RunLoop") {
Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
// No-op if called more than once.
System.loadLibrary("llama-android")
// Set llama log handler to Android
log_to_android()
backend_init(false)
Log.d(tag, system_info())
it.run()
}.apply {
uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
Log.e(tag, "Unhandled exception", exception)
}
}
}.asCoroutineDispatcher()
private val nlen: Int = 64
private external fun log_to_android()
private external fun load_model(filename: String): Long
private external fun free_model(model: Long)
private external fun new_context(model: Long): Long
private external fun free_context(context: Long)
private external fun backend_init(numa: Boolean)
private external fun backend_free()
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
private external fun free_batch(batch: Long)
private external fun new_sampler(): Long
private external fun free_sampler(sampler: Long)
private external fun bench_model(
context: Long,
model: Long,
batch: Long,
pp: Int,
tg: Int,
pl: Int,
nr: Int
): String
private external fun system_info(): String
private external fun completion_init(
context: Long,
batch: Long,
text: String,
nLen: Int
): Int
private external fun completion_loop(
context: Long,
batch: Long,
sampler: Long,
nLen: Int,
ncur: IntVar
): String?
private external fun kv_cache_clear(context: Long)
suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
return withContext(runLoop) {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
Log.d(tag, "bench(): $state")
bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
}
else -> throw IllegalStateException("No model loaded")
}
}
}
suspend fun load(pathToModel: String) {
withContext(runLoop) {
when (threadLocalState.get()) {
is State.Idle -> {
val model = load_model(pathToModel)
if (model == 0L) throw IllegalStateException("load_model() failed")
val context = new_context(model)
if (context == 0L) throw IllegalStateException("new_context() failed")
val batch = new_batch(512, 0, 1)
if (batch == 0L) throw IllegalStateException("new_batch() failed")
val sampler = new_sampler()
if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
Log.i(tag, "Loaded model $pathToModel")
threadLocalState.set(State.Loaded(model, context, batch, sampler))
}
else -> throw IllegalStateException("Model already loaded")
}
}
}
fun send(message: String): Flow<String> = flow {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
if (str == null) {
break
}
emit(str)
}
kv_cache_clear(state.context)
}
else -> {}
}
}.flowOn(runLoop)
/**
* Unloads the model and frees resources.
*
* This is a no-op if there's no model loaded.
*/
suspend fun unload() {
withContext(runLoop) {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
free_context(state.context)
free_model(state.model)
free_batch(state.batch)
free_sampler(state.sampler);
threadLocalState.set(State.Idle)
}
else -> {}
}
}
}
companion object {
private class IntVar(value: Int) {
@Volatile
var value: Int = value
private set
fun inc() {
synchronized(this) {
value += 1
}
}
}
private sealed interface State {
data object Idle: State
data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
}
// Enforce only one instance of LLamaAndroid.
private val _instance: LLamaAndroid = LLamaAndroid()
fun instance(): LLamaAndroid = _instance
}
}

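For reference, a minimal sketch of driving this wrapper from a coroutine; the model path is a placeholder, and in a real app this would run inside a lifecycle-aware scope rather than runBlocking:

import android.llama.cpp.LLamaAndroid
import kotlinx.coroutines.flow.collect
import kotlinx.coroutines.runBlocking

fun demo() = runBlocking {
    val llama = LLamaAndroid.instance()
    // Placeholder path; any GGUF model readable by the app will do.
    llama.load("/data/local/tmp/model.gguf")
    // send() emits decoded token pieces until the internal nlen limit is reached.
    llama.send("Hello").collect { piece -> print(piece) }
    llama.unload()
}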
View File

@@ -1,17 +0,0 @@
package android.llama.cpp
import org.junit.Test
import org.junit.Assert.*
/**
* Example local unit test, which will execute on the development machine (host).
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
class ExampleUnitTest {
@Test
fun addition_isCorrect() {
assertEquals(4, 2 + 2)
}
}

View File

@@ -1,18 +0,0 @@
pluginManagement {
repositories {
google()
mavenCentral()
gradlePluginPortal()
}
}
dependencyResolutionManagement {
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
repositories {
google()
mavenCentral()
}
}
rootProject.name = "LlamaAndroid"
include(":app")
include(":llama")

View File

@@ -1,2 +0,0 @@
xcuserdata
xcshareddata

View File

@@ -1,335 +0,0 @@
import Foundation
import llama
enum LlamaError: Error {
case couldNotInitializeContext
}
func llama_batch_clear(_ batch: inout llama_batch) {
batch.n_tokens = 0
}
func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
batch.token [Int(batch.n_tokens)] = id
batch.pos [Int(batch.n_tokens)] = pos
batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
for i in 0..<seq_ids.count {
batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
}
batch.logits [Int(batch.n_tokens)] = logits ? 1 : 0
batch.n_tokens += 1
}
actor LlamaContext {
private var model: OpaquePointer
private var context: OpaquePointer
private var sampling: UnsafeMutablePointer<llama_sampler>
private var batch: llama_batch
private var tokens_list: [llama_token]
var is_done: Bool = false
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 1024
var n_cur: Int32 = 0
var n_decode: Int32 = 0
init(model: OpaquePointer, context: OpaquePointer) {
self.model = model
self.context = context
self.tokens_list = []
self.batch = llama_batch_init(512, 0, 1)
self.temporary_invalid_cchars = []
let sparams = llama_sampler_chain_default_params()
self.sampling = llama_sampler_chain_init(sparams)
llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
}
deinit {
llama_sampler_free(sampling)
llama_batch_free(batch)
llama_free(context)
llama_free_model(model)
llama_backend_free()
}
static func create_context(path: String) throws -> LlamaContext {
llama_backend_init()
var model_params = llama_model_default_params()
#if targetEnvironment(simulator)
model_params.n_gpu_layers = 0
print("Running on simulator, force use n_gpu_layers = 0")
#endif
let model = llama_load_model_from_file(path, model_params)
guard let model else {
print("Could not load model at \(path)")
throw LlamaError.couldNotInitializeContext
}
let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
print("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.n_ctx = 2048
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {
print("Could not load context!")
throw LlamaError.couldNotInitializeContext
}
return LlamaContext(model: model, context: context)
}
func model_info() -> String {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
result.initialize(repeating: Int8(0), count: 256)
defer {
result.deallocate()
}
// TODO: this is probably a very stupid way to get the string from C
let nChars = llama_model_desc(model, result, 256)
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
var SwiftString = ""
for char in bufferPointer {
SwiftString.append(Character(UnicodeScalar(UInt8(char))))
}
return SwiftString
}
func get_n_tokens() -> Int32 {
return batch.n_tokens;
}
func completion_init(text: String) {
print("attempting to complete \"\(text)\"")
tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []
let n_ctx = llama_n_ctx(context)
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
if n_kv_req > n_ctx {
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
}
for id in tokens_list {
print(String(cString: token_to_piece(token: id) + [0]))
}
llama_batch_clear(&batch)
for i1 in 0..<tokens_list.count {
let i = Int(i1)
llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
if llama_decode(context, batch) != 0 {
print("llama_decode() failed")
}
n_cur = batch.n_tokens
}
func completion_loop() -> String {
var new_token_id: llama_token = 0
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str
}
let new_token_cchars = token_to_piece(token: new_token_id)
temporary_invalid_cchars.append(contentsOf: new_token_cchars)
let new_token_str: String
if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
temporary_invalid_cchars.removeAll()
new_token_str = string
} else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
// in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
let string = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
new_token_str = string
} else {
new_token_str = ""
}
print(new_token_str)
// tokens_list.append(new_token_id)
llama_batch_clear(&batch)
llama_batch_add(&batch, new_token_id, n_cur, [0], true)
n_decode += 1
n_cur += 1
if llama_decode(context, batch) != 0 {
print("failed to evaluate llama!")
}
return new_token_str
}
func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
var pp_avg: Double = 0
var tg_avg: Double = 0
var pp_std: Double = 0
var tg_std: Double = 0
for _ in 0..<nr {
// bench prompt processing
llama_batch_clear(&batch)
let n_tokens = pp
for i in 0..<n_tokens {
llama_batch_add(&batch, 0, Int32(i), [0], false)
}
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
llama_kv_cache_clear(context)
let t_pp_start = ggml_time_us()
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during prompt")
}
llama_synchronize(context)
let t_pp_end = ggml_time_us()
// bench text generation
llama_kv_cache_clear(context)
let t_tg_start = ggml_time_us()
for i in 0..<tg {
llama_batch_clear(&batch)
for j in 0..<pl {
llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
}
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during text generation")
}
llama_synchronize(context)
}
let t_tg_end = ggml_time_us()
llama_kv_cache_clear(context)
let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
let speed_pp = Double(pp) / t_pp
let speed_tg = Double(pl*tg) / t_tg
pp_avg += speed_pp
tg_avg += speed_tg
pp_std += speed_pp * speed_pp
tg_std += speed_tg * speed_tg
print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
}
pp_avg /= Double(nr)
tg_avg /= Double(nr)
if nr > 1 {
pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
} else {
pp_std = 0
tg_std = 0
}
let model_desc = model_info();
let model_size = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
let backend = "Metal";
let pp_avg_str = String(format: "%.2f", pp_avg);
let tg_avg_str = String(format: "%.2f", tg_avg);
let pp_std_str = String(format: "%.2f", pp_std);
let tg_std_str = String(format: "%.2f", tg_std);
var result = ""
result += String("| model | size | params | backend | test | t/s |\n")
result += String("| --- | --- | --- | --- | --- | --- |\n")
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
return result;
}
func clear() {
tokens_list.removeAll()
temporary_invalid_cchars.removeAll()
llama_kv_cache_clear(context)
}
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let utf8Count = text.utf8.count
let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
var swiftTokens: [llama_token] = []
for i in 0..<tokenCount {
swiftTokens.append(tokens[Int(i)])
}
tokens.deallocate()
return swiftTokens
}
/// - note: The result does not contain a null terminator
private func token_to_piece(token: llama_token) -> [CChar] {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
result.initialize(repeating: Int8(0), count: 8)
defer {
result.deallocate()
}
let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
if nTokens < 0 {
let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
defer {
newResult.deallocate()
}
let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
return Array(bufferPointer)
} else {
let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
return Array(bufferPointer)
}
}
}

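Both bench routines (the JNI one earlier and the Swift one above) keep only running sums of the per-run speeds and their squares, then recover the sample standard deviation in closed form. With n = nr runs and x_i the speed of run i, the expression they evaluate is

$$ s = \sqrt{ \frac{\sum_i x_i^2}{n-1} - \bar{x}^2 \cdot \frac{n}{n-1} } $$

which is the square root of the usual unbiased variance estimator \frac{1}{n-1}\sum_i (x_i - \bar{x})^2, rearranged so that only the two accumulators are needed; for a single run both implementations fall back to 0.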
View File

@@ -1,439 +0,0 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 56;
objects = {
/* Begin PBXBuildFile section */
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = "<group>"; };
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
DF810E132B4A5BA200301144 /* llama in Frameworks */,
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
8A1C836A2AC328BD0096AF73 = {
isa = PBXGroup;
children = (
DF2D2FE72B4A59BE00FCB72D /* llama.cpp */,
8A907F312AC7134E006146EA /* llama.cpp.swift */,
8A3F84232AC4C891005E2EE8 /* models */,
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
8A1C83742AC328BD0096AF73 /* Products */,
8A39BE082AC7601000BFEB40 /* Frameworks */,
);
sourceTree = "<group>";
};
8A1C83742AC328BD0096AF73 /* Products */ = {
isa = PBXGroup;
children = (
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
);
name = Products;
sourceTree = "<group>";
};
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXGroup;
children = (
8A3F84102AC4BD85005E2EE8 /* Resources */,
8A9F7C4B2AC332DC008AE1EA /* Models */,
8A9F7C4A2AC332BF008AE1EA /* UI */,
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
);
path = llama.swiftui;
sourceTree = "<group>";
};
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
isa = PBXGroup;
children = (
549479CA2AC9E16000E0F78B /* Metal.framework */,
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
);
name = Frameworks;
sourceTree = "<group>";
};
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
isa = PBXGroup;
children = (
8A3F84112AC4BD8C005E2EE8 /* models */,
);
path = Resources;
sourceTree = "<group>";
};
8A3F84112AC4BD8C005E2EE8 /* models */ = {
isa = PBXGroup;
children = (
);
path = models;
sourceTree = "<group>";
};
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
isa = PBXGroup;
children = (
8A907F322AC7134E006146EA /* LibLlama.swift */,
);
path = llama.cpp.swift;
sourceTree = "<group>";
};
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
isa = PBXGroup;
children = (
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */,
);
path = UI;
sourceTree = "<group>";
};
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
isa = PBXGroup;
children = (
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
);
path = Models;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
isa = PBXNativeTarget;
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
buildPhases = (
8A1C836F2AC328BD0096AF73 /* Sources */,
8A1C83702AC328BD0096AF73 /* Frameworks */,
8A1C83712AC328BD0096AF73 /* Resources */,
);
buildRules = (
);
dependencies = (
);
name = llama.swiftui;
packageProductDependencies = (
DF810E122B4A5BA200301144 /* llama */,
);
productName = llama.swiftui;
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
productType = "com.apple.product-type.application";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
8A1C836B2AC328BD0096AF73 /* Project object */ = {
isa = PBXProject;
attributes = {
BuildIndependentTargetsInParallel = 1;
LastSwiftUpdateCheck = 1500;
LastUpgradeCheck = 1500;
TargetAttributes = {
8A1C83722AC328BD0096AF73 = {
CreatedOnToolsVersion = 15.0;
LastSwiftMigration = 1500;
};
};
};
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
compatibilityVersion = "Xcode 14.0";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
en,
Base,
);
mainGroup = 8A1C836A2AC328BD0096AF73;
packageReferences = (
);
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
);
};
/* End PBXProject section */
/* Begin PBXResourcesBuildPhase section */
8A1C83712AC328BD0096AF73 /* Resources */ = {
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXResourcesBuildPhase section */
/* Begin PBXSourcesBuildPhase section */
8A1C836F2AC328BD0096AF73 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
8A1C837F2AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos;
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
};
name = Debug;
};
8A1C83802AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu17;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
SDKROOT = iphoneos;
SWIFT_COMPILATION_MODE = wholemodule;
VALIDATE_PRODUCT = YES;
};
name = Release;
};
8A1C83822AC328BE0096AF73 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = K5UQJPP73A;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2,7";
};
name = Debug;
};
8A1C83832AC328BE0096AF73 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = K5UQJPP73A;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
PRODUCT_NAME = "$(TARGET_NAME)";
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2,7";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
isa = XCConfigurationList;
buildConfigurations = (
8A1C837F2AC328BE0096AF73 /* Debug */,
8A1C83802AC328BE0096AF73 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
isa = XCConfigurationList;
buildConfigurations = (
8A1C83822AC328BE0096AF73 /* Debug */,
8A1C83832AC328BE0096AF73 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
/* Begin XCSwiftPackageProductDependency section */
DF810E122B4A5BA200301144 /* llama */ = {
isa = XCSwiftPackageProductDependency;
productName = llama;
};
/* End XCSwiftPackageProductDependency section */
};
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
}

View File

@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "self:">
</FileRef>
</Workspace>

View File

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>IDEDidComputeMac32BitWarning</key>
<true/>
</dict>
</plist>

View File

@@ -1,13 +0,0 @@
{
"images" : [
{
"idiom" : "universal",
"platform" : "ios",
"size" : "1024x1024"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}

View File

@@ -1,6 +0,0 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}

View File

@@ -1,196 +0,0 @@
import Foundation
struct Model: Identifiable {
var id = UUID()
var name: String
var url: String
var filename: String
var status: String?
}
@MainActor
class LlamaState: ObservableObject {
@Published var messageLog = ""
@Published var cacheCleared = false
@Published var downloadedModels: [Model] = []
@Published var undownloadedModels: [Model] = []
let NS_PER_S = 1_000_000_000.0
private var llamaContext: LlamaContext?
private var defaultModelUrl: URL? {
Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
}
init() {
loadModelsFromDisk()
loadDefaultModels()
}
private func loadModelsFromDisk() {
do {
let documentsURL = getDocumentsDirectory()
let modelURLs = try FileManager.default.contentsOfDirectory(at: documentsURL, includingPropertiesForKeys: nil, options: [.skipsHiddenFiles, .skipsSubdirectoryDescendants])
for modelURL in modelURLs {
let modelName = modelURL.deletingPathExtension().lastPathComponent
downloadedModels.append(Model(name: modelName, url: "", filename: modelURL.lastPathComponent, status: "downloaded"))
}
} catch {
print("Error loading models from disk: \(error)")
}
}
private func loadDefaultModels() {
do {
try loadModel(modelUrl: defaultModelUrl)
} catch {
messageLog += "Error!\n"
}
for model in defaultModels {
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
if FileManager.default.fileExists(atPath: fileURL.path) {
} else {
var undownloadedModel = model
undownloadedModel.status = "download"
undownloadedModels.append(undownloadedModel)
}
}
}
func getDocumentsDirectory() -> URL {
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
return paths[0]
}
private let defaultModels: [Model] = [
Model(name: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf", status: "download"),
Model(
name: "TinyLlama-1.1B Chat (Q8_0, 1.1 GiB)",
url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf?download=true",
filename: "tinyllama-1.1b-chat-v1.0.Q8_0.gguf", status: "download"
),
Model(
name: "TinyLlama-1.1B (F16, 2.2 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
filename: "tinyllama-1.1b-f16.gguf", status: "download"
),
Model(
name: "Phi-2.7B (Q4_0, 1.6 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
filename: "phi-2-q4_0.gguf", status: "download"
),
Model(
name: "Phi-2.7B (Q8_0, 2.8 GiB)",
url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
filename: "phi-2-q8_0.gguf", status: "download"
),
Model(
name: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
url: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
filename: "mistral-7b-v0.1.Q4_0.gguf", status: "download"
),
Model(
name: "OpenHermes-2.5-Mistral-7B (Q3_K_M, 3.52 GiB)",
url: "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q3_K_M.gguf?download=true",
filename: "openhermes-2.5-mistral-7b.Q3_K_M.gguf", status: "download"
)
]
func loadModel(modelUrl: URL?) throws {
if let modelUrl {
messageLog += "Loading model...\n"
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
// Assuming that the model is successfully loaded, update the downloaded models
updateDownloadedModels(modelName: modelUrl.lastPathComponent, status: "downloaded")
} else {
messageLog += "Load a model from the list below\n"
}
}
private func updateDownloadedModels(modelName: String, status: String) {
undownloadedModels.removeAll { $0.name == modelName }
}
func complete(text: String) async {
guard let llamaContext else {
return
}
let t_start = DispatchTime.now().uptimeNanoseconds
await llamaContext.completion_init(text: text)
let t_heat_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_heat_end - t_start) / NS_PER_S
messageLog += "\(text)"
Task.detached {
while await !llamaContext.is_done {
let result = await llamaContext.completion_loop()
await MainActor.run {
self.messageLog += "\(result)"
}
}
let t_end = DispatchTime.now().uptimeNanoseconds
let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
await llamaContext.clear()
await MainActor.run {
self.messageLog += """
\n
Done
Heat up took \(t_heat)s
Generated \(tokens_per_second) t/s\n
"""
}
}
}
func bench() async {
guard let llamaContext else {
return
}
messageLog += "\n"
messageLog += "Running benchmark...\n"
messageLog += "Model info: "
messageLog += await llamaContext.model_info() + "\n"
let t_start = DispatchTime.now().uptimeNanoseconds
let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
let t_end = DispatchTime.now().uptimeNanoseconds
let t_heat = Double(t_end - t_start) / NS_PER_S
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
// if more than 5 seconds, then we're probably running on a slow device
if t_heat > 5.0 {
messageLog += "Heat up time is too long, aborting benchmark\n"
return
}
let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
messageLog += "\(result)"
messageLog += "\n"
}
func clear() async {
guard let llamaContext else {
return
}
await llamaContext.clear()
messageLog = ""
}
}

View File

@@ -1,145 +0,0 @@
import SwiftUI
struct ContentView: View {
@StateObject var llamaState = LlamaState()
@State private var multiLineText = ""
@State private var showingHelp = false // To track if Help Sheet should be shown
var body: some View {
NavigationView {
VStack {
ScrollView(.vertical, showsIndicators: true) {
Text(llamaState.messageLog)
.font(.system(size: 12))
.frame(maxWidth: .infinity, alignment: .leading)
.padding()
.onTapGesture {
UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
}
}
TextEditor(text: $multiLineText)
.frame(height: 80)
.padding()
.border(Color.gray, width: 0.5)
HStack {
Button("Send") {
sendText()
}
Button("Bench") {
bench()
}
Button("Clear") {
clear()
}
Button("Copy") {
UIPasteboard.general.string = llamaState.messageLog
}
}
.buttonStyle(.bordered)
.padding()
NavigationLink(destination: DrawerView(llamaState: llamaState)) {
Text("View Models")
}
.padding()
}
.padding()
.navigationBarTitle("Model Settings", displayMode: .inline)
}
}
func sendText() {
Task {
await llamaState.complete(text: multiLineText)
multiLineText = ""
}
}
func bench() {
Task {
await llamaState.bench()
}
}
func clear() {
Task {
await llamaState.clear()
}
}
struct DrawerView: View {
@ObservedObject var llamaState: LlamaState
@State private var showingHelp = false
func delete(at offsets: IndexSet) {
offsets.forEach { offset in
let model = llamaState.downloadedModels[offset]
let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename)
do {
try FileManager.default.removeItem(at: fileURL)
} catch {
print("Error deleting file: \(error)")
}
}
// Remove models from downloadedModels array
llamaState.downloadedModels.remove(atOffsets: offsets)
}
func getDocumentsDirectory() -> URL {
let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
return paths[0]
}
var body: some View {
List {
Section(header: Text("Download Models From Hugging Face")) {
HStack {
InputButton(llamaState: llamaState)
}
}
Section(header: Text("Downloaded Models")) {
ForEach(llamaState.downloadedModels) { model in
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
}
.onDelete(perform: delete)
}
Section(header: Text("Default Models")) {
ForEach(llamaState.undownloadedModels) { model in
DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename)
}
}
}
.listStyle(GroupedListStyle())
.navigationBarTitle("Model Settings", displayMode: .inline).toolbar {
ToolbarItem(placement: .navigationBarTrailing) {
Button("Help") {
showingHelp = true
}
}
}.sheet(isPresented: $showingHelp) { // Sheet for help modal
VStack(alignment: .leading) {
VStack(alignment: .leading) {
Text("1. Make sure the model is in GGUF Format")
.padding()
Text("2. Copy the download link of the quantized model")
.padding()
}
Spacer()
}
}
}
}
}
struct ContentView_Previews: PreviewProvider {
static var previews: some View {
ContentView()
}
}

View File

@@ -1,124 +0,0 @@
import SwiftUI
struct DownloadButton: View {
@ObservedObject private var llamaState: LlamaState
private var modelName: String
private var modelUrl: String
private var filename: String
@State private var status: String
@State private var downloadTask: URLSessionDownloadTask?
@State private var progress = 0.0
@State private var observation: NSKeyValueObservation?
private static func getFileURL(filename: String) -> URL {
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
}
private func checkFileExistenceAndUpdateStatus() {
}
init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
self.llamaState = llamaState
self.modelName = modelName
self.modelUrl = modelUrl
self.filename = filename
let fileURL = DownloadButton.getFileURL(filename: filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
private func download() {
status = "downloading"
print("Downloading model \(modelName) from \(modelUrl)")
guard let url = URL(string: modelUrl) else { return }
let fileURL = DownloadButton.getFileURL(filename: filename)
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
if let error = error {
print("Error: \(error.localizedDescription)")
return
}
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
print("Server error!")
return
}
do {
if let temporaryURL = temporaryURL {
try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
print("Writing to \(filename) completed")
llamaState.cacheCleared = false
let model = Model(name: modelName, url: modelUrl, filename: filename, status: "downloaded")
llamaState.downloadedModels.append(model)
status = "downloaded"
}
} catch let err {
print("Error: \(err.localizedDescription)")
}
}
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
self.progress = progress.fractionCompleted
}
downloadTask?.resume()
}
var body: some View {
VStack {
if status == "download" {
Button(action: download) {
Text("Download " + modelName)
}
} else if status == "downloading" {
Button(action: {
downloadTask?.cancel()
status = "download"
}) {
Text("\(modelName) (Downloading \(Int(progress * 100))%)")
}
} else if status == "downloaded" {
Button(action: {
let fileURL = DownloadButton.getFileURL(filename: filename)
if !FileManager.default.fileExists(atPath: fileURL.path) {
download()
return
}
do {
try llamaState.loadModel(modelUrl: fileURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}
}) {
Text("Load \(modelName)")
}
} else {
Text("Unknown status")
}
}
.onDisappear() {
downloadTask?.cancel()
}
.onChange(of: llamaState.cacheCleared) { newValue in
if newValue {
downloadTask?.cancel()
let fileURL = DownloadButton.getFileURL(filename: filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
}
}
}
// #Preview {
// DownloadButton(
// llamaState: LlamaState(),
// modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
// modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
// filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
// )
// }

View File

@@ -1,131 +0,0 @@
import SwiftUI
struct InputButton: View {
@ObservedObject var llamaState: LlamaState
@State private var inputLink: String = ""
@State private var status: String = "download"
@State private var filename: String = ""
@State private var downloadTask: URLSessionDownloadTask?
@State private var progress = 0.0
@State private var observation: NSKeyValueObservation?
private static func extractModelInfo(from link: String) -> (modelName: String, filename: String)? {
guard let url = URL(string: link),
let lastPathComponent = url.lastPathComponent.components(separatedBy: ".").first,
let modelName = lastPathComponent.components(separatedBy: "-").dropLast().joined(separator: "-").removingPercentEncoding,
let filename = lastPathComponent.removingPercentEncoding else {
return nil
}
return (modelName, filename)
}
private static func getFileURL(filename: String) -> URL {
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
}
private func download() {
guard let extractedInfo = InputButton.extractModelInfo(from: inputLink) else {
// Handle invalid link or extraction failure
return
}
let (modelName, filename) = extractedInfo
self.filename = filename // Set the state variable
status = "downloading"
print("Downloading model \(modelName) from \(inputLink)")
guard let url = URL(string: inputLink) else { return }
let fileURL = InputButton.getFileURL(filename: filename)
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
if let error = error {
print("Error: \(error.localizedDescription)")
return
}
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
print("Server error!")
return
}
do {
if let temporaryURL = temporaryURL {
try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
print("Writing to \(filename) completed")
llamaState.cacheCleared = false
let model = Model(name: modelName, url: self.inputLink, filename: filename, status: "downloaded")
llamaState.downloadedModels.append(model)
status = "downloaded"
}
} catch let err {
print("Error: \(err.localizedDescription)")
}
}
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
self.progress = progress.fractionCompleted
}
downloadTask?.resume()
}
var body: some View {
VStack {
HStack {
TextField("Paste Quantized Download Link", text: $inputLink)
.textFieldStyle(RoundedBorderTextFieldStyle())
Button(action: {
downloadTask?.cancel()
status = "download"
}) {
Text("Cancel")
}
}
if status == "download" {
Button(action: download) {
Text("Download Custom Model")
}
} else if status == "downloading" {
Button(action: {
downloadTask?.cancel()
status = "download"
}) {
Text("Downloading \(Int(progress * 100))%")
}
} else if status == "downloaded" {
Button(action: {
let fileURL = InputButton.getFileURL(filename: self.filename)
if !FileManager.default.fileExists(atPath: fileURL.path) {
download()
return
}
do {
try llamaState.loadModel(modelUrl: fileURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}
}) {
Text("Load Custom Model")
}
} else {
Text("Unknown status")
}
}
.onDisappear() {
downloadTask?.cancel()
}
.onChange(of: llamaState.cacheCleared) { newValue in
if newValue {
downloadTask?.cancel()
let fileURL = InputButton.getFileURL(filename: self.filename)
status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
}
}
}
}

View File

@@ -1,44 +0,0 @@
import SwiftUI
import UniformTypeIdentifiers
struct LoadCustomButton: View {
@ObservedObject private var llamaState: LlamaState
@State private var showFileImporter = false
init(llamaState: LlamaState) {
self.llamaState = llamaState
}
var body: some View {
VStack {
Button(action: {
showFileImporter = true
}) {
Text("Load Custom Model")
}
}
.fileImporter(
isPresented: $showFileImporter,
allowedContentTypes: [UTType(filenameExtension: "gguf", conformingTo: .data)!],
allowsMultipleSelection: false
) { result in
switch result {
case .success(let files):
files.forEach { file in
let gotAccess = file.startAccessingSecurityScopedResource()
if !gotAccess { return }
do {
try llamaState.loadModel(modelUrl: file.absoluteURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}
file.stopAccessingSecurityScopedResource()
}
case .failure(let error):
print(error)
}
}
}
}

View File

@@ -1,10 +0,0 @@
import SwiftUI
@main
struct llama_swiftuiApp: App {
var body: some Scene {
WindowGroup {
ContentView()
}
}
}

View File

@@ -1,783 +0,0 @@
" LLM-based text completion using llama.cpp
"
" requires:
"
" - neovim or vim
" - curl
" - llama.cpp server instance
" - FIM-compatible model
"
" sample config:
"
" - Tab - accept the current suggestion
" - Shift+Tab - accept just the first line of the suggestion
" - Ctrl+F - toggle FIM completion manually
"
" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
"
" start the llama.cpp server with a FIM-compatible model. for example:
"
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
"
" --batch-size [512, model max context]
"
" adjust the batch size to control how much of the provided local context will be used during the inference
" lower values will use smaller part of the context around the cursor, which will result in faster processing
"
" --ubatch-size [64, 2048]
"
" chunks the batch into smaller chunks for faster processing
" depends on the specific hardware. use llama-bench to profile and determine the best size
"
" --cache-reuse (ge:llama_config.n_predict, 1024]
"
" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
" using non-zero value enables context reuse on the server side which dramatically improves the performance at
" large contexts. a value of 256 should be good for all cases
"
" run this once to initialise llama.vim:
"
" :call llama#init()
"
" more info: https://github.com/ggerganov/llama.cpp/pull/9787
"
" colors (adjust to your liking)
highlight llama_hl_hint guifg=#ff772f ctermfg=202
highlight llama_hl_info guifg=#77ff2f ctermfg=119
" general parameters:
"
" endpoint: llama.cpp server endpoint
" n_prefix: number of lines before the cursor location to include in the local prefix
" n_suffix: number of lines after the cursor location to include in the local suffix
" n_predict: max number of tokens to predict
" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported)
" t_max_predict_ms: max alloted time for the prediction
" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
" auto_fim: trigger FIM completion automatically on cursor movement
" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
"
" ring buffer of chunks, accumulated with time upon:
"
" - completion request
" - yank
" - entering a buffer
" - leaving a buffer
" - writing a file
"
" parameters for the ring-buffer with extra context:
"
" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable)
" ring_chunk_size: max size of the chunks (in number of lines)
" note: adjust these numbers so that you don't overrun your context
" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM
" ring_update_ms: how often to process queued chunks in normal mode
"
let s:default_config = {
\ 'endpoint': 'http://127.0.0.1:8012/infill',
\ 'n_prefix': 256,
\ 'n_suffix': 64,
\ 'n_predict': 128,
\ 't_max_prompt_ms': 500,
\ 't_max_predict_ms': 3000,
\ 'show_info': 2,
\ 'auto_fim': v:true,
\ 'max_line_suffix': 8,
\ 'ring_n_chunks': 64,
\ 'ring_chunk_size': 64,
\ 'ring_scope': 1024,
\ 'ring_update_ms': 1000,
\ }
let g:llama_config = get(g:, 'llama_config', s:default_config)
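" example (an illustrative sketch, not the only way to configure the plugin):
" because the line above takes a user-provided g:llama_config dict as-is, an
" override defined in your vimrc before the first :call llama#init() must list
" every key from s:default_config, e.g. to disable auto-triggering and shorten
" predictions:
"
"   let g:llama_config = {
"       \ 'endpoint': 'http://127.0.0.1:8012/infill',
"       \ 'n_prefix': 256,
"       \ 'n_suffix': 64,
"       \ 'n_predict': 64,
"       \ 't_max_prompt_ms': 500,
"       \ 't_max_predict_ms': 1000,
"       \ 'show_info': 2,
"       \ 'auto_fim': v:false,
"       \ 'max_line_suffix': 8,
"       \ 'ring_n_chunks': 64,
"       \ 'ring_chunk_size': 64,
"       \ 'ring_scope': 1024,
"       \ 'ring_update_ms': 1000,
"       \ }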
function! s:get_indent(str)
let l:count = 0
for i in range(len(a:str))
if a:str[i] == "\t"
let l:count += &tabstop - 1
else
break
endif
endfor
return l:count
endfunction
function! s:rand(i0, i1) abort
return a:i0 + rand() % (a:i1 - a:i0 + 1)
endfunction
function! llama#init()
if !executable('curl')
echohl WarningMsg
echo 'llama.vim requires the "curl" command to be available'
echohl None
return
endif
let s:pos_x = 0 " cursor position upon start of completion
let s:pos_y = 0
let s:line_cur = ''
let s:line_cur_prefix = ''
let s:line_cur_suffix = ''
let s:ring_chunks = [] " current set of chunks used as extra context
let s:ring_queued = [] " chunks that are queued to be sent for processing
let s:ring_n_evict = 0
let s:hint_shown = v:false
let s:pos_y_pick = -9999 " last y where we picked a chunk
let s:pos_dx = 0
let s:content = []
let s:can_accept = v:false
let s:timer_fim = -1
let s:t_fim_start = reltime() " used to measure total FIM time
let s:t_last_move = reltime() " last time the cursor moved
let s:current_job = v:null
let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
let s:ghost_text_vim = has('textprop')
if s:ghost_text_vim
let s:hlgroup_hint = 'llama_hl_hint'
let s:hlgroup_info = 'llama_hl_info'
if empty(prop_type_get(s:hlgroup_hint))
call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
endif
if empty(prop_type_get(s:hlgroup_info))
call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
endif
endif
augroup llama
autocmd!
autocmd InsertEnter * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
autocmd InsertLeavePre * call llama#fim_cancel()
autocmd CursorMoved * call s:on_move()
autocmd CursorMovedI * call s:on_move()
autocmd CompleteChanged * call llama#fim_cancel()
if g:llama_config.auto_fim
autocmd CursorMovedI * call llama#fim(v:true)
endif
" gather chunks upon yanking
autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif
" gather chunks upon entering/leaving a buffer
autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
" gather chunk upon saving the file
autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
augroup END
silent! call llama#fim_cancel()
" init background update of the ring buffer
if g:llama_config.ring_n_chunks > 0
call s:ring_update()
endif
endfunction
" compute how similar two chunks of text are
" 0 - no similarity, 1 - high similarity
" TODO: figure out something better
function! s:chunk_sim(c0, c1)
let l:lines0 = len(a:c0)
let l:lines1 = len(a:c1)
let l:common = 0
for l:line0 in a:c0
for l:line1 in a:c1
if l:line0 == l:line1
let l:common += 1
break
endif
endfor
endfor
return 2.0 * l:common / (l:lines0 + l:lines1)
endfunction
" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
"
" no_mod - do not pick chunks from buffers with pending changes
" do_evict - evict chunks that are very similar to the new one
"
function! s:pick_chunk(text, no_mod, do_evict)
" do not pick chunks from buffers with pending changes or buffers that are not files
if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
return
endif
" if the extra context option is disabled - do nothing
if g:llama_config.ring_n_chunks <= 0
return
endif
" don't pick very small chunks
if len(a:text) < 3
return
endif
if len(a:text) + 1 < g:llama_config.ring_chunk_size
let l:chunk = a:text
else
let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])
let l:chunk = a:text[l:l0:l:l1]
endif
let l:chunk_str = join(l:chunk, "\n") . "\n"
" check if this chunk is already added
let l:exist = v:false
for i in range(len(s:ring_chunks))
if s:ring_chunks[i].data == l:chunk
let l:exist = v:true
break
endif
endfor
for i in range(len(s:ring_queued))
if s:ring_queued[i].data == l:chunk
let l:exist = v:true
break
endif
endfor
if l:exist
return
endif
" evict queued chunks that are very similar to the new one
for i in range(len(s:ring_queued) - 1, 0, -1)
if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
if a:do_evict
call remove(s:ring_queued, i)
let s:ring_n_evict += 1
else
return
endif
endif
endfor
" also from s:ring_chunks
for i in range(len(s:ring_chunks) - 1, 0, -1)
if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
if a:do_evict
call remove(s:ring_chunks, i)
let s:ring_n_evict += 1
else
return
endif
endif
endfor
" TODO: become parameter ?
if len(s:ring_queued) == 16
call remove(s:ring_queued, 0)
endif
call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
"let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
endfunction
" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
" called every g:llama_config.ring_update_ms
function! s:ring_update()
call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})
" update only if in normal mode or if the cursor hasn't moved for a while
if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
return
endif
if len(s:ring_queued) == 0
return
endif
" move the first queued chunk to the ring buffer
if len(s:ring_chunks) == g:llama_config.ring_n_chunks
call remove(s:ring_chunks, 0)
endif
call add(s:ring_chunks, remove(s:ring_queued, 0))
"let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
" send asynchronous job with the new extra context so that it is ready for the next FIM
let l:extra_context = []
for l:chunk in s:ring_chunks
call add(l:extra_context, {
\ 'text': l:chunk.str,
\ 'time': l:chunk.time,
\ 'filename': l:chunk.filename
\ })
endfor
" no samplers needed here
let l:request = json_encode({
\ 'input_prefix': "",
\ 'input_suffix': "",
\ 'input_extra': l:extra_context,
\ 'prompt': "",
\ 'n_predict': 1,
\ 'temperature': 0.0,
\ 'stream': v:false,
\ 'samplers': ["temperature"],
\ 'cache_prompt': v:true,
\ 't_max_prompt_ms': 1,
\ 't_max_predict_ms': 1
\ })
let l:curl_command = [
\ "curl",
\ "--silent",
\ "--no-buffer",
\ "--request", "POST",
\ "--url", g:llama_config.endpoint,
\ "--header", "Content-Type: application/json",
\ "--data", l:request
\ ]
" no callbacks because we don't need to process the response
if s:ghost_text_nvim
call jobstart(l:curl_command, {})
elseif s:ghost_text_vim
call job_start(l:curl_command, {})
endif
endfunction
" necessary for 'inoremap <expr>'
function! llama#fim_inline(is_auto) abort
call llama#fim(a:is_auto)
return ''
endfunction
" the main FIM call
" takes local context around the cursor and sends it together with the extra context to the server for completion
function! llama#fim(is_auto) abort
" we already have a suggestion for the current cursor position
if s:hint_shown && !a:is_auto
call llama#fim_cancel()
return
endif
call llama#fim_cancel()
" avoid sending repeated requests too fast
if reltimefloat(reltime(s:t_fim_start)) < 0.6
if s:timer_fim != -1
call timer_stop(s:timer_fim)
let s:timer_fim = -1
endif
let s:t_fim_start = reltime()
let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
return
endif
let s:t_fim_start = reltime()
let s:content = []
let s:can_accept = v:false
let s:pos_x = col('.') - 1
let s:pos_y = line('.')
let l:max_y = line('$')
let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))
let s:line_cur = getline('.')
let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)
if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
return
endif
let l:prefix = ""
\ . join(l:lines_prefix, "\n")
\ . "\n"
let l:prompt = ""
\ . s:line_cur_prefix
let l:suffix = ""
\ . s:line_cur_suffix
\ . "\n"
\ . join(l:lines_suffix, "\n")
\ . "\n"
" prepare the extra context data
let l:extra_context = []
for l:chunk in s:ring_chunks
call add(l:extra_context, {
\ 'text': l:chunk.str,
\ 'time': l:chunk.time,
\ 'filename': l:chunk.filename
\ })
endfor
" the indentation of the current line
let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
let l:request = json_encode({
\ 'input_prefix': l:prefix,
\ 'input_suffix': l:suffix,
\ 'input_extra': l:extra_context,
\ 'prompt': l:prompt,
\ 'n_predict': g:llama_config.n_predict,
\ 'n_indent': l:indent,
\ 'top_k': 40,
\ 'top_p': 0.99,
\ 'stream': v:false,
\ 'samplers': ["top_k", "top_p", "infill"],
\ 'cache_prompt': v:true,
\ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms,
\ 't_max_predict_ms': g:llama_config.t_max_predict_ms
\ })
let l:curl_command = [
\ "curl",
\ "--silent",
\ "--no-buffer",
\ "--request", "POST",
\ "--url", g:llama_config.endpoint,
\ "--header", "Content-Type: application/json",
\ "--data", l:request
\ ]
if s:current_job != v:null
if s:ghost_text_nvim
call jobstop(s:current_job)
elseif s:ghost_text_vim
call job_stop(s:current_job)
endif
endif
" send the request asynchronously
if s:ghost_text_nvim
let s:current_job = jobstart(l:curl_command, {
\ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
\ 'on_exit': function('s:fim_on_exit'),
\ 'stdout_buffered': v:true
\ })
elseif s:ghost_text_vim
let s:current_job = job_start(l:curl_command, {
\ 'out_cb': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
\ 'exit_cb': function('s:fim_on_exit')
\ })
endif
" TODO: per-file location
let l:delta_y = abs(s:pos_y - s:pos_y_pick)
" gather some extra context nearby and process it in the background
" only gather chunks if the cursor has moved a lot
" TODO: something more clever? reranking?
if a:is_auto && l:delta_y > 32
" expand the prefix even further
call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
" pick a suffix chunk
call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)
let s:pos_y_pick = s:pos_y
endif
endfunction
" if first_line == v:true accept only the first line of the response
function! llama#fim_accept(first_line)
" insert the suggestion at the cursor location
if s:can_accept && len(s:content) > 0
call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0])
if len(s:content) > 1
if !a:first_line
call append(s:pos_y, s:content[1:-1])
endif
endif
" move the cursor to the end of the accepted text
if !a:first_line && len(s:content) > 1
call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
else
call cursor(s:pos_y, s:pos_x + len(s:content[0]))
endif
endif
call llama#fim_cancel()
endfunction
function! llama#fim_cancel()
let s:hint_shown = v:false
" clear the virtual text
let l:bufnr = bufnr('%')
if s:ghost_text_nvim
let l:id_vt_fim = nvim_create_namespace('vt_fim')
call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
elseif s:ghost_text_vim
call prop_remove({'type': s:hlgroup_hint, 'all': v:true})
call prop_remove({'type': s:hlgroup_info, 'all': v:true})
endif
" remove the mappings
silent! iunmap <buffer> <Tab>
silent! iunmap <buffer> <S-Tab>
silent! iunmap <buffer> <Esc>
endfunction
function! s:on_move()
let s:t_last_move = reltime()
call llama#fim_cancel()
endfunction
" callback that processes the FIM result from the server and displays the suggestion
function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null)
if s:ghost_text_nvim
let l:raw = join(a:data, "\n")
elseif s:ghost_text_vim
let l:raw = a:data
endif
if len(l:raw) == 0
return
endif
if a:pos_x != col('.') - 1 || a:pos_y != line('.')
return
endif
" show the suggestion only in insert mode
if mode() !=# 'i'
return
endif
let s:pos_x = a:pos_x
let s:pos_y = a:pos_y
let s:can_accept = v:true
let l:has_info = v:false
if s:can_accept && v:shell_error
if !a:is_auto
call add(s:content, "<| curl error: is the server on? |>")
endif
let s:can_accept = v:false
endif
let l:n_prompt = 0
let l:t_prompt_ms = 1.0
let l:s_prompt = 0
let l:n_predict = 0
let l:t_predict_ms = 1.0
let l:s_predict = 0
" get the generated suggestion
if s:can_accept
let l:response = json_decode(l:raw)
for l:part in split(get(l:response, 'content', ''), "\n", 1)
call add(s:content, l:part)
endfor
" remove trailing new lines
while len(s:content) > 0 && s:content[-1] == ""
call remove(s:content, -1)
endwhile
let l:generation_settings = get(l:response, 'generation_settings', {})
let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
let l:n_cached = get(l:response, 'tokens_cached', 0)
let l:truncated = get(l:response, 'truncated', v:false)
" if response.timings is available
if len(get(l:response, 'timings', {})) > 0
let l:has_info = v:true
let l:timings = get(l:response, 'timings', {})
let l:n_prompt = get(l:timings, 'prompt_n', 0)
let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
let l:s_prompt = get(l:timings, 'prompt_per_second', 0)
let l:n_predict = get(l:timings, 'predicted_n', 0)
let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
let l:s_predict = get(l:timings, 'predicted_per_second', 0)
endif
endif
if len(s:content) == 0
call add(s:content, "")
let s:can_accept = v:false
endif
if len(s:content) == 0
return
endif
" NOTE: the following is logic for discarding predictions that repeat existing text
" the code is quite ugly and there is very likely a simpler and more canonical way to implement this
"
" still, I wonder if there is some better way that avoids having to do these special hacks?
" on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
" start generating whatever we have given it via the extra context. but on the other hand, it's not very
" helpful to re-generate the same code that is already there
" truncate the suggestion if the first line is empty
if len(s:content) == 1 && s:content[0] == ""
let s:content = [""]
endif
" ... and the next lines are repeated
if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
let s:content = [""]
endif
" truncate the suggestion if it repeats the suffix
if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
let s:content = [""]
endif
" find the first non-empty line (strip whitespace)
let l:cmp_y = s:pos_y + 1
while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
let l:cmp_y += 1
endwhile
if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
" truncate the suggestion if it repeats the next line
if len(s:content) == 1
let s:content = [""]
endif
" ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
let s:content = [""]
endif
" ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
let s:content = [""]
endif
endif
" keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
"let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
"for i in range(1, len(s:content) - 1)
" if strlen(matchstr(s:content[i], '^\s*')) < l:indent
" let s:content = s:content[:i - 1]
" break
" endif
"endfor
let s:pos_dx = len(s:content[-1])
let s:content[-1] .= s:line_cur_suffix
call llama#fim_cancel()
" display virtual text with the suggestion
let l:bufnr = bufnr('%')
if s:ghost_text_nvim
let l:id_vt_fim = nvim_create_namespace('vt_fim')
endif
" construct the info message
if g:llama_config.show_info > 0 && l:has_info
let l:prefix = ' '
if l:truncated
let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
\ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
\ l:n_cached, l:n_ctx
\ )
else
let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
\ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
\ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
\ l:n_prompt, l:t_prompt_ms, l:s_prompt,
\ l:n_predict, l:t_predict_ms, l:s_predict,
\ 1000.0 * reltimefloat(reltime(s:t_fim_start))
\ )
endif
if g:llama_config.show_info == 1
" display the info in the statusline
let &statusline = l:info
let l:info = ''
endif
endif
" display the suggestion and append the info to the end of the first line
if s:ghost_text_nvim
call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
\ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
\ 'virt_text_win_col': virtcol('.') - 1
\ })
call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
\ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
\ 'virt_text_win_col': virtcol('.')
\ })
elseif s:ghost_text_vim
let l:new_suffix = s:content[0]
if !empty(l:new_suffix)
call prop_add(s:pos_y, s:pos_x + 1, {
\ 'type': s:hlgroup_hint,
\ 'text': l:new_suffix
\ })
endif
for line in s:content[1:]
call prop_add(s:pos_y, 0, {
\ 'type': s:hlgroup_hint,
\ 'text': line,
\ 'text_padding_left': s:get_indent(line),
\ 'text_align': 'below'
\ })
endfor
if !empty(l:info)
call prop_add(s:pos_y, 0, {
\ 'type': s:hlgroup_info,
\ 'text': l:info,
\ 'text_padding_left': col('$'),
\ 'text_wrap': 'truncate'
\ })
endif
endif
" setup accept shortcuts
inoremap <buffer> <Tab> <C-O>:call llama#fim_accept(v:false)<CR>
inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>
let s:hint_shown = v:true
endfunction
function! s:fim_on_exit(job_id, exit_code, event = v:null)
if a:exit_code != 0
echom "Job failed with exit code: " . a:exit_code
endif
let s:current_job = v:null
endfunction

View File

@@ -1,28 +0,0 @@
" Basic plugin example
function! Llm()
let url = "http://127.0.0.1:8080/completion"
" Get the content of the current buffer
let buffer_content = join(getline(1, '$'), "\n")
" Create the JSON payload
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
let json_payload.prompt = buffer_content
" Define the curl command
let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
let response = system(curl_command, json_encode(json_payload))
" Extract the content field from the response
let content = json_decode(response).content
let split_newlines = split(content, '\n', 1)
" Insert the content at the cursor position
call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
endfunction
command! Llm call Llm()
noremap <F2> :Llm<CR>

View File

@@ -1,485 +0,0 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cstdio>
#include <string>
#include <vector>
struct ngram_data {
bool active = false;
llama_seq_id seq_id = -1;
std::vector<int> i_batch;
std::vector<llama_token> tokens;
};
// n-gram container
struct ngram_container {
ngram_container(int n_vocab, int N, int G) {
cnt.resize(n_vocab);
head.resize(n_vocab);
tokens.resize(n_vocab * G * (N - 1));
}
int n_total = 0;
std::vector<int> cnt;
std::vector<int> head;
// [n_vocab][G][N - 1]
// for each token of the vocab, keep a ring-buffer of capacity G of n-grams of size N - 1
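// flat layout: the ring entry g for token t starts at tokens[t*(N - 1)*G + g*(N - 1)]
// and spans N - 1 elements (see the idx computations further below)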
std::vector<llama_token> tokens;
};
int main(int argc, char ** argv) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the target model
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// Tokenize the prompt
std::vector<llama_token> inp;
std::vector<llama_token> all;
inp = common_tokenize(ctx, params.prompt, true, true);
all = inp;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
LOG("\n\n");
for (auto id : inp) {
LOG("%s", common_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
// eval the prompt
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}
const auto t_enc_end = ggml_time_us();
int n_predict = 0;
int n_accept = 0;
int n_past = inp.size();
llama_token id = 0;
// used to determine end of generation
bool has_eos = false;
// for each decoded batch, we have at most W + G + 1 distinct sequences:
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
// target model sampling context
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
// verification n-grams
std::vector<ngram_data> ngrams_cur(G);
// tokens for the past N - 1 Jacobi iterations
std::vector<llama_token> tokens_j_prev(W);
std::vector<std::vector<llama_token>> tokens_j(N - 1);
for (int j = 0; j < N - 1; j++) {
tokens_j[j].resize(W);
for (int i = 0; i < W; i++) {
// there are different ways to init these tokens
if (0) {
// initialize randomly from the prompt tokens
tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
} else {
// initialize with a sequence of increasing numbers
tokens_j[j][i] = 100 + i;
}
}
}
std::vector<llama_seq_id> seq_id_look;
// the input token belongs to all sequences
std::vector<llama_seq_id> seq_id_all(W + G + 1);
for (int i = 0; i < W + G + 1; i++) {
seq_id_all[i] = i;
}
// here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_n_vocab(model), N, G);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
const auto t_dec_start = ggml_time_us();
// sample first token
{
id = common_sampler_sample(smpl, ctx, 0);
common_sampler_accept(smpl, id, true);
{
const std::string token_str = common_token_to_piece(ctx, id);
LOG("%s", token_str.c_str());
fflush(stdout);
}
}
while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
//
// Example for W = 5, N = 4, G = 2:
// (I = input, L = lookahead, V = verification)
//
// Batch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
// T: -2 -2 -2 -2 -1 -1 -1 -1 -1 0 0 0 0 0 0
// Info: I L L L L L L L L L L L L L L V V V V V V
// Pos: 0 1 2 3 4 1 2 3 4 5 2 3 4 5 6 1 2 3 1 2 3 (+ n_past)
// Logits: 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
// ---------------------------------------------------------------------
// Seq: 0
// 1 1 1
// 2 2 2 2
// 3 3 3 3 3
// 4 4 4 4 4 4
// 5 5 5 5 5 5 5
// 6 6 6 6
// 7 7 7 7
// ---------------------------------------------------------------------
// | | | | | | | | | | |
// V V V V V | | | | | |
// j_tokens | | | | | |
// V V V V V V
// id
{
common_batch_clear(batch);
// current token - first token of the first level
common_batch_add(batch, id, n_past, seq_id_all, true);
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
{
const int g_cur = ngrams_observed.cnt[id];
ngrams_cur.resize(g_cur);
for (int g = 0; g < g_cur; g++) {
ngrams_cur[g].active = true;
ngrams_cur[g].tokens.resize(N);
ngrams_cur[g].i_batch.resize(N);
ngrams_cur[g].seq_id = W + 1 + g;
ngrams_cur[g].i_batch[0] = 0;
ngrams_cur[g].tokens [0] = id;
}
for (int j = 0; j < N - 1; j++) {
for (int g = 0; g < g_cur; g++) {
const int idx = id*(N - 1)*G + g*(N - 1);
const llama_token t = ngrams_observed.tokens[idx + j];
ngrams_cur[g].tokens [j + 1] = t;
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
}
}
}
// fill the remaining W - 1 tokens for the first level
for (int i = 1; i < W; i++) {
seq_id_look.resize(W - i);
for (int j = 0; j < W - i; j++) {
seq_id_look[j] = i + j + 1;
}
common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
}
// fill the rest of the levels
for (int j = 1; j < N - 1; j++) {
for (int i = 0; i < W; i++) {
common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
}
}
}
if (llama_decode(ctx, batch) != 0) {
LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
return 1;
}
int seq_id_best = 0;
for (int v = 0; v < N; ++v) {
int i_batch = 0;
// if no active ngrams are left, it means the sampled token does not pass the verification
if (v > 0) {
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
if (ngrams_cur[g].active) {
i_batch = ngrams_cur[g].i_batch[v];
seq_id_best = ngrams_cur[g].seq_id;
++n_accept;
break;
}
}
// no more matches -> create a new batch
if (i_batch == 0) {
break;
}
}
// sample the next token
id = common_sampler_sample(smpl, ctx, i_batch);
common_sampler_accept(smpl, id, true);
// print
{
const std::string token_str = common_token_to_piece(ctx, id);
if (v == 0) {
LOG("%s", token_str.c_str());
} else {
// print light cyan
LOG("\033[0;96m%s\033[0m", token_str.c_str());
}
fflush(stdout);
if (llama_token_is_eog(model, id)) {
has_eos = true;
}
all.push_back(id);
}
++n_predict;
++n_past;
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
break;
}
// verify across active n-grams
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
if (ngrams_cur[g].active) {
if (v == N - 1) {
ngrams_cur[g].active = false;
} else {
if (id != ngrams_cur[g].tokens[v + 1]) {
ngrams_cur[g].active = false;
}
}
}
}
// print known n-grams starting with token id (debug)
if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) {
LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
}
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
LOG(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) {
const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
LOG("%s", token_str.c_str());
}
LOG("\n");
}
}
// update lookahead tokens
{
for (int i = 0; i < W; i++) {
tokens_j_prev[i] = tokens_j[0][i];
}
for (int j = 0; j < N - 2; j++) {
tokens_j[j] = tokens_j[j + 1];
}
if (v == 0) {
// sample from the last level
for (int i = 0; i < W; i++) {
tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
}
} else {
for (int i = 0; i < W; i++) {
// there are different ways to init these tokens
if (0) {
// random init
tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
} else {
// init from the previous level
tokens_j[N - 2][i] = tokens_j[0][i];
}
}
}
}
// update observed ngrams
if (v == 0) {
// the first token of the n-gram is determined by the index in the container so it is not stored
std::vector<llama_token> ngram(N - 1);
// n-gram generation
// ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518
for (int f = 0; f < W; ++f) {
const int ft = tokens_j_prev[f]; // first token of the n-gram
for (int j = 0; j < N - 1; ++j) {
ngram[j] = tokens_j[j][f];
}
// filter-out repeating n-grams
{
bool is_unique = true;
for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) {
const int idx = ft*(N - 1)*G + k*(N - 1);
bool is_match = true;
for (int j = 0; j < N - 1; ++j) {
if (ngrams_observed.tokens[idx + j] != ngram[j]) {
is_match = false;
break;
}
}
if (is_match) {
is_unique = false;
break;
}
}
if (!is_unique) {
continue;
}
}
const int head = ngrams_observed.head[ft];
const int idx = ft*(N - 1)*G + head*(N - 1);
for (int i = 0; i < N - 1; i++) {
ngrams_observed.tokens[idx + i] = ngram[i];
}
ngrams_observed.cnt[ft] = std::min(G, ngrams_observed.cnt[ft] + 1);
ngrams_observed.head[ft] = (head + 1) % G;
ngrams_observed.n_total++;
}
}
}
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
break;
}
// KV cache management
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
if (seq_id_best != 0) {
// if a verification token matched, we keep the best sequence and remove the rest
// this leads to some KV cache fragmentation
llama_kv_cache_seq_keep(ctx, seq_id_best);
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}
}
}
auto t_dec_end = ggml_time_us();
LOG("\n\n");
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("\n");
LOG_INF("W = %2d\n", W);
LOG_INF("N = %2d\n", N);
LOG_INF("G = %2d\n", G);
LOG_INF("\n");
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("\n");
common_perf_print(ctx, smpl);
common_sampler_free(smpl);
llama_kv_cache_view_free(&kvc_view);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
LOG("\n\n");
return 0;
}

View File

@@ -1,45 +0,0 @@
#include "arg.h"
#include "common.h"
#include "ngram-cache.h"
#include "ggml.h"
#include "llama.h"
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
int main(int argc, char ** argv){
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the model
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
GGML_ASSERT(model != nullptr);
// tokenize the prompt
std::vector<llama_token> inp;
inp = common_tokenize(ctx, params.prompt, true, true);
fprintf(stderr, "%s: tokenization done\n", __func__);
common_ngram_cache ngram_cache;
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
return 0;
}

View File

@@ -1,47 +0,0 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
static void print_usage(char* argv0) {
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0);
}
int main(int argc, char ** argv){
if (argc < 3) {
print_usage(argv[0]);
exit(1);
}
std::vector<std::string> args;
args.resize(argc-1);
for (int i = 0; i < argc-1; ++i) {
args[i] = argv[i+1];
if (args[i] == "-h" || args[i] == "--help") {
print_usage(argv[0]);
exit(0);
}
}
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);
for (size_t i = 1; i < args.size()-1; ++i) {
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);
common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
}
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
common_ngram_cache_save(ngram_cache_merged, args.back());
}

View File

@@ -1,160 +0,0 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "ngram-cache.h"
#include "llama.h"
#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
int main(int argc, char ** argv){
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
const int n_draft = params.n_draft;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the model
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// tokenize the prompt
std::vector<llama_token> inp;
inp = common_tokenize(ctx, params.prompt, true, true);
common_ngram_cache ngram_cache_context;
common_ngram_cache ngram_cache_dynamic;
common_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
const int64_t t_start_draft_us = ggml_time_us();
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}
const int n_input = inp.size();
const int n_ctx = llama_n_ctx(ctx);
int n_drafted = 0;
int n_accept = 0;
const int64_t t_start_ms = ggml_time_ms();
// Iterate over input tokens in chunks of size n_ctx.
// Each chunk is treated as if it were a sequential generation but with pre-determined tokens to ensure reproducibility.
for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
std::vector<llama_token> pseudo_output;
pseudo_output.push_back(inp_slice[0]);
while ((int) pseudo_output.size() < n_ctx) {
// Simulate drafting and decoding from draft:
std::vector<llama_token> draft;
draft.push_back(pseudo_output.back());
{
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
n_drafted += draft.size() - 1;
for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
const llama_token ground_truth = inp_slice[pseudo_output.size()];
const llama_token drafted = draft[j];
if (ground_truth != drafted) {
break;
}
++n_accept;
pseudo_output.push_back(ground_truth);
{
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
// After each simulated batch decoding simulate the sampling of a single token:
if ((int) pseudo_output.size() < n_ctx) {
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
{
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
draft.erase(draft.begin());
}
if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
const int64_t t_now_ms = ggml_time_ms();
const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
ngram_cache_context.clear();
}
LOG("\n");
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
LOG("\n\n");
return 0;
}

View File

@@ -1,254 +0,0 @@
#include "arg.h"
#include "ggml.h"
#include "common.h"
#include "ngram-cache.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
int main(int argc, char ** argv){
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
common_init();
// max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the model
common_init_result llama_init = common_init_from_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// tokenize the prompt
std::vector<llama_token> inp;
inp = common_tokenize(ctx, params.prompt, true, true);
common_ngram_cache ngram_cache_context;
common_ngram_cache ngram_cache_dynamic;
common_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
// Fill up context ngram cache with tokens from user input:
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
LOG("\n\n");
for (auto id : inp) {
LOG("%s", common_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
const auto t_enc_end = ggml_time_us();
int n_predict = 0;
int n_drafted = 0;
int n_accept = 0;
int n_past = inp.size();
bool has_eos = false;
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
std::vector<llama_token> draft;
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
const auto t_dec_start = ggml_time_us();
while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
common_kv_cache_dump_view_seqs(kvc_view, 40);
}
// print current draft sequence
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
int i_dft = 0;
while (true) {
// sample from the target model
llama_token id = common_sampler_sample(smpl, ctx, i_dft);
common_sampler_accept(smpl, id, true);
const std::string token_str = common_token_to_piece(ctx, id);
if (!params.use_color) {
LOG("%s", token_str.c_str());
}
if (llama_token_is_eog(model, id)) {
has_eos = true;
}
++n_predict;
// check if the target token matches the draft
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past;
++i_dft;
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
if (params.use_color) {
// color accepted draft token
LOG("\033[34m%s\033[0m", token_str.c_str());
fflush(stdout);
}
continue;
}
if (params.use_color) {
LOG("%s", token_str.c_str());
}
fflush(stdout);
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
draft.clear();
draft.push_back(id);
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
break;
}
if ((params.n_predict > 0 && n_predict > params.n_predict) || has_eos) {
break;
}
// KV cache management
// clean the cache of draft tokens that weren't accepted
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
common_batch_clear(batch_tgt);
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
// Draft already contains a single token sampled from the model:
GGML_ASSERT(draft.size() == 1);
GGML_ASSERT(draft[0] == inp.back());
const int64_t t_start_draft_us = ggml_time_us();
common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
for (size_t i = 1; i < draft.size(); ++i) {
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
}
t_draft_us += ggml_time_us() - t_start_draft_us;
n_drafted += draft.size() - 1;
llama_decode(ctx, batch_tgt);
++n_past;
draft.erase(draft.begin());
}
auto t_dec_end = ggml_time_us();
// Update dynamic ngram cache with context ngram cache and save it to disk:
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG("\n\n");
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_INF("\n");
LOG_INF("n_draft = %d\n", n_draft);
LOG_INF("n_predict = %d\n", n_predict);
LOG_INF("n_drafted = %d\n", n_drafted);
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_INF("n_accept = %d\n", n_accept);
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
LOG_INF("\ntarget:\n\n");
common_perf_print(ctx, smpl);
common_sampler_free(smpl);
llama_batch_free(batch_tgt);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
LOG("\n\n");
return 0;
}

View File

@@ -1,50 +0,0 @@
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*.gguf
*.log
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
build*/
out/
tmp/

Some files were not shown because too many files have changed in this diff