Compare commits

...

10 Commits

Author SHA1 Message Date
eric8607242
ee1b497c98 llama : support more diverse tokenizers? (#2420)
* supporting more diverse tokenizers

* Update llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-07-28 21:10:05 +03:00
Georgi Gerganov
d73b8d48b4 examples : fix whitespace 2023-07-28 21:05:08 +03:00
nhamanasu
34ae1caf7f examples : server chat mode with llama2 (#2400)
* add: server chat mode with llama2

* fix: remove the unnecessary last \n
2023-07-28 21:02:10 +03:00
Weird Constructor
d91f3f0c55 readme : fix the description of the Tail free sampling (TFS) method (#2431) 2023-07-28 11:44:43 +03:00
Rand Xie
65cdf34bdc llama : use n_embd_gqa instead of n_embd to handle llama-2 70B (#2433) 2023-07-28 11:42:53 +03:00
niansa/tuxifan
edcc7ae7d2 Obtaining LLaMA 2 instructions (#2308)
* Obtaining LLaMA 2 instructions

* Removed sharing warning for LLaMA 2

* Linked TheBloke's GGML repos

* Add LLaMA 2 to list of supported models

* Added LLaMA 2 usage instructions

* Added links to LLaMA 2 70B models
2023-07-28 03:14:11 +02:00
mj-shifu
7c529cede6 convert.py : Update to support 70B HF format model files (#2427)
* convert.py : fix llama 2 70b conversion from Huggingface
2023-07-27 14:39:17 -06:00
Georgi Gerganov
1a941869cb metal : disable graph concurrency optimization due to bug (#2413) 2023-07-27 11:00:54 +03:00
slaren
b5472ea0ad ggml : fix assert in ggml_set_unary_op (#2410) 2023-07-26 23:57:23 +02:00
Cebtenzzre
6df1f5940f make : build with -Wmissing-prototypes (#2394) 2023-07-26 21:00:04 +03:00
10 changed files with 217 additions and 59 deletions

View File

@@ -357,6 +357,7 @@ if (LLAMA_ALL_WARNINGS)
-Wshadow
-Wstrict-prototypes
-Wpointer-arith
-Wmissing-prototypes
)
set(cxx_flags
-Wall

View File

@@ -63,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE
endif
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-Wmissing-prototypes
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
# OS specific

View File

@@ -77,6 +77,7 @@ as the main playground for developing new features for the [ggml](https://github
**Supported models:**
- [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
@@ -650,6 +651,19 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
### Obtaining and using the Facebook LLaMA 2 model
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
- Specify `-eps 1e-5` for best generation quality
- Specify `-gqa 8` for 70B models to work
### Verifying the model files
Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.

96
convert.py Executable file → Normal file
View File

@@ -133,7 +133,7 @@ TENSORS_SET = set(TENSORS_LIST)
def find_n_mult(n_ff: int, n_embd: int) -> int:
# hardcoded magic range
for n_mult in range(256, 1, -1):
for n_mult in range(8192, 1, -1):
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
@@ -141,11 +141,12 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
@dataclass
class Params:
n_vocab: int
n_embd: int
n_mult: int
n_head: int
n_layer: int
n_vocab: int
n_embd: int
n_mult: int
n_head: int
n_layer: int
n_kv_head: Optional[int] # This parameter is only used for Llama 2
@staticmethod
def guessed(model: 'LazyModel') -> 'Params':
@@ -167,11 +168,12 @@ class Params:
n_head=n_embd // 128 # guessed
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = 256,
n_head = n_head,
n_layer = n_layer,
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = 256,
n_head = n_head,
n_layer = n_layer,
n_kv_head = None,
)
@staticmethod
@@ -183,15 +185,17 @@ class Params:
n_head = config["num_attention_heads"];
n_layer = config["num_hidden_layers"];
n_ff = config["intermediate_size"];
n_kv_head = config.get("num_key_value_heads")
n_mult = find_n_mult(n_ff, n_embd);
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_kv_head = n_kv_head,
)
# LLaMA v2 70B params.json
@@ -200,21 +204,22 @@ class Params:
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path))
n_vocab = config["vocab_size"];
n_embd = config["dim"];
n_head = config["n_heads"];
n_layer = config["n_layers"];
n_mult = config["multiple_of"];
n_vocab = config["vocab_size"];
n_embd = config["dim"];
n_head = config["n_heads"];
n_layer = config["n_layers"];
n_mult = config["multiple_of"];
if n_vocab == -1:
n_vocab = model["tok_embeddings.weight"].shape[0]
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
n_kv_head = None,
)
@staticmethod
@@ -317,10 +322,12 @@ class GGMLVocab:
Vocab = Union[SentencePieceVocab, GGMLVocab]
def permute(weights: NDArray, n_head: int) -> NDArray:
def permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
.swapaxes(1, 2)
.reshape(weights.shape))
def dequantize_q4(qvalues_pack32: NDArray, scales: NDArray, addends: Optional[NDArray], g_idx: Optional[NDArray]) -> NDArray:
@@ -368,7 +375,7 @@ class Tensor(metaclass=ABCMeta):
@abstractmethod
def astype(self, data_type: DataType) -> 'Tensor': ...
@abstractmethod
def permute(self, n_head: int) -> 'Tensor': ...
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'Tensor': ...
@abstractmethod
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
@abstractmethod
@@ -406,8 +413,8 @@ class UnquantizedTensor(Tensor):
r = self.ndarray.shape[0] // 3
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
def permute(self, n_head: int) -> 'UnquantizedTensor':
return UnquantizedTensor(permute(self.ndarray, n_head))
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'UnquantizedTensor':
return UnquantizedTensor(permute(self.ndarray, n_head, n_kv_head))
def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
@@ -455,26 +462,27 @@ class GGMLQuantizedTensor(Tensor):
def to_ggml(self) -> 'GGMLQuantizedTensor':
return self
def permute(self, n_head: int) -> 'GGMLQuantizedTensor':
return GGMLQuantizedTensor(permute(self.ndarray, n_head), self.shape, self.data_type)
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> 'GGMLQuantizedTensor':
return GGMLQuantizedTensor(permute(self.ndarray, n_head, n_kv_head), self.shape, self.data_type)
GGMLCompatibleTensor = Union[UnquantizedTensor, GGMLQuantizedTensor]
class DeferredPermutedTensor(Tensor):
def __init__(self, base: Tensor, n_head: int) -> None:
def __init__(self, base: Tensor, n_head: int, n_kv_head: Optional[int] = None) -> None:
self.base = base
self.n_head = n_head
self.n_kv_head = n_kv_head
self.data_type = self.base.data_type
def astype(self, data_type: DataType) -> Tensor:
return self.base.astype(data_type).permute(self.n_head)
return self.base.astype(data_type).permute(self.n_head, self.n_kv_head)
def to_ggml(self) -> GGMLCompatibleTensor:
return self.base.to_ggml().permute(self.n_head)
return self.base.to_ggml().permute(self.n_head, self.n_kv_head)
def permute(self, n_head: int) -> Tensor:
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
raise Exception("shouldn't permute twice")
@@ -566,8 +574,8 @@ class GPTQForLLaMaQuantizedTensor(Tensor):
ret.data_type = QuantizedDataType(groupsize=new_groupsize, have_addends=True, have_g_idx=False)
return ret
def permute(self, n_head: int) -> Tensor:
return DeferredPermutedTensor(self, n_head)
def permute(self, n_head: int, n_kv_head: Optional[int] = None) -> Tensor:
return DeferredPermutedTensor(self, n_head, n_kv_head)
def to_ggml(self) -> GGMLQuantizedTensor:
# The output format looks like this:
@@ -698,10 +706,10 @@ def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
return ModelPlus(model, paths, format, vocab)
def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_kv_head: Optional[int] = None) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute(n_head)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
return lazy_tensor.load().permute(n_head, n_kv_head)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_kv_head}) ' + lazy_tensor.description)
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
def load() -> Tensor:
@@ -726,7 +734,7 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_kv_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)

View File

@@ -202,9 +202,9 @@ Example usage: `--top-p 0.95`
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS.
Example usage: `--tfs 2.0`
Example usage: `--tfs 0.95`
### Locally Typical Sampling

View File

@@ -26,6 +26,7 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;

View File

@@ -0,0 +1,26 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
# Specify the model you want to use here:
MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-12}"
# Note: you can also override the generation options by specifying them on the command line:
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./server $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--rope-freq-scale 1.0 \
"$@"
# I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line.
# -ngl 1 \

View File

@@ -0,0 +1,109 @@
#!/bin/bash
API_URL="${API_URL:-http://127.0.0.1:8080}"
CHAT=(
"Hello, Assistant."
"Hello. How may I help you today?"
)
INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
trim() {
shopt -s extglob
set -- "${1##+([[:space:]])}"
printf "%s" "${1%%+([[:space:]])}"
}
trim_trailing() {
shopt -s extglob
printf "%s" "${1%%+([[:space:]])}"
}
format_prompt() {
if [[ "${#CHAT[@]}" -eq 0 ]]; then
echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
else
LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
fi
}
tokenize() {
curl \
--silent \
--request POST \
--url "${API_URL}/tokenize" \
--header "Content-Type: application/json" \
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
| jq '.tokens[]'
}
N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
chat_completion() {
PROMPT="$(trim_trailing "$(format_prompt "$1")")"
DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
prompt: .,
temperature: 0.2,
top_k: 40,
top_p: 0.9,
n_keep: $n_keep,
n_predict: 1024,
stop: ["[INST]"],
stream: true
}')"
# Create a temporary file to hold the Python output
TEMPFILE=$(mktemp)
exec 3< <(curl \
--silent \
--no-buffer \
--request POST \
--url "${API_URL}/completion" \
--header "Content-Type: application/json" \
--data-raw "${DATA}")
python -c "
import json
import sys
answer = ''
while True:
line = sys.stdin.readline()
if not line:
break
if line.startswith('data: '):
json_content = line[6:].strip()
content = json.loads(json_content)['content']
sys.stdout.write(content)
sys.stdout.flush()
answer += content
answer = answer.rstrip('\n')
# Write the answer to the temporary file
with open('$TEMPFILE', 'w') as f:
f.write(answer)
" <&3
exec 3<&-
# Read the answer from the temporary file
ANSWER=$(cat $TEMPFILE)
# Clean up the temporary file
rm $TEMPFILE
printf "\n"
CHAT+=("$1" "$(trim "$ANSWER")")
}
while true; do
echo -en "\033[0;32m" # Green color
read -r -e -p "> " QUESTION
echo -en "\033[0m" # Reset color
chat_completion "${QUESTION}"
done

7
ggml.c
View File

@@ -4982,11 +4982,6 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
}
static void ggml_set_unary_op(struct ggml_tensor * tensor, enum ggml_unary_op op) {
GGML_ASSERT(tensor->op = GGML_OP_UNARY);
ggml_set_op_params_i32(tensor, 0, (int32_t) op);
}
const char * ggml_get_name(const struct ggml_tensor * tensor) {
return tensor->name;
}
@@ -7226,7 +7221,7 @@ static struct ggml_tensor * ggml_unary_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
ggml_set_unary_op(result, op);
ggml_set_op_params_i32(result, 0, (int32_t) op);
result->op = GGML_OP_UNARY;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;

View File

@@ -1722,9 +1722,10 @@ static bool llama_eval_internal(
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
}
// TODO: disabled until #2413 is resolved
//if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
// ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
//}
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
ggml_metal_graph_compute(lctx.ctx_metal, gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
@@ -1923,7 +1924,9 @@ struct llama_tokenizer {
if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < (int) symbol.n; ++j) {
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
// NOTE: old version, before #2420 - not sure what are the implications of this
//llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
output.push_back(token_id);
}
} else {
@@ -3662,7 +3665,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd;
const int n_embd = hparams.n_embd_gqa();
const int n_ctx = hparams.n_ctx;
const size_t kv_size = kv_self.buf.size;
@@ -3765,7 +3768,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
const int n_embd = hparams.n_embd;
const int n_embd = hparams.n_embd_gqa();
const int n_ctx = hparams.n_ctx;
size_t kv_size;