Compare commits

...

4 Commits

Author SHA1 Message Date
Alessandro de Oliveira Faria (A.K.A.CABELO)
a09a00e502 vendor : update cpp-httplib to 0.43.3 (#22686) 2026-05-05 09:04:57 +02:00
Georgi Gerganov
2bacb1eb77 server : validate --tools CLI argument against known tool names (#22538)
Previously, unknown tool names passed via --tools were silently ignored.
Now the server validates each tool name at startup and exits with an
error if an unrecognized tool is specified, listing the available tools.

Assisted-by: llama.cpp:local pi
2026-05-05 06:35:27 +03:00
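A minimal stand-alone sketch of the pattern this message describes (hypothetical names; the real implementation is the server_tools::setup hunk further down in this diff):

#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

// Hypothetical reduction of the new startup check: reject any requested tool
// name that is not in the known set, listing the valid options in the error.
static void validate_tools(const std::vector<std::string> & requested,
                           const std::unordered_set<std::string> & known) {
    for (const auto & name : requested) {
        if (name == "all") {
            continue; // the wildcard is always accepted
        }
        if (known.count(name) == 0) {
            std::string avail;
            for (const auto & k : known) {
                avail += (avail.empty() ? "" : ", ") + k;
            }
            throw std::runtime_error("unknown tool \"" + name + "\". available tools: " + avail);
        }
    }
}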
Georgi Gerganov
d6e7b033a4 llama : add option to save memory in device buffers (#22679)
* llama : add option to save memory in device buffers

* tests : extend llama-save-load-state
2026-05-05 06:35:07 +03:00
Sigbjørn Skjæret
fa595462ca graph : handle non-contiguous Q/K/V in mul_mat_aux (#22630)
* qkv may not always be contiguous

* cont : make the cont conditional

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-05-05 06:34:44 +03:00
17 changed files with 458 additions and 65 deletions

View File

@@ -252,14 +252,14 @@ struct common_speculative_state_draft : public common_speculative_state {
size_t create_checkpoint(int n_tokens_prompt) {
int slot_id = 0;
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
ckpt.n_tokens = n_tokens_prompt;
ckpt.data.resize(checkpoint_size);
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
@@ -272,7 +272,7 @@ struct common_speculative_state_draft : public common_speculative_state {
size_t restore_checkpoint() {
int slot_id = 0;
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
if (n != ckpt.size()) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());

View File

@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
std::string result0;
std::string result1;
std::string result2;
std::string result3;
// init
auto llama_init = common_init_from_params(params);
@@ -213,11 +214,83 @@ int main(int argc, char ** argv) {
n_past += 1;
}
// test on-device state save/load
auto params_ctx4 = common_context_params_to_llama(params);
params_ctx4.n_seq_max = 2;
llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
printf("\nsingle seq run: %s", params.prompt.c_str());
// load state (rng, logits, embedding and kv_cache) from file
n_token_count_out = 0;
if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
fprintf(stderr, "\n%s : failed to load state\n", __func__);
return 1;
}
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
// restore state (last tokens)
n_past = n_token_count_out;
if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
return 1;
}
++n_past;
// save seq 0 and load into seq 1
{
// save kv of seq 0
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
if (ncopy != seq_store.size()) {
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
return 1;
}
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
// erase whole kv
llama_memory_clear(llama_get_memory(ctx4), true);
fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1
const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
if (nset != seq_store.size()) {
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
return 1;
}
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
}
// fourth run
for (auto i = 0; i < params.n_predict; i++) {
auto next_token = llama_sampler_sample(smpl4, ctx4, -1);
auto next_token_str = common_token_to_piece(ctx4, next_token);
printf("%s", next_token_str.c_str());
result3 += next_token_str;
common_batch_clear(batch);
common_batch_add(batch, next_token, n_past, {1}, true);
if (llama_decode(ctx4, batch)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_batch_free(batch);
return 1;
}
n_past += 1;
}
printf("\n");
llama_sampler_free(smpl);
llama_sampler_free(smpl2);
llama_sampler_free(smpl3);
llama_sampler_free(smpl4);
llama_batch_free(batch);
@@ -226,12 +299,18 @@ int main(int argc, char ** argv) {
llama_free(ctx2);
llama_free(ctx3);
llama_free(ctx4);
if (result0 != result2) {
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
return 1;
}
if (result0 != result3) {
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
return 1;
}
fprintf(stderr, "\n%s : success\n", __func__);
return 0;

View File

@@ -282,6 +282,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf);
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void ggml_metal_buffer_set_tensor (ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_metal_buffer_get_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool ggml_metal_buffer_cpy_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * src, struct ggml_tensor * dst);
void ggml_metal_buffer_clear (ggml_metal_buffer_t buf, uint8_t value);
// finds the Metal buffer that contains the tensor data on the GPU device

View File

@@ -1,6 +1,7 @@
#import "ggml-metal-device.h"
#import "ggml-impl.h"
#import "ggml-backend-impl.h"
#include <Foundation/Foundation.h>
@@ -1737,6 +1738,47 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
}
}
bool ggml_metal_buffer_cpy_tensor(ggml_metal_buffer_t buf_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_metal_buffer_t buf_src = (ggml_metal_buffer_t)src->buffer->context;
const size_t size = ggml_nbytes(src);
// if both buffers are shared, we can use memcpy directly
if (buf_dst->is_shared && buf_src->is_shared) {
memcpy(dst->data, src->data, size);
return true;
}
// for private buffers, we need to use Metal blit commands
@autoreleasepool {
struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf_src, src);
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf_dst, dst);
if (bid_src.metal == nil || bid_dst.metal == nil) {
return false;
}
id<MTLCommandBuffer> cmd_buf = [buf_dst->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:bid_src.metal
sourceOffset:bid_src.offs
toBuffer:bid_dst.metal
destinationOffset:bid_dst.offs
size:size];
[encoder endEncoding];
}
[cmd_buf commit];
[cmd_buf waitUntilCompleted];
}
return true;
}
void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
if (buf->is_shared) {
memset(buf->all_data, value, buf->all_size);

View File

@@ -17,6 +17,9 @@
// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
static int g_devices = 1;
// forward declaration
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer);
////////////////////////////////////////////////////////////////////////////////
// backend interface
////////////////////////////////////////////////////////////////////////////////
@@ -68,11 +71,11 @@ static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t bu
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
GGML_UNUSED(buffer);
GGML_UNUSED(src);
GGML_UNUSED(dst);
if (!ggml_backend_buffer_is_metal(src->buffer)) {
return false;
}
return false;
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
}
static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -144,11 +147,11 @@ static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t b
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
GGML_UNUSED(buffer);
GGML_UNUSED(src);
GGML_UNUSED(dst);
if (!ggml_backend_buffer_is_metal(src->buffer)) {
return false;
}
return false;
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
}
static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) {

View File

@@ -864,6 +864,9 @@ extern "C" {
// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load)
#define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
typedef uint32_t llama_state_seq_flags;
LLAMA_API size_t llama_state_seq_get_size_ext(
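A hedged usage sketch for the new flag, assuming a valid context and an existing sequence 0; error handling is reduced to size checks:

#include <vector>
#include "llama.h"

// Snapshot sequence 0 while keeping the tensor data in device buffers: the
// host vector only receives metadata, so it stays small and the save/load
// avoids the device->host->device round trip.
static bool snapshot_roundtrip(llama_context * ctx) {
    const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;

    std::vector<uint8_t> state(llama_state_seq_get_size_ext(ctx, 0, flags));
    if (llama_state_seq_get_data_ext(ctx, state.data(), state.size(), 0, flags) != state.size()) {
        return false;
    }

    // ... decode tokens that should later be rolled back ...

    return llama_state_seq_set_data_ext(ctx, state.data(), state.size(), 0, flags) == state.size();
}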

View File

@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "refs/tags/v0.43.2"
HTTPLIB_VERSION = "refs/tags/v0.43.3"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

View File

@@ -2230,13 +2230,17 @@ llm_graph_cb llama_context::graph_get_cb() const {
class llama_io_write_dummy : public llama_io_write_i {
public:
llama_io_write_dummy() = default;
llama_io_write_dummy(bool skip_tensors) : skip_tensors(skip_tensors) {}
void write(const void * /* src */, size_t size) override {
size_written += size;
}
void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
void write_tensor(ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
if (skip_tensors) {
return;
}
size_written += size;
}
@@ -2245,34 +2249,21 @@ public:
}
private:
const bool skip_tensors;
size_t size_written = 0;
};
class llama_io_write_buffer : public llama_io_write_i {
class llama_io_write_host : public llama_io_write_i {
public:
llama_io_write_buffer(
llama_io_write_host(
uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
~llama_io_write_buffer() {
#if 1
~llama_io_write_host() {
// TODO: add backend support to batch tensor_get? or some other way to speed this up
for (const auto & info : winfos) {
ggml_backend_tensor_get(info.tensor, info.ptr, info.offset, info.size);
for (const auto & winfo : winfos) {
ggml_backend_tensor_get(winfo.tensor, winfo.ptr, winfo.offset, winfo.size);
}
#else
// flush the writes asynchronously
// this helps on Macs, but on other devices - it does not. just an example
std::vector<std::future<void>> futures;
futures.reserve(winfos.size());
for (const auto & info : winfos) {
futures.push_back(std::async(std::launch::async, [info]() {
ggml_backend_tensor_get(info.tensor, info.ptr, info.offset, info.size);
}));
}
for (auto & f : futures) {
f.wait();
}
#endif
}
void write(const void * src, size_t size) override {
@@ -2285,7 +2276,7 @@ public:
buf_size -= size;
}
void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override {
void write_tensor(ggml_tensor * tensor, size_t offset, size_t size) override {
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
@@ -2308,7 +2299,7 @@ private:
size_t size_written = 0;
struct write_info {
const ggml_tensor * tensor;
ggml_tensor * tensor;
uint8_t * ptr;
size_t size;
size_t offset;
@@ -2316,14 +2307,14 @@ private:
std::vector<write_info> winfos;
};
class llama_io_read_buffer : public llama_io_read_i {
class llama_io_read_host : public llama_io_read_i {
public:
llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
llama_io_read_host(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
~llama_io_read_buffer() {
~llama_io_read_host() {
// flush the reads
for (const auto & info : rinfos) {
ggml_backend_tensor_set(info.tensor, info.ptr, info.offset, info.size);
for (const auto & rinfo : rinfos) {
ggml_backend_tensor_set(rinfo.tensor, rinfo.ptr, rinfo.offset, rinfo.size);
}
}
@@ -2377,7 +2368,7 @@ public:
size_written += size;
}
void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override {
void write_tensor(ggml_tensor * tensor, size_t offset, size_t size) override {
temp_buffer.resize(size);
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
write(temp_buffer.data(), temp_buffer.size());
@@ -2418,8 +2409,162 @@ private:
std::vector<uint8_t> temp_buffer;
};
class llama_io_write_device : public llama_io_write_i {
public:
llama_io_write_device(uint8_t * p, size_t len, llama_memory_buffers & mbufs) : ptr(p), buf_size(len), mbufs(mbufs) {
}
~llama_io_write_device() {
llama_memory_buffers mbufs_new;
for (const auto & winfo : winfos) {
auto * buft = ggml_backend_buffer_get_type(winfo.tensor->buffer);
mbufs_new[buft].n_tensors++;
mbufs_new[buft].total_size += winfo.size;
}
for (auto & [buft, mbuf] : mbufs_new) {
ggml_init_params params = {
/*.mem_size =*/ 2*mbuf.n_tensors*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
mbuf.ctx.reset(ggml_init(params));
mbuf.org.reserve(mbuf.n_tensors);
mbuf.cpy.reserve(mbuf.n_tensors);
}
for (const auto & winfo : winfos) {
auto * buft = ggml_backend_buffer_get_type(winfo.tensor->buffer);
const int64_t n = winfo.size/ggml_element_size(winfo.tensor);
auto & mbuf = mbufs_new[buft];
mbuf.org.push_back(ggml_view_1d (mbuf.ctx.get(), winfo.tensor, n, winfo.offset));
mbuf.cpy.push_back(ggml_new_tensor_1d(mbuf.ctx.get(), winfo.tensor->type, n));
}
for (auto & [buft, mbuf] : mbufs_new) {
auto & mbuf_cur = mbufs[buft];
if (!mbuf_cur.buf || mbuf_cur.org.size() != mbuf.org.size() || mbuf_cur.total_size != mbuf.total_size) {
mbuf_cur = std::move(mbuf);
mbuf_cur.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(mbuf_cur.ctx.get(), buft));
LLAMA_LOG_INFO("%s: allocated '%s' buffer %.3f MiB\n", __func__, ggml_backend_buft_name(buft), mbuf.total_size/1024.0/1024.0);
}
for (size_t i = 0; i < mbuf_cur.org.size(); ++i) {
ggml_backend_tensor_copy(mbuf_cur.org[i], mbuf_cur.cpy[i]);
}
}
}
void write(const void * src, size_t size) override {
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
memcpy(ptr, src, size);
ptr += size;
size_written += size;
buf_size -= size;
}
void write_tensor(ggml_tensor * tensor, size_t offset, size_t size) override {
// defer the write; it is flushed in the destructor
winfos.push_back({tensor, ptr, size, offset});
}
size_t n_bytes() override {
return size_written;
}
private:
uint8_t * ptr;
size_t buf_size = 0;
size_t size_written = 0;
struct write_info {
ggml_tensor * tensor;
uint8_t * ptr;
size_t size;
size_t offset;
};
std::vector<write_info> winfos;
llama_memory_buffers & mbufs;
};
class llama_io_read_device : public llama_io_read_i {
public:
llama_io_read_device(const uint8_t * p, size_t len, const llama_memory_buffers & mbufs) : ptr(p), buf_size(len), mbufs(mbufs) {
}
~llama_io_read_device() {
llama_memory_buffers mbufs_new;
for (const auto & rinfo : rinfos) {
auto * buft = ggml_backend_buffer_get_type(rinfo.tensor->buffer);
mbufs_new[buft].n_tensors++;
mbufs_new[buft].total_size += rinfo.size;
}
for (auto & [buft, mbuf] : mbufs_new) {
const auto & mbuf_cur = mbufs.at(buft);
if (!mbuf_cur.buf || mbuf_cur.n_tensors != mbuf.n_tensors || mbuf_cur.total_size != mbuf.total_size) {
GGML_ABORT("%s: memory buffer mismatch\n", __func__);
}
for (size_t i = 0; i < mbuf_cur.org.size(); ++i) {
ggml_backend_tensor_copy(mbuf_cur.cpy[i], mbuf_cur.org[i]);
}
}
}
void read(void * dst, size_t size) override {
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
memcpy(dst, ptr, size);
ptr += size;
size_read += size;
buf_size -= size;
}
void read_tensor(ggml_tensor * tensor, size_t offset, size_t size) override {
// defer the read; it is flushed in the destructor
rinfos.push_back({tensor, ptr, size, offset});
}
size_t n_bytes() override {
return size_read;
}
private:
const uint8_t * ptr;
size_t buf_size = 0;
size_t size_read = 0;
struct read_info {
ggml_tensor * tensor;
const uint8_t * ptr;
size_t size;
size_t offset;
};
std::vector<read_info> rinfos;
const llama_memory_buffers & mbufs;
};
size_t llama_context::state_get_size() {
llama_io_write_dummy io;
llama_io_write_dummy io(false);
try {
return state_write_data(io);
} catch (const std::exception & err) {
@@ -2429,7 +2574,7 @@ size_t llama_context::state_get_size() {
}
size_t llama_context::state_get_data(uint8_t * dst, size_t size) {
llama_io_write_buffer io(dst, size);
llama_io_write_host io(dst, size);
try {
return state_write_data(io);
} catch (const std::exception & err) {
@@ -2439,7 +2584,7 @@ size_t llama_context::state_get_data(uint8_t * dst, size_t size) {
}
size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
llama_io_read_buffer io(src, size);
llama_io_read_host io(src, size);
try {
return state_read_data(io);
} catch (const std::exception & err) {
@@ -2448,9 +2593,14 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
}
}
static constexpr uint32_t io_magic = 0xaf143cd8;
size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
llama_io_write_dummy io;
llama_io_write_dummy io(flags & LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
try {
io.write(&io_magic, sizeof(io_magic));
io.write(&seq_id, sizeof(seq_id));
return state_seq_write_data(io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
@@ -2459,9 +2609,18 @@ size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_fl
}
size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
llama_io_write_buffer io(dst, size);
std::unique_ptr<llama_io_write_i> io;
if (flags & LLAMA_STATE_SEQ_FLAGS_ON_DEVICE) {
io = std::make_unique<llama_io_write_device>(dst, size, mem_storage[seq_id]);
} else {
io = std::make_unique<llama_io_write_host>(dst, size);
}
try {
return state_seq_write_data(io, seq_id, flags);
io->write(&io_magic, sizeof(io_magic));
io->write(&seq_id, sizeof(seq_id));
return state_seq_write_data(*io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
return 0;
@@ -2469,9 +2628,43 @@ size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, siz
}
size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
llama_io_read_buffer io(src, size);
std::unique_ptr<llama_io_read_i> io;
if (flags & LLAMA_STATE_SEQ_FLAGS_ON_DEVICE) {
// create a temporary io to read the magic and the src seq_id
io = std::make_unique<llama_io_read_host>(src, size);
uint32_t magic_read;
io->read(&magic_read, sizeof(magic_read));
if (io_magic != magic_read) {
throw std::runtime_error("wrong sequence state magic");
}
llama_seq_id seq_id_read;
io->read(&seq_id_read, sizeof(seq_id_read));
GGML_ASSERT(mem_storage.find(seq_id_read) != mem_storage.end());
io = std::make_unique<llama_io_read_device>(src, size, mem_storage[seq_id_read]);
} else {
io = std::make_unique<llama_io_read_host>(src, size);
}
try {
return state_seq_read_data(io, seq_id, flags);
uint32_t magic_read;
io->read(&magic_read, sizeof(magic_read));
if (io_magic != magic_read) {
throw std::runtime_error("wrong sequence state magic");
}
const bool need_seq_match = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
llama_seq_id seq_id_read;
io->read(&seq_id_read, sizeof(seq_id_read));
if (need_seq_match && seq_id != seq_id_read) {
throw std::runtime_error("wrong sequence id");
}
return state_seq_read_data(*io, seq_id, flags);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
return 0;
@@ -3462,7 +3655,6 @@ size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t s
return ctx->state_seq_get_data(seq_id, dst, size, flags);
}
size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
ctx->synchronize();

View File

@@ -23,6 +23,21 @@ class llama_io_write_i;
struct llama_memory_i;
struct llama_memory_context_i;
// stores a copy of the memory in device buffers; used for fast state save/load
struct llama_memory_buffer {
int n_tensors = 0;
size_t total_size = 0;
ggml_backend_buffer_ptr buf;
ggml_context_ptr ctx;
std::vector<ggml_tensor *> org;
std::vector<ggml_tensor *> cpy;
};
using llama_memory_buffers = std::map<ggml_backend_buffer_type_t, llama_memory_buffer>;
struct llama_context {
// init scheduler and compute buffers, reserve worst-case graphs
llama_context(
@@ -128,6 +143,7 @@ struct llama_context {
size_t state_set_data(const uint8_t * src, size_t size);
size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
@@ -328,6 +344,9 @@ private:
// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
// keep copies of the per-sequence memory on the device
std::map<llama_seq_id, llama_memory_buffers> mem_storage;
bool has_evaluated_once = false;
// env: LLAMA_GRAPH_REUSE_DISABLE

View File

@@ -65,7 +65,11 @@ static ggml_tensor * ggml_mul_mat_aux(
ggml_tensor * res;
res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
if (!ggml_is_contiguous(cur)) {
res = ggml_cont_2d (ctx, cur, n, ggml_nelements(cur)/n);
} else {
res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
}
res = ggml_mul_mat (ctx, rot, res);
ggml_mul_mat_set_hint(res, GGML_HINT_SRC0_IS_HADAMARD);
res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
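The fix reduces to one rule: ggml_reshape_2d requires a contiguous source, so a copy is materialized only when actually needed. A hedged restatement as a helper (hypothetical name):

// Produce a 2D [n, nelements/n] view of cur, copying only when cur is not
// contiguous; a contiguous tensor gets a zero-copy reshape instead.
static ggml_tensor * as_2d(ggml_context * ctx, ggml_tensor * cur, int64_t n) {
    return ggml_is_contiguous(cur)
        ? ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n)
        : ggml_cont_2d   (ctx, cur, n, ggml_nelements(cur)/n);
}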

View File

@@ -12,7 +12,7 @@ public:
virtual ~llama_io_write_i() = default;
virtual void write(const void * src, size_t size) = 0;
virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0;
virtual void write_tensor(ggml_tensor * tensor, size_t offset, size_t size) = 0;
// bytes written so far
virtual size_t n_bytes() = 0;

View File

@@ -784,7 +784,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
const uint32_t n_layer = hparams.n_layer;
io.write(&s_trans, sizeof(s_trans));
io.write(&n_layer, sizeof(n_layer));
io.write(&n_layer, sizeof(n_layer));
// Iterate and write all the R tensors first, each row is a cell
// Get whole range at a time

View File

@@ -36,7 +36,7 @@ using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
static void server_prompt_checkpoint_update(server_prompt_checkpoint & ckpt, llama_context * ctx, int id, int64_t n_tokens, llama_pos pos_min = -1, llama_pos pos_max = -1) {
static void server_prompt_checkpoint_update(server_prompt_checkpoint & ckpt, llama_context * ctx, int id, int64_t n_tokens, bool on_device, llama_pos pos_min = -1, llama_pos pos_max = -1) {
if (pos_min == -1) {
pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), id);
}
@@ -44,14 +44,19 @@ static void server_prompt_checkpoint_update(server_prompt_checkpoint & ckpt, lla
pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), id);
}
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
auto flags = LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY;
if (on_device) {
flags |= LLAMA_STATE_SEQ_FLAGS_ON_DEVICE;
}
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, id, flags);
ckpt.pos_min = pos_min;
ckpt.pos_max = pos_max;
ckpt.n_tokens = n_tokens;
ckpt.data.resize(checkpoint_size);
const size_t n = llama_state_seq_get_data_ext(ctx, ckpt.data.data(), checkpoint_size, id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
const size_t n = llama_state_seq_get_data_ext(ctx, ckpt.data.data(), checkpoint_size, id, flags);
if (n != checkpoint_size) {
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
}
@@ -362,7 +367,7 @@ struct server_slot {
//const int64_t t_start = ggml_time_us();
server_prompt_checkpoint_update(spec_ckpt, ctx, this->id, n_tokens);
server_prompt_checkpoint_update(spec_ckpt, ctx, this->id, n_tokens, true);
//const int64_t t_total = ggml_time_us() - t_start;
//printf("checkpoint total: %f ms\n", t_total / 1000.0);
@@ -1838,7 +1843,7 @@ private:
}
auto & cur = slot.prompt.checkpoints.emplace_back();
server_prompt_checkpoint_update(cur, ctx, slot.id, slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max);
server_prompt_checkpoint_update(cur, ctx, slot.id, slot.prompt.n_tokens() - n_tokens_cur, false, pos_min, pos_max);
SLT_WRN(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
@@ -3003,7 +3008,7 @@ private:
SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n",
ckpt.pos_min, ckpt.pos_max, ckpt.size());
const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
const size_t n = llama_state_seq_set_data_ext(slot.ctx, ckpt.data.data(), ckpt.size(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
if (n != ckpt.size()) {
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt.size(), n);

View File

@@ -10,6 +10,7 @@
#include <atomic>
#include <cstring>
#include <climits>
#include <algorithm>
namespace fs = std::filesystem;
@@ -744,6 +745,24 @@ void server_tools::setup(const std::vector<std::string> & enabled_tools) {
std::unordered_set<std::string> enabled_set(enabled_tools.begin(), enabled_tools.end());
auto all_tools = build_tools();
// collect all known tool names for validation
std::vector<std::string> known_names;
known_names.reserve(all_tools.size());
for (const auto & t : all_tools) {
known_names.push_back(t->name);
}
// validate that every requested tool is known
for (const auto & name : enabled_tools) {
if (name == "all") continue;
if (std::find(known_names.begin(), known_names.end(), name) == known_names.end()) {
throw std::runtime_error(string_format(
"unknown tool \"%s\". available tools: %s",
name.c_str(),
string_join(known_names, ", ").c_str()));
}
}
tools.clear();
for (auto & t : all_tools) {
if (enabled_set.count(t->name) > 0 || enabled_set.count("all") > 0) {

View File

@@ -215,7 +215,12 @@ int main(int argc, char ** argv) {
}
// EXPERIMENTAL built-in tools
if (!params.server_tools.empty()) {
tools.setup(params.server_tools);
try {
tools.setup(params.server_tools);
} catch (const std::exception & e) {
LOG_ERR("%s: tools setup failed: %s\n", __func__, e.what());
return 1;
}
SRV_WRN("%s", "-----------------\n");
SRV_WRN("%s", "Built-in tools are enabled, do not expose server to untrusted environments\n");
SRV_WRN("%s", "This feature is EXPERIMENTAL and may be changed in the future\n");

View File

@@ -2506,6 +2506,10 @@ void get_remote_ip_and_port(socket_t sock, std::string &ip, int &port) {
}
}
// Recursive form retained so operator""_t below can compute hashes for
// switch-case labels at compile time (C++11 constexpr forbids loops). Do not
// call from runtime paths with arbitrary-length inputs — use str2tag()
// instead, which is iterative and stack-safe.
constexpr unsigned int str2tag_core(const char *s, size_t l,
unsigned int h) {
return (l == 0)
@@ -2519,7 +2523,16 @@ constexpr unsigned int str2tag_core(const char *s, size_t l,
}
unsigned int str2tag(const std::string &s) {
return str2tag_core(s.data(), s.size(), 0);
// Iterative form of str2tag_core: the recursive constexpr version is kept
// for compile-time UDL evaluation of short string literals, but at runtime
// we may receive arbitrarily long inputs (e.g. fuzzed Content-Type) that
// would blow the stack with one frame per character.
unsigned int h = 0;
for (auto c : s) {
h = (((std::numeric_limits<unsigned int>::max)() >> 6) & h * 33) ^
static_cast<unsigned char>(c);
}
return h;
}
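For intuition, a stand-alone restatement (not httplib's code verbatim) showing that the two forms compute the same hash, with the recursive one still usable in constant expressions:

#include <cassert>
#include <cstddef>
#include <limits>
#include <string>

constexpr unsigned int tag_rec(const char * s, std::size_t l, unsigned int h) {
    return l == 0 ? h
                  : tag_rec(s + 1, l - 1,
                            (((std::numeric_limits<unsigned int>::max)() >> 6) & h * 33) ^
                                static_cast<unsigned char>(*s));
}

unsigned int tag_iter(const std::string & s) {
    unsigned int h = 0;
    for (auto c : s) {
        h = (((std::numeric_limits<unsigned int>::max)() >> 6) & h * 33) ^
            static_cast<unsigned char>(c);
    }
    return h;
}

int main() {
    constexpr unsigned int t = tag_rec("text/html", 9, 0); // compile time, e.g. a case label
    assert(tag_iter("text/html") == t);                    // same value at runtime
    return 0;
}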
namespace udl {
@@ -9777,7 +9790,15 @@ bool ClientImpl::process_request(Stream &strm, Request &req,
output_error_log(error, &req);
return false;
}
res.body.reserve(static_cast<size_t>(len));
// Cap the reservation by payload_max_length_ to avoid OOM when a
// hostile or malformed server sends an enormous Content-Length.
// The actual body read below is bounded by payload_max_length_,
// so reserving more than that is never useful.
auto reserve_len = static_cast<size_t>(len);
if (payload_max_length_ > 0 && reserve_len > payload_max_length_) {
reserve_len = payload_max_length_;
}
res.body.reserve(reserve_len);
}
}

View File

@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.43.2"
#define CPPHTTPLIB_VERSION_NUM "0x002b02"
#define CPPHTTPLIB_VERSION "0.43.3"
#define CPPHTTPLIB_VERSION_NUM "0x002b03"
#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00