Compare commits

...

5 Commits

Author SHA1 Message Date
Xuan Son Nguyen
de396e8790 nits (2) 2026-06-08 14:05:24 +02:00
Xuan Son Nguyen
2afe34a58c nits 2026-06-08 14:02:29 +02:00
Xuan Son Nguyen
93e126aa08 wire up input_video, accept raw base64 2026-06-08 13:59:14 +02:00
Xuan Son Nguyen
7705270eec Merge branch 'master' into xsn/server_input_file_schema 2026-06-08 13:43:32 +02:00
Xuan Son Nguyen
1458c8e581 server: refactor/generalize input file schema 2026-06-08 13:07:26 +02:00
2 changed files with 49 additions and 30 deletions

View File

@@ -1232,8 +1232,6 @@ print(completion.choices[0].text)
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more.
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported.
@@ -1252,9 +1250,18 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
`parallel_tool_calls` : Whether to enable parallel/multiple tool calls (only supported on some models, verification is based on jinja template).
For multimodal input:
- Content type `image_url` and `input_audio` are the same as OAI schema
- Content type `input_video` is an extension from OAI schema. For now, it only accepts base64 input
For multimodal input (typed content, `messages[i].content[j]`):
- If `type == "image_url"`:
- `image_url.url` can be a remote URL, base64 (raw or URI-encoded via `data:image/...;base64`) or path to local file
- Accepts formats supported by `stb_image` (jpeg, png, tga, bmp, gif, ...)
- If `type == "input_audio"`:
- Either `input_audio.data` or `input_audio.url` can be specified, can be a remote URL, raw base64 or path to local file
- Accepts formats supported by `miniaudio` (mp3, wav, flac)
- `input_audio.format` will be ignored, the file format will be determined automatically
- If `type == "input_video"`:
- Either `input_video.data` or `input_video.url` can be specified, can be a remote URL, raw base64 or path to local file
- Accepts formats supported by `ffmpeg`
- Note: for local file, make sure to set `--media-path`. File path must be prefixed by `file://`
*Examples:*

View File

@@ -839,12 +839,21 @@ json oaicompat_completion_params_parse(const json & body) {
return llama_params;
}
// media_path always end with '/', see arg.cpp
// url can be
// - http(s):// for remote files
// - file:// for local files (only allowed if media_path is set)
// - data: for base64 encoded data with uri scheme (e.g. data:image/png;base64,...)
// - raw base64 encoded data
static void handle_media(
std::vector<raw_buffer> & out_files,
json & media_obj,
const std::string & media_path) {
std::string url = json_value(media_obj, "url", std::string());
const std::string & url,
const std::string & media_path,
bool accept_base64_uri) {
if (!media_path.empty()) {
// should already be enforced by arg.cpp, but checking just in case
GGML_ASSERT(string_ends_with(media_path, "/"));
}
if (string_starts_with(url, "http")) {
// download remote image
// TODO @ngxson : maybe make these params configurable
@@ -880,20 +889,28 @@ static void handle_media(
data.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
out_files.push_back(data);
} else {
} else if (accept_base64_uri && string_starts_with(url, "data:")) {
// try to decode base64 image
std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
if (parts.size() != 2) {
throw std::runtime_error("Invalid url value");
throw std::runtime_error("Invalid uri-encoded base64 value");
} else if (!string_starts_with(parts[0], "data:image/")) {
throw std::runtime_error("Invalid url format: " + parts[0]);
throw std::runtime_error("Invalid uri format: " + parts[0]);
} else if (!string_ends_with(parts[0], "base64")) {
throw std::runtime_error("url must be base64 encoded");
throw std::runtime_error("uri must be base64 encoded");
} else {
auto base64_data = parts[1];
auto decoded_data = base64_decode(base64_data);
out_files.push_back(decoded_data);
}
} else {
// try as raw base64 string
auto decoded_data = base64_decode(url);
if (decoded_data.empty()) {
throw std::runtime_error("Invalid base64 value");
}
out_files.push_back(decoded_data);
}
}
@@ -979,14 +996,15 @@ json oaicompat_chat_params_parse(
}
for (auto & p : content) {
std::string type = json_value(p, "type", std::string());
std::string type = json_value(p, "type", std::string());
if (type == "image_url") {
if (!opt.allow_image) {
throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json image_url = json_value(p, "image_url", json::object());
handle_media(out_files, image_url, opt.media_path);
std::string url = json_value(image_url, "url", std::string());
handle_media(out_files, url, opt.media_path, true);
p["type"] = "media_marker";
p["text"] = get_media_marker();
@@ -997,17 +1015,11 @@ json oaicompat_chat_params_parse(
throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json input_audio = json_value(p, "input_audio", json::object());
std::string data = json_value(input_audio, "data", std::string());
std::string format = json_value(input_audio, "format", std::string());
// while we also support flac, we don't allow it here so we matches the OAI spec
if (format != "wav" && format != "mp3") {
throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'");
}
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
// TODO: add audio_url support by reusing handle_media()
// note: don't need to validate "format", it's redundant
json input_audio = json_value(p, "input_audio", json::object());
std::string url = json_value(input_audio, "data",
json_value(input_audio, "url", std::string()));
handle_media(out_files, url, opt.media_path, false);
p["type"] = "media_marker";
p["text"] = get_media_marker();
@@ -1018,10 +1030,10 @@ json oaicompat_chat_params_parse(
throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json input_video = json_value(p, "input_video", json::object());
std::string data = json_value(input_video, "data", std::string());
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
json input_video = json_value(p, "input_video", json::object());
std::string url = json_value(input_video, "data",
json_value(input_video, "url", std::string()));
handle_media(out_files, url, opt.media_path, false);
p["type"] = "media_marker";
p["text"] = get_media_marker();