mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-07-02 20:23:40 +02:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fdb1db877c | |||
| 4fc4ec5541 | |||
| a6647b1a32 |
+10
-8
@@ -496,13 +496,15 @@ void common_models_handler_apply(common_models_handler & handler, common_params
|
||||
}
|
||||
|
||||
// handle hf_plan tasks
|
||||
auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
|
||||
auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files,
|
||||
const hf_cache::hf_file & primary,
|
||||
common_params_model & model) {
|
||||
for (size_t i = 0; i < model_files.size(); ++i) {
|
||||
auto & model_file = model_files[i];
|
||||
bool is_first = (i == 0);
|
||||
tasks.emplace_back(model_file, opts, [&, is_first]() {
|
||||
if (is_first) {
|
||||
// only use first part as model path
|
||||
bool is_primary = (model_file.path == primary.path);
|
||||
tasks.emplace_back(model_file, opts, [&, is_primary]() {
|
||||
if (is_primary) {
|
||||
// the primary file is the first split (00001-of), use it as model path
|
||||
model.path = hf_cache::finalize_file(model_file);
|
||||
} else {
|
||||
hf_cache::finalize_file(model_file);
|
||||
@@ -511,7 +513,7 @@ void common_models_handler_apply(common_models_handler & handler, common_params
|
||||
}
|
||||
};
|
||||
if (!plan.model_files.empty()) {
|
||||
add_tasks(plan.model_files, params.model);
|
||||
add_tasks(plan.model_files, plan.primary, params.model);
|
||||
}
|
||||
if (!plan.mmproj.local_path.empty()) {
|
||||
tasks.emplace_back(plan.mmproj, opts, [&]() {
|
||||
@@ -539,12 +541,12 @@ void common_models_handler_apply(common_models_handler & handler, common_params
|
||||
|
||||
// handle plan_spec (e.g. --spec-draft-hf)
|
||||
if (!plan_spec.model_files.empty()) {
|
||||
add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
|
||||
add_tasks(plan_spec.model_files, plan_spec.primary, params.speculative.draft.mparams);
|
||||
}
|
||||
|
||||
// handle vocoder plan (e.g. --hf-repo-v)
|
||||
if (!plan_voc.model_files.empty()) {
|
||||
add_tasks(plan_voc.model_files, params.vocoder.model);
|
||||
add_tasks(plan_voc.model_files, plan_voc.primary, params.vocoder.model);
|
||||
}
|
||||
|
||||
// run all tasks in parallel
|
||||
|
||||
+51
-39
@@ -1,16 +1,26 @@
|
||||
# llama.cpp for OpenCL
|
||||
|
||||
- [Background](#background)
|
||||
- [OS](#os)
|
||||
- [Hardware](#hardware)
|
||||
- [DataType Supports](#datatype-supports)
|
||||
- [Model Preparation](#model-preparation)
|
||||
- [CMake Options](#cmake-options)
|
||||
- [Android](#android)
|
||||
- [Windows 11 Arm64](#windows-11-arm64)
|
||||
- [Linux](#Linux)
|
||||
- [Known Issue](#known-issues)
|
||||
- [TODO](#todo)
|
||||
- [llama.cpp for OpenCL](#llamacpp-for-opencl)
|
||||
- [Background](#background)
|
||||
- [Llama.cpp + OpenCL](#llamacpp--opencl)
|
||||
- [OS](#os)
|
||||
- [Hardware](#hardware)
|
||||
- [Adreno GPU](#adreno-gpu)
|
||||
- [DataType Supports](#datatype-supports)
|
||||
- [Model Preparation](#model-preparation)
|
||||
- [Binary Kernel Library](#binary-kernel-library)
|
||||
- [CMake Options](#cmake-options)
|
||||
- [Android](#android)
|
||||
- [I. Setup Environment](#i-setup-environment)
|
||||
- [II. Build llama.cpp](#ii-build-llamacpp)
|
||||
- [Windows 11 Arm64](#windows-11-arm64)
|
||||
- [I. Setup Environment](#i-setup-environment-1)
|
||||
- [II. Build llama.cpp](#ii-build-llamacpp-1)
|
||||
- [Linux](#linux)
|
||||
- [I. Setup Environment](#i-setup-environment-2)
|
||||
- [II. Build llama.cpp](#ii-build-llamacpp-2)
|
||||
- [Known Issues](#known-issues)
|
||||
- [TODO](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
@@ -34,11 +44,13 @@ The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adren
|
||||
|
||||
**Verified devices**
|
||||
|
||||
| Adreno GPU | Status |
|
||||
|:------------------------------------:|:-------:|
|
||||
| Adreno 750 (Snapdragon 8 Gen 3) | Support |
|
||||
| Adreno 830 (Snapdragon 8 Elite) | Support |
|
||||
| Adreno X85 (Snapdragon X Elite) | Support |
|
||||
| Adreno GPU | Status |
|
||||
|:-------------------------------------:|:-------:|
|
||||
| Adreno 750 (Snapdragon 8 Gen 3) | Support |
|
||||
| Adreno 830 (Snapdragon 8 Elite) | Support |
|
||||
| Adreno 840 (Snapdragon 8 Elite Gen 5) | Support |
|
||||
| Adreno X1-85 (Snapdragon X Elite) | Support |
|
||||
| Adreno X2-90 (Snapdragon X2 Elite) | Support |
|
||||
|
||||
> A6x GPUs with a recent driver and compiler are supported; they are usually found in IoT platforms.
|
||||
However, A6x GPUs in phones are likely not supported due to the outdated driver and compiler.
|
||||
@@ -47,42 +59,43 @@ However, A6x GPUs in phones are likely not supported due to the outdated driver
|
||||
|
||||
| DataType | Status |
|
||||
|:----------------------:|:--------------------------:|
|
||||
| Q1_0 | Support |
|
||||
| Q4_0 | Support |
|
||||
| Q6_K | Support, but not optimized |
|
||||
| Q4_1 | Support |
|
||||
| Q5_0 | Support |
|
||||
| Q5_1 | Support |
|
||||
| Q8_0 | Support |
|
||||
| Q4_K | Support |
|
||||
| Q5_K | Support |
|
||||
| Q6_K | Support |
|
||||
| MXFP4 | Support |
|
||||
| IQ4_NL | Support |
|
||||
|
||||
## Model Preparation
|
||||
|
||||
You can refer to the general [llama-quantize tool](/tools/quantize/README.md) for steps to convert a model in Hugging Face safetensor format to GGUF with quantization.
|
||||
Since common quantizations are supported now, it is recommanded to download GGUF models directly from Huggingface.
|
||||
|
||||
Currently we support `Q4_0` quantization and have optimized for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize` (i.e., make all weights in `Q4_0`). For example,
|
||||
## Binary Kernel Library
|
||||
|
||||
```sh
|
||||
./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
|
||||
```
|
||||
A prebuilt binary kernel library has been introduced for Adreno GPUs.
|
||||
It currently targets X2 GPUs (X2-90, X2-85 and X2-45) found in Snapdragon X2 SoC.
|
||||
The library currently contains kernels for MUL_MAT_ID with Q4_0, Q4_1, Q4_K, MXFP4.
|
||||
The library must be manually downloaded from https://softwarecenter.qualcomm.com/catalog/item/Adreno_Kernel_Library_GGML.
|
||||
|
||||
Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.
|
||||
To allow using the kernel library, add `-DGGML_OPENCL_USE_ADRENO_BIN_KERNELS=ON` when configuring with CMake.
|
||||
Then, extract `adreno-opencl-kernels.dll` from the zip file downloaded from the above URL and put it alongside the executables.
|
||||
If kernels compatible with the current GPU are found in the library, they will be loaded and used.
|
||||
|
||||
### `MXFP4` MoE Models
|
||||
|
||||
OpenAI gpt-oss models are MoE models in `MXFP4`. The quantized model will be in `MXFP4_MOE`, a mixture of `MXFP4` and `Q8_0`.
|
||||
For this quantization, there is no need to specify `--pure`.
|
||||
For gpt-oss-20b model, you can directly [download](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) the quantized GGUF file in `MXFP4_MOE` from Hugging Face.
|
||||
|
||||
Although it is possible to quantize gpt-oss-20b model in pure `Q4_0` (all weights in `Q4_0`), it is not recommended since `MXFP4` has been optimized for MoE while `Q4_0` is not. In addition, accuracy should degrade with such pure `Q4_0` quantization.
|
||||
Hence, using the default `MXFP4_MOE` quantization (see the link above) is recommended for this model.
|
||||
|
||||
> Note that the `Q4_0` model found [here](https://huggingface.co/unsloth/gpt-oss-20b-GGUF/blob/main/gpt-oss-20b-Q4_0.gguf) is a mixture of `Q4_0`, `Q8_0` and `MXFP4` and gives better performance than `MXFP4_MOE` quantization.
|
||||
|
||||
## CMake Options
|
||||
|
||||
The OpenCL backend has the following CMake options that control the behavior of the backend.
|
||||
|
||||
| CMake options | Default value | Description |
|
||||
|:---------------------------------:|:--------------:|:------------------------------------------|
|
||||
| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embed OpenCL kernels into the executable. |
|
||||
| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Use kernels optimized for Adreno. |
|
||||
| CMake options | Default value | Description |
|
||||
|:------------------------------------:|:--------------:|:------------------------------------------|
|
||||
| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embed OpenCL kernels into the executable. |
|
||||
| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Use kernels optimized for Adreno. |
|
||||
| `GGML_OPENCL_USE_ADRENO_BIN_KERNELS` | `OFF` | Allow using binary kernel lib for Adreno. |
|
||||
|
||||
## Android
|
||||
|
||||
@@ -277,6 +290,5 @@ ninja
|
||||
|
||||
## TODO
|
||||
|
||||
- Optimization for Q6_K
|
||||
- Support and optimization for Q4_K
|
||||
- Improve flash attention
|
||||
- Improve OpenCL C kernels performance
|
||||
|
||||
@@ -31,6 +31,11 @@ if (GGML_OPENCL_EMBED_KERNELS)
|
||||
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
|
||||
endif ()
|
||||
|
||||
if (GGML_OPENCL_USE_ADRENO_BIN_KERNELS)
|
||||
message(STATUS "OpenCL will use precompiled binary kernels for Adreno (improved performance on some platforms)")
|
||||
add_compile_definitions(GGML_OPENCL_USE_ADRENO_BIN_KERNELS)
|
||||
endif ()
|
||||
|
||||
function(ggml_opencl_add_kernel KNAME)
|
||||
set(KERN_HDR ${CMAKE_CURRENT_BINARY_DIR}/autogenerated/${KNAME}.cl.h)
|
||||
set(KERN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernels/${KNAME}.cl)
|
||||
|
||||
@@ -13,6 +13,22 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_BIN_KERNELS
|
||||
#include "libdl.h"
|
||||
#ifdef _WIN32
|
||||
#define KERNEL_LIB_NAME "adreno-opencl-kernels.dll"
|
||||
#else
|
||||
#define KERNEL_LIB_NAME "libadreno-opencl-kernels.so"
|
||||
#endif // _WIN32
|
||||
#endif // GGML_OPENCL_USE_ADRENO_BIN_KERNELS
|
||||
|
||||
typedef const void * (*get_adreno_bin_kernel_func_t)(
|
||||
const char * name,
|
||||
const char * gpu_name,
|
||||
const char * compiler_ver,
|
||||
size_t * out_size
|
||||
);
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include <inttypes.h>
|
||||
@@ -476,6 +492,8 @@ struct ggml_backend_opencl_context {
|
||||
|
||||
bool adreno_has_large_buffer;
|
||||
bool adreno_use_large_buffer;
|
||||
bool adreno_use_bin_kernels;
|
||||
get_adreno_bin_kernel_func_t get_adreno_bin_kernel_func = nullptr;
|
||||
ggml_cl_compiler_version adreno_cl_compiler_version;
|
||||
|
||||
std::string kernel_compile_opts; // cached for lazy-compiled kernels.
|
||||
@@ -718,15 +736,15 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_gated_delta_net_f32[4][2][2] = {};
|
||||
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns_bin;
|
||||
cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns_bin;
|
||||
cl_kernel kernel_gemv_moe_q5_0_f32_ns, kernel_gemm_moe_q5_0_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q5_1_f32_ns, kernel_gemm_moe_q5_1_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q4_k_f32_ns, kernel_gemm_moe_q4_k_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q4_k_f32_ns, kernel_gemm_moe_q4_k_f32_ns, kernel_gemm_moe_q4_k_f32_ns_bin;
|
||||
cl_kernel kernel_gemv_moe_q5_k_f32_ns, kernel_gemm_moe_q5_k_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q6_k_f32_ns, kernel_gemm_moe_q6_k_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns_bin;
|
||||
cl_kernel kernel_moe_reorder_b;
|
||||
cl_kernel kernel_moe_histogram, kernel_moe_scan, kernel_moe_fill, kernel_moe_scatter;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
@@ -870,6 +888,20 @@ struct ggml_backend_opencl_context {
|
||||
#endif
|
||||
}
|
||||
|
||||
const void * get_adreno_bin_kernel(const std::string &kernel_name, size_t *bin_size) const {
|
||||
if (!get_adreno_bin_kernel_func) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t sz;
|
||||
const void * kernel_bin = get_adreno_bin_kernel_func(
|
||||
kernel_name.c_str(), device_name.c_str(), driver_version.c_str(), &sz);
|
||||
if (bin_size) {
|
||||
*bin_size = sz;
|
||||
}
|
||||
return kernel_bin;
|
||||
}
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Transpose kernels
|
||||
cl_program program_transpose;
|
||||
@@ -891,7 +923,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_gemv_noshuffle_q4_0_f32_32000_1_4096;
|
||||
cl_kernel kernel_gemv_noshuffle_q4_1_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q4_1_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q8_0_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q8_0_f32, kernel_gemm_noshuffle_q8_0_f32_bin;
|
||||
cl_kernel kernel_gemv_noshuffle_q8_0_f32;
|
||||
cl_kernel kernel_gemm_noshuffle_q1_0_f32;
|
||||
cl_kernel kernel_gemv_noshuffle_q1_0_f32;
|
||||
@@ -988,6 +1020,32 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
||||
return build_program_from_source_ex(ctx, dev, program_buffer, compile_opts, /*fatal=*/true);
|
||||
}
|
||||
|
||||
static cl_program build_program_from_binary(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts, size_t bin_size = 0) {
|
||||
cl_program p;
|
||||
char *program_log;
|
||||
size_t log_size;
|
||||
int err;
|
||||
|
||||
p = clCreateProgramWithBinary(ctx, 1, &dev, &bin_size, (const unsigned char**)&program_buffer, NULL, &err);
|
||||
if(err < 0) {
|
||||
GGML_LOG_ERROR("OpenCL error creating program from binary");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
|
||||
if(err < 0) {
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||
program_log = (char*) malloc(log_size + 1);
|
||||
program_log[log_size] = '\0';
|
||||
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
|
||||
GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log);
|
||||
free(program_log);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static void load_cl_kernels_argsort(ggml_backend_opencl_context *backend_ctx) {
|
||||
// compiler options for general kernels
|
||||
auto opencl_c_std =
|
||||
@@ -1014,6 +1072,17 @@ static void load_cl_kernels_argsort(ggml_backend_opencl_context *backend_ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
static bool use_adreno_bin_kernels(ggml_backend_opencl_context * backend_ctx) {
|
||||
#ifndef GGML_OPENCL_USE_ADRENO_BIN_KERNELS
|
||||
return false;
|
||||
#else
|
||||
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
|
||||
return false;
|
||||
}
|
||||
return backend_ctx->adreno_use_bin_kernels;
|
||||
#endif // GGML_OPENCL_USE_ADRENO_BIN_KERNELS
|
||||
}
|
||||
|
||||
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
if (backend_ctx->kernels_loaded) {
|
||||
return;
|
||||
@@ -3323,6 +3392,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_noshuffle_q8_0_f32_bin
|
||||
{
|
||||
size_t bin_size = 0;
|
||||
backend_ctx->kernel_gemm_noshuffle_q8_0_f32_bin = nullptr;
|
||||
|
||||
if (use_adreno_bin_kernels(backend_ctx)) {
|
||||
const char * kernel_bin = (const char *)backend_ctx->get_adreno_bin_kernel("gemm_noshuffle_q8_0_f32_ila", &bin_size);
|
||||
if (kernel_bin && bin_size > 0) {
|
||||
cl_program prog =
|
||||
build_program_from_binary(backend_ctx->context, backend_ctx->device, kernel_bin, compile_opts, bin_size);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q8_0_f32_bin = clCreateKernel(prog, "kernel_gemm_noshuffle_q8_0_f32_ila", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gemv_noshuffle_general_q8_0_f32
|
||||
{
|
||||
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
||||
@@ -3424,6 +3511,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_moe_q4_1_f32_ns_bin
|
||||
{
|
||||
size_t bin_size = 0;
|
||||
backend_ctx->kernel_gemm_moe_q4_1_f32_ns_bin = nullptr;
|
||||
|
||||
if (use_adreno_bin_kernels(backend_ctx)) {
|
||||
const char * kernel_bin = (const char *)backend_ctx->get_adreno_bin_kernel("gemm_moe_q4_1_f32_ns_ila", &bin_size);
|
||||
if (kernel_bin && bin_size > 0) {
|
||||
cl_program prog =
|
||||
build_program_from_binary(backend_ctx->context, backend_ctx->device, kernel_bin, CL_moe_compile_opts, bin_size);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_q4_1_f32_ns_bin = clCreateKernel(prog, "kernel_gemm_moe_q4_1_f32_ns_ila", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gemv_moe_mxfp4_f32
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3490,6 +3595,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_moe_q4_0_f32_ns_bin
|
||||
{
|
||||
size_t bin_size = 0;
|
||||
backend_ctx->kernel_gemm_moe_q4_0_f32_ns_bin = nullptr;
|
||||
|
||||
if (use_adreno_bin_kernels(backend_ctx)) {
|
||||
const char * kernel_bin = (const char *)backend_ctx->get_adreno_bin_kernel("gemm_moe_q4_0_f32_ns_ila", &bin_size);
|
||||
if (kernel_bin && bin_size > 0) {
|
||||
cl_program prog =
|
||||
build_program_from_binary(backend_ctx->context, backend_ctx->device, kernel_bin, CL_moe_compile_opts, bin_size);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_q4_0_f32_ns_bin = clCreateKernel(prog, "kernel_gemm_moe_q4_0_f32_ns_ila", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gemv_moe_q5_0_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3592,6 +3715,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_moe_q4_k_f32_ns_bin
|
||||
{
|
||||
size_t bin_size = 0;
|
||||
backend_ctx->kernel_gemm_moe_q4_k_f32_ns_bin = nullptr;
|
||||
|
||||
if (use_adreno_bin_kernels(backend_ctx)) {
|
||||
const char * kernel_bin = (const char *)backend_ctx->get_adreno_bin_kernel("gemm_moe_q4_k_f32_ns_ila", &bin_size);
|
||||
if (kernel_bin && bin_size > 0) {
|
||||
cl_program prog =
|
||||
build_program_from_binary(backend_ctx->context, backend_ctx->device, kernel_bin, CL_moe_compile_opts, bin_size);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_q4_k_f32_ns_bin = clCreateKernel(prog, "kernel_gemm_moe_q4_k_f32_ns_ila", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gemv_moe_q5_k_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3689,9 +3830,27 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_mxfp4_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_mxfp4_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_moe_mxfp4_f32_ns_bin
|
||||
{
|
||||
size_t bin_size = 0;
|
||||
backend_ctx->kernel_gemm_moe_mxfp4_f32_ns_bin = nullptr;
|
||||
|
||||
if (use_adreno_bin_kernels(backend_ctx)) {
|
||||
const char * kernel_bin = (const char *)backend_ctx->get_adreno_bin_kernel("gemm_moe_mxfp4_f32_ns_ila", &bin_size);
|
||||
if (kernel_bin && bin_size > 0) {
|
||||
cl_program prog =
|
||||
build_program_from_binary(backend_ctx->context, backend_ctx->device, kernel_bin, CL_moe_compile_opts, bin_size);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32_ns_bin = clCreateKernel(prog, "kernel_gemm_moe_mxfp4_f32_ns_ila", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// moe_reorder_b
|
||||
@@ -4770,6 +4929,27 @@ static ggml_backend_opencl_context * ggml_cl_init(ggml_backend_dev_t dev) {
|
||||
backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
|
||||
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_BIN_KERNELS
|
||||
// try loading adreno binary kernels if enabled
|
||||
// if fails to load, builtin kernels will be used
|
||||
{
|
||||
dl_handle * kernel_lib_handle = dl_load_library(KERNEL_LIB_NAME);
|
||||
backend_ctx->adreno_use_bin_kernels = false;
|
||||
|
||||
if (kernel_lib_handle) {
|
||||
backend_ctx->get_adreno_bin_kernel_func = (get_adreno_bin_kernel_func_t)dl_get_sym(kernel_lib_handle, "get_adreno_kernels");
|
||||
if (backend_ctx->get_adreno_bin_kernel_func) {
|
||||
GGML_LOG_INFO("ggml_opencl: loaded bin kernel library %s\n", KERNEL_LIB_NAME);
|
||||
backend_ctx->adreno_use_bin_kernels = true;
|
||||
} else {
|
||||
GGML_LOG_INFO("ggml_opencl: bin kernel library %s is invalid, will use builtin kernels\n", KERNEL_LIB_NAME);
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_INFO("ggml_opencl: failed to load %s, will use builtin kernels\n", KERNEL_LIB_NAME);
|
||||
}
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_BIN_KERNELS
|
||||
|
||||
cl_int err;
|
||||
|
||||
// A local ref of cl_context for convenience
|
||||
@@ -14972,6 +15152,99 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
} else {
|
||||
// use bin kernel if available
|
||||
if (backend_ctx->kernel_gemm_noshuffle_q8_0_f32_bin) {
|
||||
int K_pad = K;
|
||||
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem d_sub_buf = nullptr;
|
||||
|
||||
cl_mem a_img = nullptr;
|
||||
cl_mem s_img = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
cl_mem d_img = nullptr;
|
||||
|
||||
// subbuffer for activations
|
||||
region.origin = offset1;
|
||||
region.size = K_pad * N * sizeof(float);
|
||||
CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// Create subbuffer and image1d_buffer for dst
|
||||
region.origin = (extrad->offset); // + dst->view_offs;
|
||||
region.size = M * N * sizeof(float);
|
||||
CL_CHECK((d_sub_buf = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err));
|
||||
|
||||
// create an image for A
|
||||
img_fmt = { CL_R, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 4; // Divide by 4 for char -> float
|
||||
img_desc.buffer = extra0_q8_0->q;
|
||||
CL_CHECK((a_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// create an image for Scale
|
||||
img_fmt = { CL_R, CL_HALF_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * K / 32; // Block size is 32
|
||||
img_desc.buffer = extra0_q8_0->d;
|
||||
CL_CHECK((s_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// create an image for B from sub_buffer
|
||||
img_fmt = {CL_R, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = K_pad * N;
|
||||
img_desc.buffer = b_sub_buf;
|
||||
CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// img for d
|
||||
img_fmt = {CL_R, CL_FLOAT};
|
||||
memset(&img_desc, 0, sizeof(img_desc));
|
||||
img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
||||
img_desc.image_width = M * N;
|
||||
img_desc.buffer = d_sub_buf;
|
||||
CL_CHECK((d_img = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt, &img_desc, NULL, &err), err));
|
||||
|
||||
// gemm
|
||||
kernel = backend_ctx->kernel_gemm_noshuffle_q8_0_f32_bin;
|
||||
|
||||
bool layoutA_Mfirst = true;
|
||||
bool layoutS_Mfirst = true;
|
||||
bool layoutB_Nfirst = false;
|
||||
bool layoutC_Mfirst = true;
|
||||
|
||||
cl_uint lineStrideMatrixAinBytes = layoutA_Mfirst ? M * 4 : K; // int8
|
||||
cl_uint lineStrideMatrixSinBytes = layoutS_Mfirst ? M * 2 : (K / 32) * 2; // fp16
|
||||
cl_uint lineStrideMatrixBinBytes = layoutB_Nfirst ? N * 4 : K_pad * 4; // fp32
|
||||
cl_uint lineStrideMatrixCinBytes = layoutC_Mfirst ? M * 4 : N * 4; // fp32
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &s_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &extra1->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &extrad->offset));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &lineStrideMatrixAinBytes));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &lineStrideMatrixSinBytes));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &lineStrideMatrixBinBytes));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &lineStrideMatrixCinBytes));
|
||||
|
||||
size_t global_work_size[] = { 64, (size_t)CEIL_DIV(M, 64), (size_t)CEIL_DIV(N, 64)};
|
||||
size_t local_work_size[] = { 64, 2, 2 };
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(b_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(d_sub_buf));
|
||||
CL_CHECK(clReleaseMemObject(a_img));
|
||||
CL_CHECK(clReleaseMemObject(s_img));
|
||||
CL_CHECK(clReleaseMemObject(b_img));
|
||||
CL_CHECK(clReleaseMemObject(d_img));
|
||||
return;
|
||||
}
|
||||
|
||||
cl_mem b_sub_buf = nullptr;
|
||||
cl_mem b_sub_buf_trans = nullptr;
|
||||
cl_mem b_img = nullptr;
|
||||
@@ -17825,6 +18098,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
|
||||
} else { // for gemm
|
||||
kernel = backend_ctx->kernel_gemm_moe_q4_0_f32_ns;
|
||||
if (backend_ctx->kernel_gemm_moe_q4_0_f32_ns_bin) {
|
||||
kernel = backend_ctx->kernel_gemm_moe_q4_0_f32_ns_bin;
|
||||
}
|
||||
|
||||
// Reorder router if called from test-backend-ops or when new router is generated.
|
||||
// Otherwise reuse the reordered result from previous mul_mat_id call.
|
||||
@@ -17870,6 +18146,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
cl_image_desc image_desc_buf_src1;
|
||||
image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
|
||||
if (backend_ctx->kernel_gemm_moe_q4_0_f32_ns_bin) {
|
||||
// bin kernel uses slightly different image format
|
||||
image_format_buf_src1 = {CL_R, CL_FLOAT};
|
||||
image_desc_buf_src1.image_width = static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size);
|
||||
}
|
||||
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
@@ -18042,6 +18323,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
|
||||
} else { // for gemm
|
||||
kernel = backend_ctx->kernel_gemm_moe_q4_1_f32_ns;
|
||||
if (backend_ctx->kernel_gemm_moe_q4_1_f32_ns_bin) {
|
||||
kernel = backend_ctx->kernel_gemm_moe_q4_1_f32_ns_bin;
|
||||
}
|
||||
|
||||
// Reorder router if called from test-backend-ops or when new router is generated.
|
||||
// Otherwise reuse the reordered result from previous mul_mat_id call.
|
||||
@@ -18087,6 +18371,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
cl_image_desc image_desc_buf_src1;
|
||||
image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
|
||||
if (backend_ctx->kernel_gemm_moe_q4_1_f32_ns_bin) {
|
||||
// bin kernel uses slightly different image format
|
||||
image_format_buf_src1 = {CL_R, CL_FLOAT};
|
||||
image_desc_buf_src1.image_width = static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size);
|
||||
}
|
||||
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
@@ -18648,6 +18937,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
|
||||
} else { // for gemm
|
||||
kernel = backend_ctx->kernel_gemm_moe_q4_k_f32_ns;
|
||||
if (backend_ctx->kernel_gemm_moe_q4_k_f32_ns_bin) {
|
||||
kernel = backend_ctx->kernel_gemm_moe_q4_k_f32_ns_bin;
|
||||
}
|
||||
|
||||
// Reorder router if called from test-backend-ops or when new router is generated.
|
||||
// Otherwise reuse the reordered result from previous mul_mat_id call.
|
||||
@@ -18689,6 +18981,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
CL_CHECK(status);
|
||||
cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
|
||||
if (backend_ctx->kernel_gemm_moe_q4_k_f32_ns_bin) {
|
||||
// bin kernel uses slightly different image format
|
||||
image_format_buf_src1 = {CL_R, CL_FLOAT};
|
||||
image_desc_buf_src1.image_width = static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size);
|
||||
}
|
||||
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
@@ -19172,6 +19469,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
|
||||
} else { // for gemm
|
||||
kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32_ns;
|
||||
if (backend_ctx->kernel_gemm_moe_mxfp4_f32_ns_bin) {
|
||||
kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32_ns_bin;
|
||||
}
|
||||
|
||||
// Reorder router if called from test-backend-ops or when new router is generated.
|
||||
// Otherwise reuse the reordered result from previous mul_mat_id call.
|
||||
@@ -19218,6 +19518,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
cl_image_desc image_desc_buf_src1;
|
||||
image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
|
||||
if (backend_ctx->kernel_gemm_moe_mxfp4_f32_ns_bin) {
|
||||
// bin kernel uses slightly different image format
|
||||
image_format_buf_src1 = {CL_R, CL_FLOAT};
|
||||
image_desc_buf_src1.image_width = static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size);
|
||||
}
|
||||
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
# include <winevt.h>
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <filesystem>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(HMODULE handle) {
|
||||
FreeLibrary(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static inline dl_handle * dl_load_library(const fs::path & path) {
|
||||
// suppress error dialogs for missing DLLs
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
HMODULE handle = LoadLibraryW(path.wstring().c_str());
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
static inline void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
void * p = (void *) GetProcAddress(handle, name);
|
||||
|
||||
SetErrorMode(old_mode);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline const char * dl_error() {
|
||||
return "";
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
using dl_handle = void;
|
||||
|
||||
struct dl_handle_deleter {
|
||||
void operator()(void * handle) {
|
||||
dlclose(handle);
|
||||
}
|
||||
};
|
||||
|
||||
static inline dl_handle * dl_load_library(const fs::path & path) {
|
||||
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
return handle;
|
||||
}
|
||||
|
||||
static inline void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return dlsym(handle, name);
|
||||
}
|
||||
|
||||
static inline const char * dl_error() {
|
||||
const char *rslt = dlerror();
|
||||
return rslt != nullptr ? rslt : "";
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -159,6 +159,9 @@ extern "C" {
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
||||
// Get the model file type (quantization) as a string, e.g. "Q8_0" or "Q4_K - Medium"
|
||||
LLAMA_API const char * llama_ftype_name(enum llama_ftype ftype);
|
||||
|
||||
enum llama_rope_scaling_type {
|
||||
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
|
||||
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
|
||||
@@ -606,6 +609,9 @@ extern "C" {
|
||||
// Get a string describing the model type
|
||||
LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||
|
||||
// Get the model file type (quantization), e.g. LLAMA_FTYPE_MOSTLY_Q8_0
|
||||
LLAMA_API enum llama_ftype llama_model_ftype(const struct llama_model * model);
|
||||
|
||||
// Returns the total size of all the tensors in the model in bytes
|
||||
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
||||
|
||||
|
||||
+46
-44
@@ -27,52 +27,54 @@ const char * llama_file_version_name(llama_fver version) {
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
if (ftype & LLAMA_FTYPE_GUESSED) {
|
||||
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
|
||||
}
|
||||
#define LLAMA_FTYPE_PREFIX "(guessed) "
|
||||
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
||||
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
|
||||
case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
||||
case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
|
||||
case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
const char * llama_ftype_name(llama_ftype ftype) {
|
||||
static constexpr size_t guessed_prefix_len = sizeof(LLAMA_FTYPE_PREFIX) - 1;
|
||||
const char * name;
|
||||
switch ((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) {
|
||||
case LLAMA_FTYPE_ALL_F32: name = LLAMA_FTYPE_PREFIX "all F32"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_F16: name = LLAMA_FTYPE_PREFIX "F16"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_BF16: name = LLAMA_FTYPE_PREFIX "BF16"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q1_0: name = LLAMA_FTYPE_PREFIX "Q1_0"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: name = LLAMA_FTYPE_PREFIX "Q4_0"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: name = LLAMA_FTYPE_PREFIX "Q4_1"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: name = LLAMA_FTYPE_PREFIX "Q5_0"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: name = LLAMA_FTYPE_PREFIX "Q5_1"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: name = LLAMA_FTYPE_PREFIX "Q8_0"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: name = LLAMA_FTYPE_PREFIX "MXFP4 MoE"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_NVFP4: name = LLAMA_FTYPE_PREFIX "NVFP4"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: name = LLAMA_FTYPE_PREFIX "Q2_K - Medium"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K_S: name = LLAMA_FTYPE_PREFIX "Q2_K - Small"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: name = LLAMA_FTYPE_PREFIX "Q3_K - Small"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_M: name = LLAMA_FTYPE_PREFIX "Q3_K - Medium"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_L: name = LLAMA_FTYPE_PREFIX "Q3_K - Large"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_S: name = LLAMA_FTYPE_PREFIX "Q4_K - Small"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_K_M: name = LLAMA_FTYPE_PREFIX "Q4_K - Medium"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: name = LLAMA_FTYPE_PREFIX "Q5_K - Small"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: name = LLAMA_FTYPE_PREFIX "Q5_K - Medium"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: name = LLAMA_FTYPE_PREFIX "Q6_K"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_TQ1_0: name = LLAMA_FTYPE_PREFIX "TQ1_0 - 1.69 bpw ternary"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_TQ2_0: name = LLAMA_FTYPE_PREFIX "TQ2_0 - 2.06 bpw ternary"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: name = LLAMA_FTYPE_PREFIX "IQ2_XXS - 2.0625 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: name = LLAMA_FTYPE_PREFIX "IQ2_XS - 2.3125 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_S: name = LLAMA_FTYPE_PREFIX "IQ2_S - 2.5 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_M: name = LLAMA_FTYPE_PREFIX "IQ2_M - 2.7 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XS: name = LLAMA_FTYPE_PREFIX "IQ3_XS - 3.3 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: name = LLAMA_FTYPE_PREFIX "IQ3_XXS - 3.0625 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S: name = LLAMA_FTYPE_PREFIX "IQ1_S - 1.5625 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_M: name = LLAMA_FTYPE_PREFIX "IQ1_M - 1.75 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_NL: name = LLAMA_FTYPE_PREFIX "IQ4_NL - 4.5 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: name = LLAMA_FTYPE_PREFIX "IQ4_XS - 4.25 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: name = LLAMA_FTYPE_PREFIX "IQ3_S - 3.4375 bpw"; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: name = LLAMA_FTYPE_PREFIX "IQ3_S mix - 3.66 bpw"; break;
|
||||
default: name = LLAMA_FTYPE_PREFIX "unknown, may not work"; break;
|
||||
}
|
||||
return (ftype & LLAMA_FTYPE_GUESSED) ? name : name + guessed_prefix_len;
|
||||
}
|
||||
|
||||
#undef LLAMA_FTYPE_PREFIX
|
||||
|
||||
// return a list of splits for a given path
|
||||
// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
|
||||
static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
|
||||
@@ -1693,12 +1695,12 @@ bool llama_model_loader::load_all_data(
|
||||
}
|
||||
|
||||
std::string llama_model_loader::ftype_name() const {
|
||||
return llama_model_ftype_name(ftype);
|
||||
return llama_ftype_name(ftype);
|
||||
}
|
||||
|
||||
void llama_model_loader::print_info() const {
|
||||
LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
|
||||
LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
|
||||
LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_ftype_name(ftype));
|
||||
if (n_bytes < GiB) {
|
||||
LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
|
||||
} else {
|
||||
|
||||
@@ -987,6 +987,8 @@ struct llama_model::impl {
|
||||
|
||||
std::string desc_str;
|
||||
|
||||
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||
|
||||
// model memory mapped files
|
||||
llama_mmaps mappings;
|
||||
|
||||
@@ -1200,6 +1202,8 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
|
||||
|
||||
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
|
||||
|
||||
pimpl->ftype = ml.ftype;
|
||||
|
||||
if (hparams.f_max_alibi_bias > 0.0f) {
|
||||
hparams.use_alibi = true;
|
||||
}
|
||||
@@ -1646,6 +1650,10 @@ std::string llama_model::desc() const {
|
||||
return pimpl->desc_str;
|
||||
}
|
||||
|
||||
llama_ftype llama_model::ftype() const {
|
||||
return pimpl->ftype;
|
||||
}
|
||||
|
||||
size_t llama_model::size() const {
|
||||
return pimpl->n_bytes;
|
||||
}
|
||||
@@ -2616,6 +2624,10 @@ int32_t llama_model_desc(const llama_model * model, char * buf, size_t buf_size)
|
||||
return snprintf(buf, buf_size, "%s", model->desc().c_str());
|
||||
}
|
||||
|
||||
llama_ftype llama_model_ftype(const llama_model * model) {
|
||||
return model->ftype();
|
||||
}
|
||||
|
||||
uint64_t llama_model_size(const llama_model * model) {
|
||||
return model->size();
|
||||
}
|
||||
|
||||
@@ -637,6 +637,8 @@ struct llama_model {
|
||||
|
||||
std::string desc() const;
|
||||
|
||||
llama_ftype ftype() const;
|
||||
|
||||
size_t size() const; // file size
|
||||
size_t n_tensors() const;
|
||||
size_t n_devices() const;
|
||||
|
||||
@@ -448,6 +448,9 @@ int llama_cli(int argc, char ** argv) {
|
||||
console::log("%s\n", LLAMA_ASCII_LOGO);
|
||||
console::log("build : %s\n", inf.build_info.c_str());
|
||||
console::log("model : %s\n", inf.model_name.c_str());
|
||||
if (!inf.model_ftype.empty()) {
|
||||
console::log("ftype : %s\n", inf.model_ftype.c_str());
|
||||
}
|
||||
console::log("modalities : %s\n", modalities.c_str());
|
||||
if (!params.system_prompt.empty()) {
|
||||
console::log("using custom system prompt\n");
|
||||
|
||||
@@ -3989,6 +3989,8 @@ server_context_meta server_context::get_meta() const {
|
||||
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx_tgt, bos_id, true) : "";
|
||||
auto eos_token_str = eos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx_tgt, eos_id, true) : "";
|
||||
|
||||
const char * ftype_name = llama_ftype_name(llama_model_ftype(impl->model_tgt));
|
||||
|
||||
return server_context_meta {
|
||||
/* build_info */ std::string(llama_build_info()),
|
||||
/* model_name */ impl->model_name,
|
||||
@@ -4023,6 +4025,7 @@ server_context_meta server_context::get_meta() const {
|
||||
/* model_n_embd_inp */ llama_model_n_embd(impl->model_tgt),
|
||||
/* model_n_params */ llama_model_n_params(impl->model_tgt),
|
||||
/* model_size */ llama_model_size(impl->model_tgt),
|
||||
/* model_ftype */ ftype_name,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -5118,6 +5121,7 @@ json server_routes::get_model_info() const {
|
||||
{"n_embd", meta->model_n_embd_inp},
|
||||
{"n_params", meta->model_n_params},
|
||||
{"size", meta->model_size},
|
||||
{"ftype", meta->model_ftype},
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
@@ -50,6 +50,7 @@ struct server_context_meta {
|
||||
int32_t model_n_embd_inp;
|
||||
uint64_t model_n_params;
|
||||
uint64_t model_size;
|
||||
std::string model_ftype;
|
||||
};
|
||||
|
||||
enum server_state {
|
||||
|
||||
Reference in New Issue
Block a user