Compare commits

...

4 Commits

Author SHA1 Message Date
Gaurav Garg 0ed235ea2c [CUDA] Added a cudaMemcpy2DAsync fast path to ggml_cuda_cpy (#25057)
* [CUDA] Added a cudaMemcpy2DAsync fast path to ggml_cuda_cpy

Add a CUDA ggml_cpy fast path for same-type, same-shape strided copies that are just 2D pitched block copies.
When tensors are not fully contiguous but each row is contiguous, it now uses cudaMemcpy2DAsync instead of the slow element-wise scalar copy kernel.

This fixes the GDN recurrent snapshot update with -np 4, where rollback slots are separated by cache stride gaps.

* Add new tests that execute the new optimized strided copy path

* Return unsupported for strided copy in OpenVINO, as new tests are failing
2026-06-27 17:46:21 +05:30
Neo Zhang 9bebfcb4bc sycl : fix failed ut cases of norm (#25044) 2026-06-27 12:13:43 +03:00
Ruben Ortlam 0b6529d818 vulkan: fix step operator for 0 input (#25036) 2026-06-27 10:57:31 +02:00
Christian Kastner c299a92c38 binaries : Improve rpc-server and export-graph-ops names. (#25045)
Tests are generally prefixed with -test, so rename export-graph-ops
accordingly.

rpc-server is probably too generic a name for /usr/bin. Because it
should work with any ggml application, it is renamed to ggml-rpc-server.
2026-06-27 10:31:29 +03:00
10 changed files with 205 additions and 81 deletions
+1 -1
View File
@@ -80,7 +80,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru
### Untrusted environments or networks
If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Do not use the RPC backend, [ggml-rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061).
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Encrypt your data if sending it over the network.
+45
View File
@@ -386,6 +386,46 @@ static void ggml_cpy_f32_iq4_nl_cuda(
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
// check if a same-type copy reduces to a 2D strided copy (height rows of width
// contiguous bytes), so it can use cudaMemcpy2DAsync instead of the scalar kernel
static bool ggml_cuda_cpy_as_memcpy_2d(const ggml_tensor * src0, const ggml_tensor * src1,
size_t & width, size_t & height, size_t & spitch, size_t & dpitch) {
// require matching shape: a reshaped copy maps elements by flat order, which the
// prefix walk below does not handle
if (src0->type != src1->type || !ggml_are_same_shape(src0, src1)) {
return false;
}
// grow the contiguous prefix block shared by both tensors
size_t block_nb = ggml_element_size(src0);
int d = 0;
for (; d < GGML_MAX_DIMS; ++d) {
if (src0->nb[d] != block_nb || src1->nb[d] != block_nb) {
break;
}
block_nb *= src0->ne[d];
}
// d == 0: nothing contiguous; d == GGML_MAX_DIMS: fully contiguous (handled by memcpy)
if (d == 0 || d == GGML_MAX_DIMS) {
return false;
}
// dim d carries the rows; everything above it must be a single element
for (int i = d + 1; i < GGML_MAX_DIMS; ++i) {
if (src0->ne[i] != 1) {
return false;
}
}
width = block_nb;
height = src0->ne[d];
spitch = src0->nb[d];
dpitch = src1->nb[d];
return spitch >= width && dpitch >= width;
}
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
@@ -421,6 +461,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
size_t mc_width = 0, mc_height = 0, mc_spitch = 0, mc_dpitch = 0;
if (src0->type == src1->type && contiguous_srcs) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
@@ -431,6 +473,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
{
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
}
} else if (ggml_cuda_cpy_as_memcpy_2d(src0, src1, mc_width, mc_height, mc_spitch, mc_dpitch)) {
CUDA_CHECK(cudaMemcpy2DAsync(src1_ddc, mc_dpitch, src0_ddc, mc_spitch,
mc_width, mc_height, cudaMemcpyDeviceToDevice, main_stream));
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
if (can_be_transposed) {
ggml_cpy_scalar_cuda<float, float, true>
+4
View File
@@ -1053,6 +1053,10 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
(op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) {
return true;
}
// CPY into a strided view of a larger buffer (recurrent-state snapshots) not supported
if (op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) {
return true;
}
break;
}
case GGML_OP_MUL_MAT: {
+102 -48
View File
@@ -2,8 +2,10 @@
#include "ggml-sycl/common.hpp"
#include "ggml-sycl/presets.hpp"
static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
static void norm_f32(const float* x, float* dst, const int ncols,
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
const int nrows = item_ct1.get_group_range(2);
const int nchannels = item_ct1.get_group_range(1);
@@ -16,16 +18,16 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
const int tid = item_ct1.get_local_id(2);
const int nwarps = nthreads / WARP_SIZE;
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row});
const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row});
x += strided_offset;
dst += packed_offset;
x += src_offset;
dst += dst_offset;
sycl::float2 mean_var = sycl::float2(0.f, 0.f);
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[col];
const float xi = x[col * src_stride_col];
mean_var.x() += xi;
mean_var.y() += xi * xi;
}
@@ -54,7 +56,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t
const float inv_std = sycl::rsqrt(var + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[col] = (x[col] - mean) * inv_std;
dst[col * dst_stride_col] = (x[col * src_stride_col] - mean) * inv_std;
}
}
@@ -145,8 +147,10 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
}
}
static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
static void rms_norm_f32(const float* x, float* dst, const int ncols,
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
const int nrows = item_ct1.get_group_range(2);
const int nchannels = item_ct1.get_group_range(1);
@@ -160,17 +164,17 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
const int tid = item_ct1.get_local_id(2);
const int nwarps = nthreads / WARP_SIZE;
const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row});
const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row});
x += strided_offset;
dst += packed_offset;
x += src_offset;
dst += dst_offset;
float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[col];
const float xi = x[col * src_stride_col];
tmp += xi * xi;
}
@@ -198,14 +202,15 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6
const float scale = sycl::rsqrt(mean + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[col] = scale * x[col];
dst[col * dst_stride_col] = scale * x[col * src_stride_col];
}
}
template<int warp_size>
static void l2_norm_f32(const float * x, float * dst, const int ncols,
const int64_t stride_row, const int64_t stride_channel,
const int64_t stride_sample, const float eps,
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel,
const int64_t src_stride_sample, const int64_t dst_stride_col, const int64_t dst_stride_row,
const int64_t dst_stride_channel, const int64_t dst_stride_sample, const float eps,
const sycl::nd_item<3>& item_ct1, float* s_sum, const int block_size) {
const int nrows = item_ct1.get_group_range(2);
const int nchannels = item_ct1.get_group_range(1);
@@ -215,13 +220,13 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols,
const int sample = item_ct1.get_group(0);
const int tid = item_ct1.get_local_id(2);
x += sample*stride_sample + channel*stride_channel + row*stride_row;
dst += ((sample*nchannels + channel)*nrows + row)*ncols;
x += sample*src_stride_sample + channel*src_stride_channel + row*src_stride_row;
dst += sample*dst_stride_sample + channel*dst_stride_channel + row*dst_stride_row;
float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[col];
const float xi = x[col * src_stride_col];
tmp += xi * xi;
}
@@ -229,12 +234,13 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols,
const float scale = sycl::rsqrt(sycl::fmax(tmp, eps * eps));
for (int col = tid; col < ncols; col += block_size) {
dst[col] = scale * x[col];
dst[col * dst_stride_col] = scale * x[col * src_stride_col];
}
}
static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
const float eps, queue_ptr stream, int device) {
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
@@ -245,7 +251,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
norm_f32(x, dst, ncols,
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
eps, item_ct1, nullptr, WARP_SIZE);
});
});
}
@@ -265,7 +274,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
norm_f32(x, dst, ncols,
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
});
});
}
@@ -319,7 +331,9 @@ static void group_norm_f32_sycl(const float* x, float* dst,
}
static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample,
const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample,
const float eps, queue_ptr stream, int device) {
// printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
const sycl::range<3> global_dims(nsamples, nchannels, nrows);
@@ -330,7 +344,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
rms_norm_f32(x, dst, ncols,
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
eps, item_ct1, nullptr, WARP_SIZE);
});
});
}
@@ -350,7 +367,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
sycl::nd_range<3>(global_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
rms_norm_f32(x, dst, ncols,
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
});
});
}
@@ -363,9 +383,14 @@ static void l2_norm_f32_sycl(const float * x,
const int nrows,
const int nchannels,
const int nsamples,
const int64_t stride_row,
const int64_t stride_channel,
const int64_t stride_sample,
const int64_t src_stride_col,
const int64_t src_stride_row,
const int64_t src_stride_channel,
const int64_t src_stride_sample,
const int64_t dst_stride_col,
const int64_t dst_stride_row,
const int64_t dst_stride_channel,
const int64_t dst_stride_sample,
const float eps,
queue_ptr stream,
int device) {
@@ -379,7 +404,10 @@ static void l2_norm_f32_sycl(const float * x,
block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(warp_size)]] {
l2_norm_f32<warp_size>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
l2_norm_f32<warp_size>(x, dst, ncols,
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
eps, item_ct1,
nullptr, warp_size);
});
});
@@ -398,7 +426,9 @@ static void l2_norm_f32_sycl(const float * x,
block_dims),
[=](sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(warp_size)]] {
l2_norm_f32<warp_size>(x, dst, ncols, stride_row, stride_channel, stride_sample,
l2_norm_f32<warp_size>(x, dst, ncols,
src_stride_col, src_stride_row, src_stride_channel, src_stride_sample,
dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample,
eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
});
});
@@ -421,12 +451,20 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
const size_t ts0 = ggml_type_size(src0->type);
GGML_ASSERT(nb00 == ts0);
const int64_t s01 = nb01 / ts0;
const int64_t s02 = nb02 / ts0;
const int64_t s03 = nb03 / ts0;
const size_t tdst = ggml_type_size(dst->type);
GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
const int64_t ss0 = nb00 / ts0;
const int64_t ss1 = nb01 / ts0;
const int64_t ss2 = nb02 / ts0;
const int64_t ss3 = nb03 / ts0;
const int64_t ds0 = nb0 / tdst;
const int64_t ds1 = nb1 / tdst;
const int64_t ds2 = nb2 / tdst;
const int64_t ds3 = nb3 / tdst;
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03,
ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device);
}
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
@@ -465,11 +503,19 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_TENSOR_UNARY_OP_LOCALS
const size_t ts0 = ggml_type_size(src0->type);
GGML_ASSERT(nb00 == ts0);
const int64_t s01 = nb01 / ts0;
const int64_t s02 = nb02 / ts0;
const int64_t s03 = nb03 / ts0;
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
const size_t tdst = ggml_type_size(dst->type);
GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
const int64_t ss0 = nb00 / ts0;
const int64_t ss1 = nb01 / ts0;
const int64_t ss2 = nb02 / ts0;
const int64_t ss3 = nb03 / ts0;
const int64_t ds0 = nb0 / tdst;
const int64_t ds1 = nb1 / tdst;
const int64_t ds2 = nb2 / tdst;
const int64_t ds3 = nb3 / tdst;
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03,
ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device);
}
void ggml_sycl_op_rms_norm_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -644,13 +690,21 @@ void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
GGML_ASSERT(eps >= 0.0f);
const size_t ts0 = ggml_type_size(src0->type);
GGML_ASSERT(nb00 == ts0);
const int64_t s01 = nb01 / ts0;
const int64_t s02 = nb02 / ts0;
const int64_t s03 = nb03 / ts0;
const size_t tdst = ggml_type_size(dst->type);
GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0);
GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0);
const int64_t ss0 = nb00 / ts0;
const int64_t ss1 = nb01 / ts0;
const int64_t ss2 = nb02 / ts0;
const int64_t ss3 = nb03 / ts0;
const int64_t ds0 = nb0 / tdst;
const int64_t ds1 = nb1 / tdst;
const int64_t ds2 = nb2 / tdst;
const int64_t ds3 = nb3 / tdst;
/*support both WARP_SIZE or WARP_32_SIZE in code
choose by hardware for better performance
*/
l2_norm_f32_sycl<WARP_SIZE>(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream, ctx.device);
l2_norm_f32_sycl<WARP_SIZE>(src0_d, dst_d, ne00, ne01, ne02, ne03,
ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, stream, ctx.device);
}
@@ -42,7 +42,7 @@ float op_leaky_relu(float x) {
}
float op_step(float x) {
return x >= 0.0f ? 1.0f : 0.0f;
return x > 0.0f ? 1.0f : 0.0f;
}
float op_tanh(float x) {
+4 -4
View File
@@ -302,9 +302,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
llama_build_and_test(test-alloc.cpp)
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
llama_build(export-graph-ops.cpp)
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
llama_build(test-export-graph-ops.cpp)
target_include_directories(test-export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
if (TARGET gguf-model-data)
target_link_libraries(export-graph-ops PRIVATE gguf-model-data)
target_compile_definitions(export-graph-ops PRIVATE LLAMA_HF_FETCH)
target_link_libraries(test-export-graph-ops PRIVATE gguf-model-data)
target_compile_definitions(test-export-graph-ops PRIVATE LLAMA_HF_FETCH)
endif()
+29 -8
View File
@@ -2890,12 +2890,17 @@ struct test_cpy : public test_case {
const std::array<int64_t, 4> ne_dst;
const std::array<int64_t, 4> permute_src;
const std::array<int64_t, 4> permute_dst;
const std::array<int64_t, 4> dst_alloc; // if set, dst is a view into a larger buffer (strided)
bool _src_use_permute;
bool _dst_use_permute;
bool _src_transpose;
bool _use_dst_shape;
bool _use_dst_alloc;
std::string vars() override {
if (_use_dst_alloc) {
return VARS_TO_STR8(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose, dst_alloc);
}
if (_use_dst_shape) {
return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose);
}
@@ -2943,12 +2948,15 @@ struct test_cpy : public test_case {
std::array<int64_t, 4> ne_dst = {-1, -1, -1, -1},
std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
bool transpose_src = false)
bool transpose_src = false,
std::array<int64_t, 4> dst_alloc = {0, 0, 0, 0})
: type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst),
dst_alloc(dst_alloc),
_src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
_dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
_src_transpose(transpose_src),
_use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){}
_use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0),
_use_dst_alloc(dst_alloc[0] > 0){}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data());
@@ -2966,12 +2974,23 @@ struct test_cpy : public test_case {
}
std::array<int64_t, 4> dst_ne = _use_dst_shape ? ne_dst : std::array<int64_t, 4>{src->ne[0], src->ne[1], src->ne[2], src->ne[3]};
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
ggml_set_name(dst, "dst");
ggml_tensor * dst;
if (_dst_use_permute) {
dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
ggml_set_name(dst, "dst_permuted");
if (_use_dst_alloc) {
// view a sub-block of a larger buffer -> strided dst
ggml_tensor * dst_buf = ggml_new_tensor(ctx, type_dst, 4, dst_alloc.data());
ggml_set_name(dst_buf, "dst_buf");
dst = ggml_view_4d(ctx, dst_buf, dst_ne[0], dst_ne[1], dst_ne[2], dst_ne[3],
dst_buf->nb[1], dst_buf->nb[2], dst_buf->nb[3], 0);
ggml_set_name(dst, "dst_view");
} else {
dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
ggml_set_name(dst, "dst");
if (_dst_use_permute) {
dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]);
ggml_set_name(dst, "dst_permuted");
}
}
ggml_tensor * out = ggml_cpy(ctx, src, dst);
@@ -8181,6 +8200,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst
// CPY - different src/dst shapes (reshaping via CPY)
// Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
@@ -9943,7 +9964,7 @@ static void usage(char ** argv) {
printf(" --output specifies output format (default: console, options: console, sql, csv)\n");
printf(" --list-ops lists all available GGML operations\n");
printf(" --show-coverage shows test coverage\n");
printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n");
printf(" --test-file reads test operators from a test file generated by test-export-graph-ops\n");
printf(" -j <n> runs tests using <n> parallel worker threads (default: 1, test mode only)\n");
}
@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
return 1;
}
#else
LOG_ERR("export-graph-ops compiled without HF fetch support\n");
LOG_ERR("test-export-graph-ops compiled without HF fetch support\n");
return 1;
#endif
}
+1 -1
View File
@@ -1,4 +1,4 @@
set(TARGET rpc-server)
set(TARGET ggml-rpc-server)
add_executable(${TARGET} rpc-server.cpp)
target_link_libraries(${TARGET} PRIVATE ggml)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
+17 -17
View File
@@ -4,8 +4,8 @@
> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
The `rpc-server` allows exposing `ggml` devices on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
The `ggml-rpc-server` allows exposing `ggml` devices on a remote host.
The RPC backend communicates with one or several instances of `ggml-rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
```mermaid
@@ -14,15 +14,15 @@ flowchart TD
rpcb<-->|TCP|srvb
rpcb<-.->|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]<-.->dev4["CUDA0"]
srvn[rpc-server]<-.->dev5["CPU"]
srvn[ggml-rpc-server]<-.->dev4["CUDA0"]
srvn[ggml-rpc-server]<-.->dev5["CPU"]
end
subgraph hostb[Host B]
srvb[rpc-server]<-->dev3["Metal"]
srvb[ggml-rpc-server]<-->dev3["Metal"]
end
subgraph hosta[Host A]
srva[rpc-server]<-->dev["CUDA0"]
srva[rpc-server]<-->dev2["CUDA1"]
srva[ggml-rpc-server]<-->dev["CUDA0"]
srva[ggml-rpc-server]<-->dev2["CUDA1"]
end
subgraph host[Main Host]
local["Local devices"]<-->ggml[llama-cli]
@@ -33,7 +33,7 @@ flowchart TD
class local,dev,dev2,dev3,dev4,dev5 devcls
```
By default, `rpc-server` exposes all available accelerator devices on the host.
By default, `ggml-rpc-server` exposes all available accelerator devices on the host.
If there are no accelerators, it exposes a single `CPU` device.
## Usage
@@ -41,7 +41,7 @@ If there are no accelerators, it exposes a single `CPU` device.
### Remote hosts
On each remote host, build the backends for each accelerator by adding `-DGGML_RPC=ON` to the build options.
For example, to build the `rpc-server` with support for CUDA accelerators:
For example, to build the `ggml-rpc-server` with support for CUDA accelerators:
```bash
mkdir build-rpc-cuda
@@ -50,10 +50,10 @@ cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build . --config Release
```
When started, the `rpc-server` will detect and expose all available `CUDA` devices:
When started, the `ggml-rpc-server` will detect and expose all available `CUDA` devices:
```bash
$ bin/rpc-server
$ bin/ggml-rpc-server
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
@@ -67,14 +67,14 @@ Devices:
You can control the set of exposed CUDA devices with the `CUDA_VISIBLE_DEVICES` environment variable or the `--device` command line option. The following two commands have the same effect:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
$ bin/rpc-server --device CUDA0 -p 50052
$ CUDA_VISIBLE_DEVICES=0 bin/ggml-rpc-server -p 50052
$ bin/ggml-rpc-server --device CUDA0 -p 50052
```
### Main host
On the main host build `llama.cpp` with the backends for the local devices and add `-DGGML_RPC=ON` to the build options.
Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `rpc-server`:
Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `ggml-rpc-server`:
```bash
$ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF -ngl 99 --rpc 192.168.88.10:50052,192.168.88.11:50052
@@ -90,7 +90,7 @@ This can speed up model loading significantly, especially when using large model
To enable the cache, use the `-c` option:
```bash
$ bin/rpc-server -c
$ bin/ggml-rpc-server -c
```
By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.
@@ -103,8 +103,8 @@ RDMA is enabled by default when `libibverbs` is found at build time.
### Troubleshooting
Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `rpc-server`:
Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `ggml-rpc-server`:
```bash
$ GGML_RPC_DEBUG=1 bin/rpc-server
$ GGML_RPC_DEBUG=1 bin/ggml-rpc-server
```