mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-04-25 20:39:42 +02:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a118d80233 | ||
|
|
61550f8231 | ||
|
|
aa79524c51 | ||
|
|
b77d11179d | ||
|
|
c7aa1364fd | ||
|
|
1a67fcc306 | ||
|
|
204f2cf168 | ||
|
|
138b288b59 | ||
|
|
bbd0f91779 |
@@ -1944,6 +1944,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
|
||||
}
|
||||
}
|
||||
auto msg = builder.result();
|
||||
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
|
||||
if (!is_partial) {
|
||||
LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
|
||||
176
docs/ops.md
176
docs/ops.md
@@ -12,91 +12,91 @@ Legend:
|
||||
- 🟡 Partially supported by this backend
|
||||
- ❌ Not supported by this backend
|
||||
|
||||
| Operation | BLAS | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
|
||||
|-----------|------|------|------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| CONT | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| CONV_2D | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DIV | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DUP | ❌ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| ELU | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| EXP | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| NEG | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROLL | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| SCALE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SGN | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| SIN | ❌ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ |
|
||||
| SQR | ❌ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| SUM | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
|
||||
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
|
||||
|-----------|------|------|------|------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
|
||||
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
|
||||
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
|
||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
|
||||
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ |
|
||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
|
||||
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
||||
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
|
||||
|
||||
8133
docs/ops/CANN.csv
Normal file
8133
docs/ops/CANN.csv
Normal file
File diff suppressed because it is too large
Load Diff
@@ -174,6 +174,7 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
|
||||
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
|
||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
||||
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
|
||||
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
||||
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
|
||||
@@ -68,6 +68,8 @@
|
||||
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
||||
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
||||
#include <aclnnop/aclnn_zero.h>
|
||||
#include <aclnnop/aclnn_index_copy.h>
|
||||
#include <aclnnop/aclnn_index_select.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <cmath>
|
||||
@@ -1614,50 +1616,97 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs embedding operation on a 4D tensor using the CANN backend.
|
||||
* @brief Performs index select operation on a 4D tensor using the CANN backend.
|
||||
*
|
||||
* This function extracts slices from the source tensor (`src_buffer`),
|
||||
* index tensor (`index`), and destination tensor (`dst`), and performs an
|
||||
* embedding operation on them. The embedding operation is applied by iterating
|
||||
* over the last two dimensions of the source tensor, creating the necessary
|
||||
* tensors for the source, index, and output, and executing the embedding operation.
|
||||
* This function applies the `IndexSelect` operation along a specific dimension
|
||||
* of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
|
||||
* It iterates over the last two dimensions of the source tensor, creates the corresponding
|
||||
* CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
|
||||
* operation for each slice.
|
||||
*
|
||||
* @param ctx The context for CANN backend operations.
|
||||
* @param src_buffer The source buffer holding the data for the source tensor.
|
||||
* @param src_buffer The source buffer containing the 4D input tensor data.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb The strides (byte offsets) of the source tensor.
|
||||
* @param index The index tensor used in the embedding operation.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
* @param dst_buffer The destination buffer where the output tensor data will be written.
|
||||
* @param dst_ne The dimensions of the destination tensor.
|
||||
* @param dst_nb The strides (byte offsets) of the destination tensor.
|
||||
* @param index The index tensor specifying the indices to select from the source tensor.
|
||||
* @param type The data type of the source and destination tensors.
|
||||
*/
|
||||
static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
|
||||
int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
|
||||
ggml_tensor* dst) {
|
||||
static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
|
||||
void* src_buffer,int64_t* src_ne, size_t* src_nb,
|
||||
void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
|
||||
ggml_tensor* index, ggml_type type) {
|
||||
for (int64_t i = 0; i < src_ne[3]; i++) {
|
||||
for (int64_t j = 0; j < src_ne[2]; j++) {
|
||||
// src
|
||||
int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
|
||||
size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
|
||||
aclTensor* acl_src_tensor = ggml_cann_create_tensor(
|
||||
(char*)src_buffer + i * src_nb[3] + j * src_nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_src_ne, acl_src_nb, 2);
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type),
|
||||
src_ne, src_nb, 2);
|
||||
|
||||
// index
|
||||
int64_t acl_index_ne[1] = {index->ne[0]};
|
||||
size_t acl_index_nb[1] = {index->nb[0]};
|
||||
aclTensor* acl_index = ggml_cann_create_tensor(
|
||||
(char*)index->data + i * index->nb[2] + j * index->nb[1],
|
||||
(char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
||||
ggml_cann_type_mapping(index->type), ggml_element_size(index),
|
||||
acl_index_ne, acl_index_nb, 1);
|
||||
index->ne, index->nb, 1);
|
||||
|
||||
// out
|
||||
int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
|
||||
size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
|
||||
aclTensor* acl_out = ggml_cann_create_tensor(
|
||||
(char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_out_ne, acl_out_nb, 2);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Embedding, acl_src_tensor, acl_index, acl_out);
|
||||
(char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type),
|
||||
dst_ne, dst_nb, 2);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out);
|
||||
ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
|
||||
*
|
||||
* This function applies the `IndexCopy` operation along a specific dimension of the
|
||||
* destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
|
||||
* to positions specified by the index tensor (`index`).
|
||||
* It iterates over the last two dimensions of the tensors, creates the corresponding
|
||||
* CANN tensors for source, index, and destination slices, and performs the index copy
|
||||
* operation for each slice.
|
||||
*
|
||||
* @param ctx The context for CANN backend operations.
|
||||
* @param src_buffer The source buffer containing the 4D input tensor data to be copied.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb The strides (byte offsets) of the source tensor.
|
||||
* @param dst_buffer The destination buffer where values will be copied to.
|
||||
* @param dst_ne The dimensions of the destination tensor.
|
||||
* @param dst_nb The strides (byte offsets) of the destination tensor.
|
||||
* @param index The index tensor specifying target positions in the destination tensor.
|
||||
* @param type The data type of the source and destination tensors.
|
||||
*/
|
||||
static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx,
|
||||
void* src_buffer,int64_t* src_ne, size_t* src_nb,
|
||||
void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
|
||||
ggml_tensor* index, ggml_type type) {
|
||||
for (int64_t i = 0; i < src_ne[3]; i++) {
|
||||
for (int64_t j = 0; j < src_ne[2]; j++) {
|
||||
// src
|
||||
aclTensor* acl_src_tensor = ggml_cann_create_tensor(
|
||||
(char*)src_buffer + i * src_nb[3] + j * src_nb[2],
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type),
|
||||
src_ne, src_nb, 2);
|
||||
|
||||
// index
|
||||
aclTensor* acl_index = ggml_cann_create_tensor(
|
||||
(char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
||||
ggml_cann_type_mapping(index->type), ggml_element_size(index),
|
||||
index->ne, index->nb, 1);
|
||||
|
||||
// out
|
||||
aclTensor* acl_out = ggml_cann_create_tensor(
|
||||
(char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type),
|
||||
dst_ne, dst_nb, 2);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor);
|
||||
ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
|
||||
}
|
||||
}
|
||||
@@ -1669,8 +1718,9 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
|
||||
dst);
|
||||
aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
@@ -1687,8 +1737,9 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
|
||||
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
|
||||
src_trans_nb, src1, dst);
|
||||
aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
|
||||
break;
|
||||
}
|
||||
@@ -1748,8 +1799,10 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
|
||||
aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
|
||||
dequant_ne, dequant_nb, src1, dst);
|
||||
aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
|
||||
dequant_ne, dequant_nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
|
||||
ggml_cann_release_resources(ctx, dequant_tensor);
|
||||
break;
|
||||
@@ -1760,6 +1813,43 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0]; // src
|
||||
ggml_tensor* src1 = dst->src[1]; // index
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = sizeof(uint16_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type),
|
||||
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
||||
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
|
||||
dst->data, dst->ne, dst->nb,
|
||||
src1, dst->type);
|
||||
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Repeats elements of a tensor along a specified dimension.
|
||||
*
|
||||
|
||||
@@ -424,15 +424,25 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
*
|
||||
* @details This function retrieves rows from a source tensor src0 according to
|
||||
* the indices provided in another tensor src1 and stores the result in
|
||||
* a destination tensor (\p dst). It supports different data types
|
||||
* including F32, F16, Q4_0, and Q8_0.
|
||||
* a destination tensor (\p dst).
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the extracted rows will be stored.
|
||||
* dst->op is `GGML_OP_GET_ROWS`.
|
||||
*/
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Writes specific rows into a tensor at positions specified by indices.
|
||||
*
|
||||
* @details This function copies rows from a source tensor into a destination
|
||||
* tensor (\p dst) at the positions indicated by the indices in another
|
||||
* tensor.
|
||||
*
|
||||
* @param ctx The backend CANN context for executing operations.
|
||||
* @param dst The destination tensor where the specified rows will be updated.
|
||||
*/
|
||||
void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Executes matrix multiplication for the given tensor.
|
||||
*
|
||||
|
||||
@@ -1659,6 +1659,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
case GGML_OP_GET_ROWS:
|
||||
ggml_cann_get_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
ggml_cann_set_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DUP:
|
||||
ggml_cann_dup(ctx, dst);
|
||||
break;
|
||||
@@ -2191,13 +2194,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS: {
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
|
||||
@@ -176,7 +176,7 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
||||
#endif
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
|
||||
do { \
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
|
||||
@@ -191,7 +191,7 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
do { \
|
||||
GGML_UNUSED(nbytes); \
|
||||
} while (0)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#endif // !(defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||
@@ -211,9 +211,9 @@ typedef float2 dfloat2;
|
||||
#define GGML_USE_VMM
|
||||
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
|
||||
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
#define FP16_AVAILABLE
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
|
||||
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||
#define FAST_FP16_AVAILABLE
|
||||
@@ -227,17 +227,17 @@ typedef float2 dfloat2;
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
#define AMD_MFMA_AVAILABLE
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
|
||||
#endif // defined(GGML_USE_HIP) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#define NEW_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#define CP_ASYNC_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
|
||||
#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
|
||||
#define FLASH_ATTN_AVAILABLE
|
||||
@@ -259,7 +259,7 @@ static bool fast_fp16_hardware_available(const int cc) {
|
||||
|
||||
// Any FP16 tensor core instructions are available for ggml code.
|
||||
static bool fp16_mma_available(const int cc) {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
return false;
|
||||
#else
|
||||
if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
|
||||
@@ -275,7 +275,7 @@ static bool fp16_mma_available(const int cc) {
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
|
||||
}
|
||||
|
||||
// To be used for feature selection of external libraries, e.g. cuBLAS.
|
||||
@@ -295,7 +295,11 @@ static bool fp32_mma_hardware_available(const int cc) {
|
||||
|
||||
// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
|
||||
static bool amd_mfma_available(const int cc) {
|
||||
return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
|
||||
#if !defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
return GGML_CUDA_CC_IS_CDNA3(cc);
|
||||
#else
|
||||
return false;
|
||||
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
|
||||
}
|
||||
|
||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
||||
@@ -308,25 +312,25 @@ static bool cp_async_available(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
return 64;
|
||||
#else
|
||||
return 32;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
|
||||
}
|
||||
|
||||
[[noreturn]]
|
||||
static __device__ void no_device_code(
|
||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
||||
file_name, line, function_name, arch);
|
||||
GGML_UNUSED(arch_list);
|
||||
#else
|
||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
||||
file_name, line, function_name, arch, arch_list);
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
__trap();
|
||||
|
||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
||||
@@ -363,7 +367,7 @@ struct ggml_cuda_unroll<1> {
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
return __reduce_add_sync(0xffffffff, x);
|
||||
#else
|
||||
#pragma unroll
|
||||
@@ -371,7 +375,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
x += __shfl_xor_sync(0xffffffff, x, offset, width);
|
||||
}
|
||||
return x;
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
}
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
@@ -440,11 +444,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
||||
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
||||
#ifdef FP16_AVAILABLE
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
|
||||
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
||||
#else
|
||||
return __hmax(a, b);
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
@@ -472,7 +476,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#pragma unroll
|
||||
for (int offset = width/2; offset > 0; offset >>= 1) {
|
||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
|
||||
@@ -481,7 +485,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < CUDART_HMASK
|
||||
@@ -493,7 +497,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
||||
#endif // CUDART_VERSION < CUDART_HMASK
|
||||
|
||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3) || defined(RDNA4)
|
||||
@@ -519,7 +523,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
||||
#endif
|
||||
return c;
|
||||
|
||||
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#else // defined(GGML_USE_HIP)
|
||||
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
|
||||
return __dp4a(a, b, c);
|
||||
@@ -529,7 +533,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
||||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
|
||||
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
|
||||
|
||||
@@ -592,9 +592,9 @@ static __global__ void flash_attn_stream_k_fixup(
|
||||
}
|
||||
|
||||
template<int D> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_combine_results(
|
||||
const float * __restrict__ VKQ_parts,
|
||||
const float2 * __restrict__ VKQ_meta,
|
||||
|
||||
@@ -1391,24 +1391,24 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
|
||||
constexpr bool use_logit_softcap = false;
|
||||
fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
} else {
|
||||
constexpr bool use_logit_softcap = true;
|
||||
fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla>;
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
launch_fattn<DV, ncols1, ncols2>
|
||||
|
||||
@@ -5,9 +5,9 @@
|
||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
|
||||
@@ -5,9 +5,9 @@
|
||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||
|
||||
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 2)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
static __global__ void flash_attn_tile_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
// Currenlty llvm with the amdgcn target dose not support unrolling loops
|
||||
// that contain a break that can not be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#ifndef GGML_USE_HIP
|
||||
__launch_bounds__(D, 1)
|
||||
@@ -341,6 +347,9 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
NO_DEVICE_CODE;
|
||||
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -1,6 +1,12 @@
|
||||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
// Currenlty llvm with the amdgcn target dose not support unrolling loops
|
||||
// that contain a break that can not be resolved at compile time.
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wpass-failed"
|
||||
#endif // __clang__
|
||||
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#ifndef GGML_USE_HIP
|
||||
__launch_bounds__(D, 1)
|
||||
@@ -336,6 +342,9 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif // __clang__
|
||||
|
||||
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
|
||||
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include "fattn-wmma-f16.cuh"
|
||||
|
||||
#ifdef FP16_MMA_AVAILABLE
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
#include <mma.h>
|
||||
#ifdef GGML_USE_MUSA
|
||||
namespace wmma = mtmusa::wmma;
|
||||
@@ -18,7 +18,7 @@ namespace wmma = nvcuda::wmma;
|
||||
#undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
|
||||
#include <rocwmma/rocwmma.hpp>
|
||||
namespace wmma = rocwmma;
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
#endif // FP16_MMA_AVAILABLE
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
@@ -546,7 +546,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
return;
|
||||
}
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !defined(GGML_USE_HIP)
|
||||
if (Q->ne[1] <= 8 && Q->ne[0] % warp_size == 0) {
|
||||
constexpr int cols_per_block = 8;
|
||||
switch (Q->ne[0]) {
|
||||
@@ -568,7 +568,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
|
||||
if (Q->ne[1] <= 32) {
|
||||
constexpr int cols_per_block = 16;
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include "ggml-cuda/rope.cuh"
|
||||
#include "ggml-cuda/roll.cuh"
|
||||
#include "ggml-cuda/scale.cuh"
|
||||
#include "ggml-cuda/softcap.cuh"
|
||||
#include "ggml-cuda/softmax.cuh"
|
||||
#include "ggml-cuda/ssm-conv.cuh"
|
||||
#include "ggml-cuda/ssm-scan.cuh"
|
||||
@@ -127,7 +128,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||
return err;
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
static int ggml_cuda_parse_id(char devName[]) {
|
||||
// A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
|
||||
// these values are not stable so this is susceptible to breakage
|
||||
@@ -174,10 +175,10 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
archNum += archMinor;
|
||||
return archNum;
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
||||
{
|
||||
@@ -250,7 +251,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].nsm = prop.multiProcessorCount;
|
||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||
info.devices[id].warp_size = prop.warpSize;
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||
|
||||
info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
|
||||
@@ -280,7 +281,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
@@ -2770,7 +2771,12 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
|
||||
static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops, std::initializer_list<enum ggml_unary_op> unary_ops) {
|
||||
#ifndef NDEBUG
|
||||
const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
|
||||
GGML_ASSERT(unary_ops.size() == num_unary);
|
||||
#endif
|
||||
|
||||
if (!ggml_can_fuse(cgraph, node_idx, ops)) {
|
||||
return false;
|
||||
}
|
||||
@@ -2798,9 +2804,32 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
|
||||
if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return true;
|
||||
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
|
||||
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
|
||||
const ggml_tensor *scale = cgraph->nodes[node_idx];
|
||||
const ggml_tensor *tanh = cgraph->nodes[node_idx+1];
|
||||
const ggml_tensor *scale2 = cgraph->nodes[node_idx+2];
|
||||
|
||||
GGML_ASSERT(scale->src[0]->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(scale->type == GGML_TYPE_F32);
|
||||
|
||||
if (ggml_get_unary_op(tanh) != GGML_UNARY_OP_TANH) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for bias
|
||||
if (ggml_get_op_params_f32(scale, 1) != 0.0f || ggml_get_op_params_f32(scale2, 1) != 0.0f) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
|
||||
@@ -2821,10 +2850,18 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
|
||||
}
|
||||
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion && ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
|
||||
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
if (!disable_fusion) {
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
|
||||
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
|
||||
i += 2;
|
||||
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
||||
|
||||
@@ -68,7 +68,7 @@ namespace ggml_cuda_mma {
|
||||
static constexpr int I = I_;
|
||||
static constexpr int J = J_;
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
static constexpr int ne = I * J / 64;
|
||||
T x[ne] = {0};
|
||||
|
||||
@@ -132,7 +132,7 @@ namespace ggml_cuda_mma {
|
||||
static_assert(I == -1 && J == -1, "template specialization not implemented");
|
||||
}
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
};
|
||||
|
||||
template <int I_, int J_>
|
||||
|
||||
@@ -104,9 +104,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
return 128;
|
||||
#else // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
return 64;
|
||||
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#else // defined(GGML_USE_HIP)
|
||||
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
@@ -118,7 +118,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
return 64;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(NEW_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
@@ -128,7 +128,7 @@ static int get_mmq_y_host(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(RDNA1)
|
||||
return 64;
|
||||
#else
|
||||
@@ -140,7 +140,7 @@ static constexpr __device__ int get_mmq_y_device() {
|
||||
#else
|
||||
return 64;
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
// Decouple shared memory tile sizes from WARP_SIZE to allow for different warp sizes.
|
||||
@@ -250,7 +250,7 @@ static constexpr __device__ int mmq_get_granularity_device(const int /*mmq_x*/)
|
||||
}
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
static int mmq_get_nwarps_host(const int cc) {
|
||||
return amd_mfma_available(cc) ? 8 : 4;
|
||||
}
|
||||
@@ -258,10 +258,10 @@ static int mmq_get_nwarps_host(const int cc) {
|
||||
static int mmq_get_nwarps_host(const int /*cc*/) {
|
||||
return 8;
|
||||
}
|
||||
#endif // (GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // (GGML_USE_HIP)
|
||||
|
||||
static constexpr __device__ int mmq_get_nwarps_device() {
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(AMD_MFMA_AVAILABLE)
|
||||
return 8;
|
||||
#else
|
||||
@@ -269,7 +269,7 @@ static constexpr __device__ int mmq_get_nwarps_device() {
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
#else
|
||||
return 8;
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------
|
||||
@@ -3047,7 +3047,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
|
||||
|
||||
template <ggml_type type, int mmq_x, bool need_check>
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#if defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
|
||||
#endif // defined(RDNA4) || defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
||||
@@ -3057,7 +3057,7 @@ template <ggml_type type, int mmq_x, bool need_check>
|
||||
#else
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device(), 2)
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
|
||||
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
@@ -3097,7 +3097,7 @@ static __global__ void mul_mat_q(
|
||||
__syncthreads();
|
||||
|
||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
#if (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
{
|
||||
const int wt = blockIdx.z / nchannels_y;
|
||||
const int zt = blockIdx.z - wt*nchannels_y;
|
||||
@@ -3151,7 +3151,7 @@ static __global__ void mul_mat_q(
|
||||
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
|
||||
return;
|
||||
}
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
#endif // (defined(GGML_USE_HIP) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
|
||||
const int64_t blocks_per_ne00 = ncols_x / qk;
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||
|
||||
34
ggml/src/ggml-cuda/softcap.cu
Normal file
34
ggml/src/ggml-cuda/softcap.cu
Normal file
@@ -0,0 +1,34 @@
|
||||
#include "softcap.cuh"
|
||||
|
||||
static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = tanhf(scale * x[i]) * softcap;
|
||||
}
|
||||
|
||||
static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
|
||||
softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
|
||||
}
|
||||
|
||||
// fused GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
|
||||
void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src) {
|
||||
const ggml_tensor * src0 = src->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
float scale;
|
||||
float softcap;
|
||||
memcpy(&scale, (float *) src->op_params + 0, sizeof(float));
|
||||
memcpy(&softcap, (float *) dst->op_params + 0, sizeof(float));
|
||||
|
||||
softcap_f32_cuda(src0_d, dst_d, scale, softcap, ggml_nelements(src0), stream);
|
||||
}
|
||||
5
ggml/src/ggml-cuda/softcap.cuh
Normal file
5
ggml/src/ggml-cuda/softcap.cuh
Normal file
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_SOFTCAP_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * src);
|
||||
14
ggml/src/ggml-cuda/vendors/hip.h
vendored
14
ggml/src/ggml-cuda/vendors/hip.h
vendored
@@ -5,10 +5,8 @@
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// for rocblas_initialize()
|
||||
#include "rocblas/rocblas.h"
|
||||
#endif // __HIP_PLATFORM_AMD__
|
||||
|
||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||
@@ -139,7 +137,7 @@
|
||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
|
||||
#if HIP_VERSION >= 70000000
|
||||
#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
|
||||
@@ -151,7 +149,11 @@
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||
#define cublasComputeType_t hipblasDatatype_t
|
||||
#define cudaDataType_t hipblasDatatype_t
|
||||
#endif
|
||||
#endif // HIP_VERSION >= 7000000
|
||||
|
||||
#if !defined(__HIP_PLATFORM_AMD__)
|
||||
#error "The HIP backend supports only AMD targets"
|
||||
#endif // !defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
@@ -249,7 +251,7 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
|
||||
return c;
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
#if HIP_VERSION < 50600000
|
||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||
typedef union half2_b32 {
|
||||
@@ -261,4 +263,4 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
|
||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||
return tmp.val;
|
||||
}
|
||||
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
|
||||
#endif // HIP_VERSION < 50600000
|
||||
|
||||
@@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN)
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_HIP_MMQ_MFMA)
|
||||
add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
|
||||
endif()
|
||||
|
||||
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
|
||||
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
|
||||
endif()
|
||||
|
||||
@@ -32,11 +32,12 @@ def get_prompts_text(dataset_name: str, n_prompts: int) -> Optional[list[str]]:
|
||||
return ret
|
||||
|
||||
|
||||
def get_prompt_lengths_rng(n_prompts: int, prompt_length_min: int, prompt_length_max: int) -> list[int]:
|
||||
def get_prompt_lengths_rng(n_prompts: int, prompt_length_min: int, prompt_length_max: int, seed_offset: int) -> list[int]:
|
||||
assert n_prompts >= 0
|
||||
ret: list[int] = []
|
||||
for i in range(n_prompts):
|
||||
random.seed(13 * i + 0)
|
||||
if seed_offset >= 0:
|
||||
random.seed(3 * (seed_offset + 1000 * i) + 0)
|
||||
ret.append(random.randint(prompt_length_min, prompt_length_max))
|
||||
return ret
|
||||
|
||||
@@ -46,12 +47,20 @@ def get_prompts_rng(prompt_lengths: list[int]) -> list[list[int]]:
|
||||
|
||||
|
||||
def get_server(path_server: str, path_log: Optional[str]) -> dict:
|
||||
logger.info("Starting the llama.cpp server...")
|
||||
hostname: str = os.environ.get("LLAMA_ARG_HOST", "127.0.0.1")
|
||||
port: str = os.environ.get("LLAMA_ARG_PORT", "8080")
|
||||
if os.environ.get("LLAMA_ARG_HOST") is None:
|
||||
logger.info("LLAMA_ARG_HOST not explicitly set, using 127.0.0.1")
|
||||
os.environ["LLAMA_ARG_HOST"] = "127.0.0.1"
|
||||
if os.environ.get("LLAMA_ARG_PORT") is None:
|
||||
logger.info("LLAMA_ARG_PORT not explicitly set, using 8080")
|
||||
os.environ["LLAMA_ARG_PORT"] = "8080"
|
||||
hostname: Optional[str] = os.environ.get("LLAMA_ARG_HOST")
|
||||
port: Optional[str] = os.environ.get("LLAMA_ARG_PORT")
|
||||
assert hostname is not None
|
||||
assert port is not None
|
||||
address: str = f"http://{hostname}:{port}"
|
||||
logger.info(f"Starting the llama.cpp server under {address}...")
|
||||
|
||||
fout = open(path_log, "w") if path_log is not None else subprocess.DEVNULL
|
||||
fout = open(path_log.format(port=port), "w") if path_log is not None else subprocess.DEVNULL
|
||||
process = subprocess.Popen([path_server], stdout=fout, stderr=subprocess.STDOUT)
|
||||
|
||||
n_failures: int = 0
|
||||
@@ -60,7 +69,7 @@ def get_server(path_server: str, path_log: Optional[str]) -> dict:
|
||||
sleep(1.0)
|
||||
exit_code = process.poll()
|
||||
if exit_code is not None:
|
||||
raise RuntimeError(f"llama.cpp server exited unexpectedly with exit code {exit_code}, see {path_log}")
|
||||
raise RuntimeError(f"llama.cpp server exited unexpectedly with exit code {exit_code}{path_log and f', see {path_log.format(port=port)}' or ''}")
|
||||
response = requests.get(f"{address}/health")
|
||||
if response.status_code == 200:
|
||||
break
|
||||
@@ -128,7 +137,7 @@ def send_prompt(data: dict) -> tuple[float, list[float]]:
|
||||
return (t_submit, token_arrival_times)
|
||||
|
||||
|
||||
def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_prompts: int, n_predict: int, n_predict_min: int):
|
||||
def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_prompts: int, n_predict: int, n_predict_min: int, seed_offset: int):
|
||||
if os.environ.get("LLAMA_ARG_N_PARALLEL") is None:
|
||||
logger.info("LLAMA_ARG_N_PARALLEL not explicitly set, using 32")
|
||||
os.environ["LLAMA_ARG_N_PARALLEL"] = "32"
|
||||
@@ -139,7 +148,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
|
||||
logger.info("LLAMA_ARG_FLASH_ATTN not explicitly set, using 'true'")
|
||||
os.environ["LLAMA_ARG_FLASH_ATTN"] = "true"
|
||||
|
||||
parallel: int = int(os.environ.get("LLAMA_ARG_N_PARALLEL", 1))
|
||||
parallel: int = int(os.environ.get("LLAMA_ARG_N_PARALLEL")) # type: ignore
|
||||
prompts: Union[None, list[str], list[list[int]]] = get_prompts_text(prompt_source, n_prompts)
|
||||
synthetic_prompts: bool = prompts is None
|
||||
prompt_n = []
|
||||
@@ -151,7 +160,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
|
||||
prompt_length_min: int = int(prompt_source_split[1])
|
||||
prompt_length_max: int = int(prompt_source_split[2])
|
||||
logger.info("Generating random prompts...")
|
||||
prompt_n = get_prompt_lengths_rng(n_prompts, prompt_length_min, prompt_length_max)
|
||||
prompt_n = get_prompt_lengths_rng(n_prompts, prompt_length_min, prompt_length_max, seed_offset)
|
||||
prompts = get_prompts_rng(prompt_n)
|
||||
else:
|
||||
n_predict_min = n_predict
|
||||
@@ -176,10 +185,11 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
|
||||
data: list[dict] = []
|
||||
|
||||
for i, p in enumerate(prompts):
|
||||
random.seed(13 * i + 1)
|
||||
if seed_offset >= 0:
|
||||
random.seed(3 * (seed_offset + 1000 * i) + 1)
|
||||
data.append({
|
||||
"session": session, "server_address": server_address, "prompt": p, "synthetic_prompt": synthetic_prompts,
|
||||
"n_predict": random.randint(n_predict_min, n_predict), "seed": 13 * i + 2})
|
||||
"n_predict": random.randint(n_predict_min, n_predict), "seed": (3 * (seed_offset + 1000 * i) + 2) if seed_offset >= 0 else -1})
|
||||
|
||||
if not synthetic_prompts:
|
||||
logger.info("Getting the prompt lengths...")
|
||||
@@ -251,7 +261,7 @@ if __name__ == "__main__":
|
||||
"Results are printed to console and visualized as plots (saved to current working directory). "
|
||||
"To pass arguments such as the model path to the server, set the corresponding environment variables (see llama-server --help).")
|
||||
parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary")
|
||||
parser.add_argument("--path_log", type=str, default="server-bench.log", help="Path to the model to use for the benchmark")
|
||||
parser.add_argument("--path_log", type=str, default="server-bench-{port}.log", help="Path to the model to use for the benchmark")
|
||||
parser.add_argument(
|
||||
"--prompt_source", type=str, default="rng-1024-2048",
|
||||
help="How to get the prompts for the benchmark, either 'mmlu' for MMLU questions or "
|
||||
@@ -261,5 +271,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--n_predict_min", type=int, default=1024,
|
||||
help="Min. number of tokens to predict per prompt (supported for synthetic prompts only)")
|
||||
parser.add_argument("--seed_offset", type=int, default=0, help="Offset for determining the seeds for pseudorandom prompt/generation lengths. "
|
||||
"Corelations between seeds can occur when set >= 1000. Negative values mean no seed.")
|
||||
args = parser.parse_args()
|
||||
benchmark(**vars(args))
|
||||
|
||||
@@ -188,38 +188,23 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
const int64_t n_seq_tokens = ubatch->n_seq_tokens;
|
||||
const int64_t n_seqs_unq = ubatch->n_seqs_unq;
|
||||
|
||||
if (cparams.embeddings && (
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK
|
||||
)) {
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
|
||||
)) {
|
||||
GGML_ASSERT(cls);
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
|
||||
|
||||
uint32_t * data = (uint32_t *) cls->data;
|
||||
memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
|
||||
|
||||
for (int i = 0; i < n_tokens; i += n_seq_tokens) {
|
||||
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[i][s];
|
||||
const int32_t seq_idx = ubatch->seq_idx[seq_id];
|
||||
std::vector<int> target_pos(n_seqs_unq, -1);
|
||||
std::vector<int> target_row(n_seqs_unq, -1);
|
||||
|
||||
data[seq_idx] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
|
||||
GGML_ASSERT(cls);
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
|
||||
|
||||
uint32_t * data = (uint32_t *) cls->data;
|
||||
memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
|
||||
|
||||
std::vector<int> last_pos(n_seqs_unq, -1);
|
||||
std::vector<int> last_row(n_seqs_unq, -1);
|
||||
bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_pos pos = ubatch->pos[i];
|
||||
@@ -228,16 +213,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
const llama_seq_id seq_id = ubatch->seq_id[i][s];
|
||||
const int32_t seq_idx = ubatch->seq_idx[seq_id];
|
||||
|
||||
if (pos >= last_pos[seq_idx]) {
|
||||
last_pos[seq_idx] = pos;
|
||||
last_row[seq_idx] = i;
|
||||
if (
|
||||
(target_pos[seq_idx] == -1) ||
|
||||
( last && pos >= target_pos[seq_idx]) ||
|
||||
(!last && pos < target_pos[seq_idx])
|
||||
) {
|
||||
target_pos[seq_idx] = pos;
|
||||
target_row[seq_idx] = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int s = 0; s < n_seqs_unq; ++s) {
|
||||
if (last_row[s] >= 0) {
|
||||
data[s] = last_row[s];
|
||||
if (target_row[s] >= 0) {
|
||||
data[s] = target_row[s];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2545,6 +2545,41 @@ struct test_scale : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SCALE + GGML_UNARY_OP_TANH + GGML_OP_SCALE
|
||||
struct test_softcap : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
float softcap;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "SOFTCAP";
|
||||
}
|
||||
|
||||
bool run_whole_graph() override { return true; }
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, softcap);
|
||||
}
|
||||
|
||||
test_softcap(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
||||
float softcap = 30.0f)
|
||||
: type(type), ne(ne), softcap(softcap) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
|
||||
ggml_set_param(a);
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * out = ggml_scale(ctx, ggml_tanh(ctx, ggml_scale(ctx, a, 1.0f / softcap)), softcap);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SILU_BACK
|
||||
struct test_silu_back : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -5421,6 +5456,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_add1());
|
||||
test_cases.emplace_back(new test_scale());
|
||||
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
|
||||
test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f));
|
||||
test_cases.emplace_back(new test_silu_back());
|
||||
|
||||
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) {
|
||||
|
||||
Reference in New Issue
Block a user