mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-16 07:45:46 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ac79caa7ce | ||
|
|
fdd109883d | ||
|
|
4196b477da | ||
|
|
ad39ccaa19 | ||
|
|
7dad2f1a17 | ||
|
|
e36a602ba3 | ||
|
|
38d546330a | ||
|
|
a1eb756c0b |
@@ -7,7 +7,7 @@ ARG APP_REVISION=N/A
|
||||
|
||||
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
ARG GGML_SYCL_F16=ON
|
||||
ARG LEVEL_ZERO_VERSION=1.28.2
|
||||
ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
|
||||
RUN apt-get update && \
|
||||
@@ -24,7 +24,8 @@ COPY . .
|
||||
|
||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
|
||||
&& export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
|
||||
fi && \
|
||||
echo "Building with dynamic libs" && \
|
||||
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
|
||||
|
||||
@@ -37,7 +37,7 @@ LLM inference in C/C++
|
||||
|
||||
Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
|
||||
|
||||
- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
|
||||
- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md)
|
||||
- Run with Docker - see our [Docker documentation](docs/docker.md)
|
||||
- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
|
||||
- Build from source by cloning this repository - check out [our build guide](docs/build.md)
|
||||
|
||||
@@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
|
||||
auto arg_name_parser = literal(prop_name);
|
||||
|
||||
common_peg_parser arg_value_parser = eps();
|
||||
auto string_value_parser = choice({
|
||||
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
|
||||
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
|
||||
});
|
||||
// Quoted literal as a value: normalize_quotes_to_json preserves escapes.
|
||||
auto string_value_parser = tool_arg_value(choice({
|
||||
literal("\"") + string_content('"') + literal("\""),
|
||||
literal("'") + string_content('\'') + literal("'")
|
||||
}));
|
||||
|
||||
if (is_string_type) {
|
||||
arg_value_parser = string_value_parser;
|
||||
|
||||
@@ -2678,10 +2678,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
||||
}
|
||||
return msg;
|
||||
}
|
||||
LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format),
|
||||
effective_input.substr(result.end).c_str());
|
||||
throw std::runtime_error(std::string("The model produced output that does not match the expected ") +
|
||||
common_chat_format_name(params.format) + " format");
|
||||
LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str());
|
||||
LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str());
|
||||
throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format");
|
||||
}
|
||||
|
||||
common_chat_msg msg;
|
||||
|
||||
@@ -253,6 +253,7 @@ When targeting an intel GPU, the user should expect one or more devices among th
|
||||
#### Intel GPU
|
||||
|
||||
```sh
|
||||
# Uses FP32, consider using FP16 for better performance in most cases
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
@@ -262,12 +263,12 @@ or
|
||||
# Export relevant ENV variables
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Option 2: Use FP16
|
||||
# Option 1: Use FP16 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||
|
||||
# Option 2: Use FP32
|
||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# build all binary
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
@@ -469,6 +470,7 @@ Choose one of following methods to build from source code.
|
||||
##### Option 1: Script
|
||||
|
||||
```sh
|
||||
# Uses FP32, consider using FP16 for better performance in most cases
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
@@ -479,11 +481,11 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
|
||||
```
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
# Option 1: Use FP16 (recommended for better performance in most cases)
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
|
||||
# Option 2: Or FP16
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||
# Option 2: Or FP32
|
||||
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
cmake --build build --config Release -j
|
||||
```
|
||||
@@ -491,10 +493,10 @@ cmake --build build --config Release -j
|
||||
Or, use CMake presets to build:
|
||||
|
||||
```sh
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-completion
|
||||
|
||||
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||
cmake --preset x64-windows-sycl-release
|
||||
cmake --build build-x64-windows-sycl-release -j --target llama-completion
|
||||
|
||||
cmake --preset x64-windows-sycl-debug
|
||||
|
||||
@@ -1,12 +1,40 @@
|
||||
# Install pre-built version of llama.cpp
|
||||
|
||||
| Install via | Windows | Mac | Linux |
|
||||
|-------------|---------|-----|-------|
|
||||
| Install via | Windows | Mac | Linux |
|
||||
|-------------|---------|------|-------|
|
||||
| conda-forge | ✅ | ✅ | ✅ |
|
||||
| Winget | ✅ | | |
|
||||
| Homebrew | | ✅ | ✅ |
|
||||
| MacPorts | | ✅ | |
|
||||
| Nix | | ✅ | ✅ |
|
||||
|
||||
## conda-forge (Windows, Mac and Linux)
|
||||
|
||||
conda-forge provides builds for:
|
||||
- CUDA (Windows and Linux)
|
||||
- Vulkan (Windows and Linux)
|
||||
- Apple Metal (macOS)
|
||||
|
||||
```sh
|
||||
conda install -c conda-forge llama-cpp
|
||||
```
|
||||
|
||||
```sh
|
||||
mamba install -c conda-forge llama-cpp
|
||||
```
|
||||
|
||||
```sh
|
||||
# Project-local installation
|
||||
pixi add llama-cpp
|
||||
|
||||
# Global installation
|
||||
pixi global install llama-cpp
|
||||
```
|
||||
|
||||
This distribution is managed on [`conda-forge/llama-cpp-feedstock`](https://github.com/conda-forge/llama.cpp-feedstock/).
|
||||
|
||||
Shall you have any problems, please open an issue on [its issue tracker](https://github.com/conda-forge/llama.cpp-feedstock/issues).
|
||||
|
||||
## Winget (Windows)
|
||||
|
||||
```sh
|
||||
|
||||
@@ -44,10 +44,10 @@ Legend:
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -89,7 +89,7 @@ Legend:
|
||||
| ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
@@ -118,6 +118,6 @@ Legend:
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
@@ -27,20 +27,20 @@
|
||||
"SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
@@ -69,20 +69,20 @@
|
||||
"SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
@@ -111,8 +111,8 @@
|
||||
"SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
|
||||
@@ -153,20 +153,20 @@
|
||||
"SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
|
||||
"SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","SYCL"
|
||||
"SYCL0","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","SYCL"
|
||||
"SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","SYCL"
|
||||
@@ -5105,6 +5105,7 @@
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=bf16,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","SYCL"
|
||||
@@ -5112,6 +5113,7 @@
|
||||
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT","type=bf16,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","1","yes","SYCL"
|
||||
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","1","yes","SYCL"
|
||||
@@ -9748,10 +9750,10 @@
|
||||
"SYCL0","COS","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","SYCL"
|
||||
"SYCL0","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
|
||||
"SYCL0","SQR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","SQR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
@@ -9766,14 +9768,14 @@
|
||||
"SYCL0","CLAMP","type=f16,ne=[1024,1024,1,1],min=-0.500000,max=0.500000","support","0","no","SYCL"
|
||||
"SYCL0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","SYCL"
|
||||
"SYCL0","LEAKY_RELU","type=f16,ne_a=[1024,1024,1,1],negative_slope=0.100000","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
|
||||
"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
|
||||
"SYCL0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
|
||||
"SYCL0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","SYCL"
|
||||
"SYCL0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
|
||||
|
||||
|
Can't render this file because it is too large.
|
@@ -287,6 +287,13 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
|
||||
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
|
||||
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
|
||||
main_stream);
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) {
|
||||
op()((const sycl::ext::oneapi::bfloat16 *) src0->data, (const sycl::ext::oneapi::bfloat16 *) src1->data,
|
||||
(sycl::ext::oneapi::bfloat16 *) dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2,
|
||||
ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, ggml_is_contiguous(src0),
|
||||
ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
|
||||
#endif
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include "concat.hpp"
|
||||
|
||||
static inline size_t elem_size(ggml_type t) {
|
||||
@@ -192,11 +194,29 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
case GGML_TYPE_F32:
|
||||
concat_impl_sycl<float>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
concat_impl_sycl<sycl::half>(ctx, dst);
|
||||
break;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
case GGML_TYPE_BF16:
|
||||
concat_impl_sycl<sycl::ext::oneapi::bfloat16>(ctx, dst);
|
||||
break;
|
||||
#endif
|
||||
case GGML_TYPE_I32:
|
||||
concat_impl_sycl<int32_t>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_I16:
|
||||
concat_impl_sycl<int16_t>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_I64:
|
||||
concat_impl_sycl<int64_t>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_I8:
|
||||
concat_impl_sycl<int8_t>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type");
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s\n", __func__, ggml_type_name(dst->type));
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1022,6 +1022,120 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_k_reorder(const void *__restrict__ vx,
|
||||
const float *__restrict__ yy,
|
||||
float *__restrict__ dst,
|
||||
const int ncols, int nrows,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
const int row = item_ct1.get_group(2);
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
// SOA base pointers for the reordered layout:
|
||||
// [qs: nb * QK_K/2] [qh: nb * QK_K/8] [scales: nb * K_SCALE_SIZE] [dm: nb * sizeof(half2)]
|
||||
const int nb = nrows * num_blocks_per_row;
|
||||
const uint8_t * qs_base = (const uint8_t *)vx;
|
||||
const uint8_t * qh_base = qs_base + (size_t)nb * (QK_K / 2);
|
||||
const uint8_t * scales_base = qh_base + (size_t)nb * (QK_K / 8);
|
||||
const sycl::half2 * dm_base = (const sycl::half2 *)(scales_base + (size_t)nb * K_SCALE_SIZE);
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
|
||||
const int tid = item_ct1.get_local_id(2) / 2; // 0...15
|
||||
const int ix = item_ct1.get_local_id(2) % 2;
|
||||
|
||||
const int il = tid/4; // 0...3
|
||||
const int ir = tid - 4*il;// 0...3
|
||||
const int n = 2;
|
||||
|
||||
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
||||
const int in = il%2;
|
||||
|
||||
const int l0 = n*(2*ir + in);
|
||||
const int q_offset = 32*im + l0;
|
||||
const int y_offset = 64*im + l0;
|
||||
|
||||
const uint8_t hm1 = 1 << (2*im);
|
||||
const uint8_t hm2 = hm1 << 4;
|
||||
|
||||
uint16_t aux[4];
|
||||
const uint8_t * sc = (const uint8_t *)aux;
|
||||
|
||||
uint16_t q16[8];
|
||||
const uint8_t * q4 = (const uint8_t *)q16;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2) {
|
||||
const int bi = ib0 + i;
|
||||
|
||||
const uint8_t * ql1 = qs_base + bi * (QK_K / 2) + q_offset;
|
||||
const uint8_t * qh = qh_base + bi * (QK_K / 8) + l0;
|
||||
const float * y1 = yy + i*QK_K + y_offset;
|
||||
const float * y2 = y1 + 128;
|
||||
|
||||
const sycl::half2 dm_val = dm_base[bi];
|
||||
const float dall = dm_val[0];
|
||||
const float dmin = dm_val[1];
|
||||
|
||||
const uint16_t * a = (const uint16_t *)(scales_base + bi * K_SCALE_SIZE);
|
||||
aux[0] = a[im+0] & kmask1;
|
||||
aux[1] = a[im+2] & kmask1;
|
||||
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
|
||||
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
|
||||
|
||||
sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
|
||||
float smin = 0;
|
||||
const uint16_t * q1 = (const uint16_t *)ql1;
|
||||
const uint16_t * q2 = q1 + 32;
|
||||
q16[0] = q1[0] & 0x0f0f;
|
||||
q16[1] = q1[8] & 0x0f0f;
|
||||
q16[2] = (q1[0] >> 4) & 0x0f0f;
|
||||
q16[3] = (q1[8] >> 4) & 0x0f0f;
|
||||
q16[4] = q2[0] & 0x0f0f;
|
||||
q16[5] = q2[8] & 0x0f0f;
|
||||
q16[6] = (q2[0] >> 4) & 0x0f0f;
|
||||
q16[7] = (q2[8] >> 4) & 0x0f0f;
|
||||
for (int l = 0; l < n; ++l) {
|
||||
sum.x() +=
|
||||
y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
|
||||
y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
|
||||
sum.y() +=
|
||||
y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
|
||||
y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
|
||||
sum.z() +=
|
||||
y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
|
||||
y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
|
||||
sum.w() +=
|
||||
y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
|
||||
y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
|
||||
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
|
||||
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
||||
}
|
||||
tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
|
||||
sum.w() * sc[5]) -
|
||||
dmin * smin;
|
||||
}
|
||||
#else
|
||||
// The reordered Q5_K layout is only produced for QK_K == 256.
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
#pragma unroll
|
||||
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
|
||||
tmp +=
|
||||
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
|
||||
}
|
||||
|
||||
if (item_ct1.get_local_id(2) == 0) {
|
||||
dst[row] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
@@ -1599,6 +1713,19 @@ static void dequantize_mul_mat_vec_q4_K_sycl_reorder(const void *vx, const float
|
||||
});
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_K_sycl_reorder(const void *vx, const float *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
|
||||
dequantize_mul_mat_vec_q5_k_reorder(vx, y, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_K_sycl_reorder(const void *vx, const float *y,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
@@ -1695,7 +1822,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
dequantize_mul_mat_vec_q5_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
} else {
|
||||
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
|
||||
@@ -124,6 +124,11 @@ static __dpct_inline__ T op_exp(T x) {
|
||||
return sycl::exp(x);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static __dpct_inline__ T op_expm1(T x) {
|
||||
return sycl::expm1(x);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static __dpct_inline__ T op_log(T x) {
|
||||
if (x <= static_cast<T>(0)) {
|
||||
@@ -266,13 +271,6 @@ static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl:
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
dst[i] = op_floor(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
@@ -280,20 +278,6 @@ static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
dst[i] = op_round(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
|
||||
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
|
||||
dst[i] = op_trunc(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void clamp(const T * x, T * dst, const float min, const float max, const int k,
|
||||
const sycl::nd_item<1> &item_ct1) {
|
||||
@@ -605,6 +589,12 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_expm1(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
@@ -728,16 +718,9 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1);
|
||||
});
|
||||
});
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_floor(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
@@ -747,29 +730,15 @@ static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tenso
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1);
|
||||
});
|
||||
});
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_round(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
|
||||
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
|
||||
const int num_blocks = ceil_div(k_elements, 256);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
|
||||
sycl::range<1>(256)),
|
||||
[=](sycl::nd_item<1> item_ct1) {
|
||||
unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1);
|
||||
});
|
||||
});
|
||||
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
|
||||
return op_trunc(x);
|
||||
});
|
||||
}
|
||||
|
||||
static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
@@ -1018,6 +987,11 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
ggml_sycl_op_exp(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_expm1(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
|
||||
ggml_sycl_op_log(ctx, dst);
|
||||
|
||||
@@ -59,6 +59,8 @@ void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -3685,6 +3685,149 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reorder each expert slice into a self-contained SoA layout.
|
||||
static bool reorder_qw_q4_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(expert_bytes % sizeof(block_q4_K) == 0);
|
||||
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q4_K));
|
||||
const size_t total_bytes = expert_bytes * (size_t) n_expert;
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, total_bytes);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
const int total_blocks = blocks_per_expert * (int) n_expert;
|
||||
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
|
||||
const int gb = gb_;
|
||||
const int e = gb / blocks_per_expert;
|
||||
const int ib = gb % blocks_per_expert;
|
||||
const block_q4_K * x = (const block_q4_K *) (tmp_buf + (size_t) e * expert_bytes);
|
||||
uint8_t * base = data_device + (size_t) e * expert_bytes;
|
||||
|
||||
auto * qs_ptr = base;
|
||||
auto * scales_ptr = qs_ptr + QK_K / 2 * blocks_per_expert;
|
||||
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert);
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
||||
}
|
||||
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
||||
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
||||
}
|
||||
dm_ptr[ib] = x[ib].dm;
|
||||
});
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reorder each Q5_K expert slice into [qs][qh][scales][dm].
|
||||
static bool reorder_qw_q5_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(expert_bytes % sizeof(block_q5_K) == 0);
|
||||
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q5_K));
|
||||
const size_t total_bytes = expert_bytes * (size_t) n_expert;
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, total_bytes);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
const int total_blocks = blocks_per_expert * (int) n_expert;
|
||||
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
|
||||
const int gb = gb_;
|
||||
const int e = gb / blocks_per_expert;
|
||||
const int ib = gb % blocks_per_expert;
|
||||
const block_q5_K * x = (const block_q5_K *) (tmp_buf + (size_t) e * expert_bytes);
|
||||
uint8_t * base = data_device + (size_t) e * expert_bytes;
|
||||
|
||||
auto * qs_ptr = base;
|
||||
auto * qh_ptr = qs_ptr + (QK_K / 2) * blocks_per_expert;
|
||||
auto * scales_ptr = qh_ptr + (QK_K / 8) * blocks_per_expert;
|
||||
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert);
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
||||
}
|
||||
for (int j = 0; j < QK_K / 8; ++j) {
|
||||
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
|
||||
}
|
||||
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
||||
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
||||
}
|
||||
dm_ptr[ib] = x[ib].dm;
|
||||
});
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reorder each Q6_K expert slice into [ql][qh][scales][d].
|
||||
static bool reorder_qw_q6_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(expert_bytes % sizeof(block_q6_K) == 0);
|
||||
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q6_K));
|
||||
const size_t total_bytes = expert_bytes * (size_t) n_expert;
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, total_bytes);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
const int total_blocks = blocks_per_expert * (int) n_expert;
|
||||
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
|
||||
const int gb = gb_;
|
||||
const int e = gb / blocks_per_expert;
|
||||
const int ib = gb % blocks_per_expert;
|
||||
const block_q6_K * x = (const block_q6_K *) (tmp_buf + (size_t) e * expert_bytes);
|
||||
uint8_t * base = data_device + (size_t) e * expert_bytes;
|
||||
|
||||
auto * ql_ptr = base;
|
||||
auto * qh_ptr = ql_ptr + (QK_K / 2) * blocks_per_expert;
|
||||
auto * scales_ptr = qh_ptr + (QK_K / 4) * blocks_per_expert;
|
||||
auto * d_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * blocks_per_expert);
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
ql_ptr[ib * (QK_K / 2) + j] = x[ib].ql[j];
|
||||
}
|
||||
for (int j = 0; j < QK_K / 4; ++j) {
|
||||
qh_ptr[ib * (QK_K / 4) + j] = x[ib].qh[j];
|
||||
}
|
||||
for (int j = 0; j < QK_K / 16; ++j) {
|
||||
scales_ptr[ib * (QK_K / 16) + j] = x[ib].scales[j];
|
||||
}
|
||||
d_ptr[ib] = x[ib].d;
|
||||
});
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(size % sizeof(block_q3_K) == 0);
|
||||
GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
|
||||
@@ -3840,6 +3983,22 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
size_t nrows = src0->ne[1];
|
||||
size_t size = ggml_nbytes(src0);
|
||||
|
||||
// MoE expert weights are addressed per expert via nb[2], so each slice must
|
||||
// remain self-contained after reorder.
|
||||
if (src0->ne[2] > 1) {
|
||||
GGML_ASSERT((size_t) size == (size_t) src0->ne[2] * src0->nb[2]);
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_K:
|
||||
return reorder_qw_q4_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
|
||||
case GGML_TYPE_Q5_K:
|
||||
return reorder_qw_q5_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
|
||||
case GGML_TYPE_Q6_K:
|
||||
return reorder_qw_q6_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
|
||||
@@ -3854,7 +4013,6 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
case GGML_TYPE_Q6_K:
|
||||
return reorder_qw_q6_k(data_device, size, 0, stream);
|
||||
default:
|
||||
GGML_ABORT("reorder_qw() called with unsupported type");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -3902,6 +4060,23 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
|
||||
}
|
||||
}
|
||||
|
||||
// Lazily reorder supported MoE expert weights once their fused path is used.
|
||||
static void opt_for_reorder_id(ggml_backend_sycl_context * ctx, const ggml_tensor * src0) {
|
||||
if (g_ggml_sycl_disable_optimize || !ctx->opt_feature.reorder) {
|
||||
return;
|
||||
}
|
||||
if (src0->type != GGML_TYPE_Q4_K && src0->type != GGML_TYPE_Q5_K && src0->type != GGML_TYPE_Q6_K) {
|
||||
return;
|
||||
}
|
||||
ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
|
||||
if (!extra || extra->optimized_feature.reorder) {
|
||||
return;
|
||||
}
|
||||
if (reorder_qw(src0, ctx->stream())) {
|
||||
extra->optimized_feature.reorder = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
// The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
|
||||
@@ -4067,11 +4242,6 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
||||
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
|
||||
if (!ggml_is_contiguous(src1)) return false;
|
||||
|
||||
// Reorder layout not supported; fall back.
|
||||
const ggml_tensor_extra_gpu * src0_extra =
|
||||
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
|
||||
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
|
||||
|
||||
const int64_t n_ids_per_group = ids->ne[0];
|
||||
if (ids->ne[1] != 1) return false;
|
||||
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
|
||||
@@ -4081,16 +4251,37 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
|
||||
const int n_experts_used = (int) n_ids_per_group;
|
||||
const int nrows = (int) src0->ne[1];
|
||||
|
||||
// Lazily reorder the (Q4_K) expert weights into a per-expert SoA layout, then run the reorder
|
||||
// GEMV. Placed after the bail checks so a non-dispatchable op does not pay the reorder cost.
|
||||
opt_for_reorder_id(&ctx, src0);
|
||||
const ggml_tensor_extra_gpu * src0_extra =
|
||||
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
|
||||
const bool use_reorder = src0_extra && src0_extra->optimized_feature.reorder;
|
||||
|
||||
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
|
||||
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
|
||||
char * src1_ddq = src1_q8_alloc.get();
|
||||
quantize_row_q8_1_sycl<quantize_q8_1>(
|
||||
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
||||
src1_padded_cols, stream);
|
||||
if (use_reorder) {
|
||||
quantize_row_q8_1_sycl<quantize_and_reorder_q8_1_soa>(
|
||||
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
||||
src1_padded_cols, stream);
|
||||
} else {
|
||||
quantize_row_q8_1_sycl<quantize_q8_1>(
|
||||
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
|
||||
src1_padded_cols, stream);
|
||||
}
|
||||
|
||||
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
|
||||
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
|
||||
|
||||
if (use_reorder) {
|
||||
return ggml_sycl_mul_mat_vec_q_id_reorder(
|
||||
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
|
||||
(float *) dst->data, (int) ne10, nrows, n_experts_used,
|
||||
/*expert_weight_stride=*/ src0->nb[2],
|
||||
/*dst_row_stride=*/ dst->nb[1],
|
||||
src1_row_stride, stream);
|
||||
}
|
||||
return ggml_sycl_mul_mat_vec_q_id(
|
||||
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
|
||||
(float *) dst->data, (int) ne10, nrows, n_experts_used,
|
||||
@@ -4489,6 +4680,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_EXP:
|
||||
ggml_sycl_exp(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
ggml_sycl_expm1(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
ggml_sycl_softplus(ctx, dst);
|
||||
break;
|
||||
@@ -5138,6 +5332,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_EXPM1:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
@@ -5145,11 +5340,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_FLOOR:
|
||||
case GGML_UNARY_OP_ROUND:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
#if defined (GGML_SYCL_F16)
|
||||
return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
|
||||
#else
|
||||
return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
|
||||
#endif
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2468,3 +2468,118 @@ bool ggml_sycl_mul_mat_vec_q_id(
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reorder (SoA) MoE expert GEMV: MoE expert/row/lane indexing (from mul_mat_vec_q_moe) with the
|
||||
// dense-reorder per-block reads (from mul_mat_vec_q_reorder). Each expert slice in vx_base is a
|
||||
// self-contained SoA, so nblocks = nrows*(ncols/qk) per expert and the constant expert stride holds.
|
||||
template <typename reorder_vec_dot_q_sycl>
|
||||
static void mul_mat_vec_q_moe_reorder(
|
||||
const void * __restrict__ vx_base, const void * __restrict__ vy_base,
|
||||
float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
|
||||
const int ncols, const int nrows,
|
||||
const size_t expert_weight_stride, const size_t dst_row_stride,
|
||||
const size_t src1_row_stride,
|
||||
const sycl::nd_item<3> & item_ct1) {
|
||||
using block_type = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
|
||||
using block_traits = typename block_type::traits;
|
||||
|
||||
const int expert_idx = item_ct1.get_group(1);
|
||||
const int i02 = ids_dev[expert_idx];
|
||||
|
||||
const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
|
||||
const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
|
||||
float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
|
||||
|
||||
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto sg = item_ct1.get_sub_group();
|
||||
|
||||
const int blocks_per_row = ncols / block_traits::qk;
|
||||
constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
|
||||
constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
|
||||
const int nblocks = nrows * (ncols / block_traits::qk);
|
||||
|
||||
static_assert(blocks_per_subgroup > 0);
|
||||
static_assert(block_elements_per_subgroup > 0);
|
||||
|
||||
float partial_sum = 0.0f;
|
||||
for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
|
||||
const int ibx = row * blocks_per_row + i;
|
||||
|
||||
const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
|
||||
const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx);
|
||||
|
||||
const int iby = i * block_type::block_to_q8_1_ratio();
|
||||
const int8_t * q8_1_quant_ptr = (const int8_t *) vy + iby * QK8_1;
|
||||
const sycl::half2 * q8_1_ds_ptr = (const sycl::half2 *) ((const char *) vy + ncols + iby * sizeof(sycl::half2));
|
||||
|
||||
#pragma unroll
|
||||
for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
|
||||
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
|
||||
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
|
||||
}
|
||||
}
|
||||
|
||||
auto sum = sycl::reduce_over_group(sg, partial_sum, std::plus<>());
|
||||
if (sg.leader()) {
|
||||
dst[row] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename reorder_vec_dot_q_sycl>
|
||||
static void launch_mul_mat_vec_q_moe_reorder(
|
||||
const void * vx_base, const void * vy, const int32_t * ids_dev,
|
||||
float * dst_base, const int ncols, const int nrows, const int n_experts_used,
|
||||
const size_t expert_weight_stride, const size_t dst_row_stride,
|
||||
const size_t src1_row_stride,
|
||||
dpct::queue_ptr stream) {
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl>(
|
||||
vx_base, vy, dst_base, ids_dev, ncols, nrows,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
bool ggml_sycl_mul_mat_vec_q_id_reorder(
|
||||
enum ggml_type src0_type,
|
||||
const void * vx_base,
|
||||
const void * vy,
|
||||
const int32_t * ids_dev,
|
||||
float * dst_base,
|
||||
int ncols,
|
||||
int nrows,
|
||||
int n_experts_used,
|
||||
size_t expert_weight_stride,
|
||||
size_t dst_row_stride,
|
||||
size_t src1_row_stride,
|
||||
dpct::queue_ptr stream) {
|
||||
switch (src0_type) {
|
||||
case GGML_TYPE_Q4_K:
|
||||
launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q5_K:
|
||||
launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
case GGML_TYPE_Q6_K:
|
||||
launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(
|
||||
vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
|
||||
expert_weight_stride, dst_row_stride, src1_row_stride, stream);
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,4 +40,21 @@ bool ggml_sycl_mul_mat_vec_q_id(
|
||||
size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
|
||||
dpct::queue_ptr stream);
|
||||
|
||||
// Reorder (SoA) variant of the fused MoE expert GEMV.
|
||||
// vx_base: each expert slice (stride expert_weight_stride == src0->nb[2]) is a self-contained reorder/SoA layout.
|
||||
// vy: src1 quantized with quantize_and_reorder_q8_1_soa (per-row SoA). Returns false if src0_type isn't handled.
|
||||
bool ggml_sycl_mul_mat_vec_q_id_reorder(
|
||||
enum ggml_type src0_type,
|
||||
const void * vx_base,
|
||||
const void * vy,
|
||||
const int32_t * ids_dev,
|
||||
float * dst_base,
|
||||
int ncols,
|
||||
int nrows,
|
||||
int n_experts_used,
|
||||
size_t expert_weight_stride,
|
||||
size_t dst_row_stride,
|
||||
size_t src1_row_stride,
|
||||
dpct::queue_ptr stream);
|
||||
|
||||
#endif // GGML_SYCL_MMVQ_HPP
|
||||
|
||||
@@ -902,6 +902,9 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_col2im_1d_f32;
|
||||
vk_pipeline pipeline_col2im_1d_f16;
|
||||
vk_pipeline pipeline_col2im_1d_bf16;
|
||||
vk_pipeline pipeline_snake_f32;
|
||||
vk_pipeline pipeline_snake_f16;
|
||||
vk_pipeline pipeline_snake_bf16;
|
||||
@@ -1552,6 +1555,16 @@ struct vk_op_timestep_embedding_push_constants {
|
||||
uint32_t max_period;
|
||||
};
|
||||
|
||||
struct vk_op_col2im_1d_push_constants {
|
||||
uint32_t T_out;
|
||||
uint32_t OC;
|
||||
uint32_t K_OC;
|
||||
uint32_t T_in;
|
||||
uint32_t K;
|
||||
int32_t stride;
|
||||
int32_t p0;
|
||||
};
|
||||
|
||||
struct vk_op_conv_transpose_1d_push_constants {
|
||||
uint32_t Cout;
|
||||
uint32_t Cin;
|
||||
@@ -5203,6 +5216,9 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f32, "col2im_1d_f32", col2im_1d_f32_len, col2im_1d_f32_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f16, "col2im_1d_f16", col2im_1d_f16_len, col2im_1d_f16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_bf16, "col2im_1d_bf16", col2im_1d_bf16_len, col2im_1d_bf16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
@@ -10702,6 +10718,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_conv_transpose_1d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: return ctx->device->pipeline_col2im_1d_f32;
|
||||
case GGML_TYPE_F16: return ctx->device->pipeline_col2im_1d_f16;
|
||||
case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16;
|
||||
default: return nullptr;
|
||||
}
|
||||
case GGML_OP_POOL_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_pool2d_f32;
|
||||
@@ -11147,6 +11170,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
{
|
||||
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
|
||||
} break;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
{
|
||||
elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 };
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
const uint32_t N = dst->ne[3];
|
||||
@@ -12936,6 +12963,32 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
|
||||
}
|
||||
|
||||
static void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
// src0: [K_OC, T_in] columns from matmul
|
||||
// dst: [T_out, OC]
|
||||
|
||||
const int32_t stride = dst->op_params[0];
|
||||
const int32_t oc = dst->op_params[1];
|
||||
const int32_t p0 = dst->op_params[2];
|
||||
|
||||
const uint32_t K_OC = static_cast<uint32_t>(src0->ne[0]);
|
||||
const uint32_t T_in = static_cast<uint32_t>(src0->ne[1]);
|
||||
const uint32_t T_out = static_cast<uint32_t>(dst->ne[0]);
|
||||
const uint32_t OC = static_cast<uint32_t>(oc);
|
||||
const uint32_t K = K_OC / OC;
|
||||
|
||||
vk_op_col2im_1d_push_constants p{};
|
||||
p.T_out = T_out;
|
||||
p.OC = OC;
|
||||
p.K_OC = K_OC;
|
||||
p.T_in = T_in;
|
||||
p.K = K;
|
||||
p.stride = stride;
|
||||
p.p0 = p0;
|
||||
|
||||
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p));
|
||||
}
|
||||
|
||||
// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
|
||||
// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
|
||||
// dedicated kernel directly. The pattern is validated by
|
||||
@@ -14423,6 +14476,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
ggml_vk_col2im_1d(ctx, compute_ctx, src0, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
|
||||
@@ -17188,6 +17245,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_COL2IM_1D:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 ||
|
||||
op->src[0]->type == GGML_TYPE_F16 ||
|
||||
op->src[0]->type == GGML_TYPE_BF16) &&
|
||||
op->type == op->src[0]->type &&
|
||||
ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op);
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
@@ -18019,6 +18083,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
const int32_t p0 = tensor->op_params[1];
|
||||
const int32_t d0 = tensor->op_params[2];
|
||||
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
|
||||
} else if (tensor->op == GGML_OP_COL2IM_1D) {
|
||||
const int32_t stride = tensor->op_params[0];
|
||||
const int32_t oc = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
tensor_clone = ggml_col2im_1d(ggml_ctx, src_clone[0], stride, oc, p0);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
|
||||
61
ggml/src/ggml-vulkan/vulkan-shaders/col2im_1d.comp
Normal file
61
ggml/src/ggml-vulkan/vulkan-shaders/col2im_1d.comp
Normal file
@@ -0,0 +1,61 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // columns: [K_OC, T_in]
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; // output: [T_out, OC]
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint32_t T_out;
|
||||
uint32_t OC;
|
||||
uint32_t K_OC;
|
||||
uint32_t T_in;
|
||||
uint32_t K;
|
||||
int32_t stride;
|
||||
int32_t p0;
|
||||
} p;
|
||||
|
||||
// Load A_TYPE to float
|
||||
float load_col(uint32_t idx) {
|
||||
#if defined(DATA_A_BF16)
|
||||
return bf16_to_fp32(uint32_t(data_a[idx]));
|
||||
#else
|
||||
return float(data_a[idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Store float as D_TYPE
|
||||
void store_dst(uint32_t idx, float v) {
|
||||
#if defined(DATA_A_BF16)
|
||||
data_d[idx] = D_TYPE(fp32_to_bf16(v));
|
||||
#else
|
||||
data_d[idx] = D_TYPE(v);
|
||||
#endif
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint32_t t_out = gl_GlobalInvocationID.x;
|
||||
const uint32_t oc = gl_GlobalInvocationID.y;
|
||||
if (t_out >= p.T_out || oc >= p.OC) return;
|
||||
|
||||
const int32_t t_abs = int32_t(t_out) + p.p0; // absolute position in uncropped signal
|
||||
|
||||
// Gather: only the ceil(K/stride) columns that scatter into t_abs, no modulo
|
||||
int32_t t_in_min = (t_abs - int32_t(p.K) + p.stride) / p.stride;
|
||||
if (t_in_min < 0) t_in_min = 0;
|
||||
int32_t t_in_max = t_abs / p.stride;
|
||||
if (t_in_max >= int32_t(p.T_in)) t_in_max = int32_t(p.T_in) - 1;
|
||||
|
||||
float val = 0.0;
|
||||
for (int32_t t_in = t_in_min; t_in <= t_in_max; t_in++) {
|
||||
int32_t k = t_abs - t_in * p.stride;
|
||||
// col layout: [K_OC, T_in], column index = oc * K + k
|
||||
uint32_t col_idx = (oc * p.K + uint32_t(k)) + uint32_t(t_in) * p.K_OC;
|
||||
val += load_col(col_idx);
|
||||
}
|
||||
|
||||
// dst layout: [T_out, OC], element (t_out, oc) = t_out + oc * T_out
|
||||
store_dst(t_out + oc * p.T_out, val);
|
||||
}
|
||||
@@ -1003,6 +1003,9 @@ void process_shaders() {
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("col2im_1d_f32", "col2im_1d.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("col2im_1d_f16", "col2im_1d.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("col2im_1d_bf16", "col2im_1d.comp", {{"DATA_A_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
|
||||
|
||||
string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
|
||||
@@ -1882,11 +1882,29 @@ static void test_lfm2_parser(const std::string & template_path, bool detailed_de
|
||||
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
|
||||
.run();
|
||||
|
||||
// Python tool with multiline code in string
|
||||
// Python tool with multiline code in string: the \n in the literal decodes to a real
|
||||
// newline, emitted as a JSON \n escape (not a doubled backslash).
|
||||
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
|
||||
{ "python", R"#({"code": "def hello():\n print('hey')"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
// String escape sequences decode to their actual characters (newline + tab here),
|
||||
// so a "write a two line file" style call produces real line breaks, not literal "\n".
|
||||
tst.test("<|tool_call_start|>[python(code=\"First line\\nSecond line\\tindented\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "First line\nSecond line\tindented"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
// Escaped quotes inside a string argument survive the round-trip.
|
||||
tst.test("<|tool_call_start|>[python(code=\"print(\\\"hi\\\")\")]<|tool_call_end|>")
|
||||
.tools({ python_tool })
|
||||
.expect_tool_calls({
|
||||
{ "python", R"#({"code": "print(\"hi\")"})#", "" }
|
||||
})
|
||||
.run();
|
||||
|
||||
|
||||
@@ -96,16 +96,15 @@ struct mtmd_image_tokens {
|
||||
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
|
||||
return (nx + 1) * ny + 2;
|
||||
}
|
||||
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
|
||||
if (batch_f32.entries.size() == 1 || n_temporal_merge == 1) {
|
||||
return nx * ny;
|
||||
}
|
||||
uint32_t nz = batch_f32.entries.size();
|
||||
// TODO: simplify this by repeating the last frame until it fits the temporal merge
|
||||
if (nz % n_temporal_merge != 0) {
|
||||
nz = nz / n_temporal_merge + 1;
|
||||
} else {
|
||||
nz = nz / n_temporal_merge;
|
||||
if (n_temporal_merge > 1) {
|
||||
// [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future
|
||||
// TODO: simplify this by repeating the last frame until it fits the temporal merge
|
||||
if (nz % n_temporal_merge != 0) {
|
||||
nz = nz / n_temporal_merge + 1;
|
||||
} else {
|
||||
nz = nz / n_temporal_merge;
|
||||
}
|
||||
}
|
||||
return nx * ny * nz;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user