mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-04-26 04:49:43 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b760272f1a | ||
|
|
dcad77cc3b | ||
|
|
98dc1418ea | ||
|
|
9725a313be | ||
|
|
d1649047a3 | ||
|
|
9d34231bb8 | ||
|
|
8ea8fee966 | ||
|
|
eddd7a13a5 |
2
.github/pull_request_template.md
vendored
2
.github/pull_request_template.md
vendored
@@ -6,7 +6,7 @@
|
||||
|
||||
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
|
||||
|
||||
# Requirements
|
||||
## Requirements
|
||||
|
||||
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
|
||||
|
||||
|
||||
12
.gitignore
vendored
12
.gitignore
vendored
@@ -34,7 +34,6 @@
|
||||
/.vscode/
|
||||
/nppBackup
|
||||
|
||||
|
||||
# Coverage
|
||||
|
||||
/gcovr-report/
|
||||
@@ -74,6 +73,7 @@
|
||||
!/models/templates
|
||||
|
||||
# Zig
|
||||
|
||||
/zig-out/
|
||||
/zig-cache/
|
||||
|
||||
@@ -93,6 +93,7 @@
|
||||
!/examples/sycl/*.sh
|
||||
|
||||
# Server Web UI temporary files
|
||||
|
||||
/tools/server/webui/node_modules
|
||||
/tools/server/webui/dist
|
||||
# we no longer use gz for index.html
|
||||
@@ -106,9 +107,11 @@ __pycache__/
|
||||
poetry.toml
|
||||
|
||||
# Nix
|
||||
|
||||
/result
|
||||
|
||||
# Test binaries
|
||||
|
||||
/tests/test-backend-ops
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
@@ -124,6 +127,7 @@ poetry.toml
|
||||
/tests/test-tokenizer-1-spm
|
||||
|
||||
# Scripts
|
||||
|
||||
!/scripts/install-oneapi.bat
|
||||
|
||||
# Generated by scripts
|
||||
@@ -132,18 +136,24 @@ poetry.toml
|
||||
/wikitext-2-raw/
|
||||
|
||||
# Test models for lora adapters
|
||||
|
||||
/lora-tests
|
||||
|
||||
# Local scripts
|
||||
|
||||
/run-vim.sh
|
||||
/run-chat.sh
|
||||
/run-spec.sh
|
||||
/.ccache/
|
||||
|
||||
# IDE
|
||||
|
||||
/*.code-workspace
|
||||
/.windsurf/
|
||||
# emscripten
|
||||
a.out.*
|
||||
|
||||
# AGENTS
|
||||
|
||||
AGENTS.local.md
|
||||
.pi/SYSTEM.md
|
||||
|
||||
33
.pi/gg/SYSTEM.md
Normal file
33
.pi/gg/SYSTEM.md
Normal file
@@ -0,0 +1,33 @@
|
||||
You are a coding agent. Here are some very important rules that you must follow:
|
||||
|
||||
General:
|
||||
- By very precise and concise when writing code, comments, explanations, etc.
|
||||
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
|
||||
- Don't try to build or run the code unless you are explicitly asked to do so
|
||||
|
||||
Coding:
|
||||
- When in doubt, always refer to the CONTRIBUTING.md file of the project
|
||||
- When referencing issues or PRs in comments, use the format:
|
||||
- C/C++ code: `// ref: <url>`
|
||||
- Other (CMake, etc.): `# ref: <url>`
|
||||
|
||||
Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
|
||||
- Do not explicitly set the git author in commits - rely on the default git config
|
||||
|
||||
Resources (read on demand):
|
||||
- [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
- [Build documentation](docs/build.md)
|
||||
- [Server usage documentation](tools/server/README.md)
|
||||
- [Server development documentation](tools/server/README-dev.md)
|
||||
- [PEG parser](docs/development/parsing.md)
|
||||
- [Auto parser](docs/autoparser.md)
|
||||
- [Jinja engine](common/jinja/README.md)
|
||||
- [PR template](.github/pull_request_template.md)
|
||||
@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
|
||||
});
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
|
||||
});
|
||||
// try the more aggressive parse first, if it fails, fall back to the delimiter one
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
if (result.result.success()) {
|
||||
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
start = trim_leading_whitespace(result.tags["pre"]);
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
start = result.tags["pre"];
|
||||
end = result.tags["post"];
|
||||
} else if (!result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
end = result.tags["post"];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,18 +61,26 @@ static bool common_speculative_are_compatible(
|
||||
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
|
||||
|
||||
if (vocab_type_tgt != vocab_type_dft) {
|
||||
LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
|
||||
LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
||||
LOG_WRN("%s: draft model vocab type must match target model to use speculation but "
|
||||
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (
|
||||
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
|
||||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
|
||||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
|
||||
) {
|
||||
LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
|
||||
LOG_WRN("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
|
||||
__func__,
|
||||
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
|
||||
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
|
||||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
|
||||
LOG_WRN("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
|
||||
__func__,
|
||||
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
|
||||
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -51,6 +51,12 @@ The packages for FP32 and FP16 would have different accuracy and performance on
|
||||
|
||||
## News
|
||||
|
||||
- 2026.04
|
||||
|
||||
- Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0.
|
||||
- Fused MoE.
|
||||
- Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package.
|
||||
|
||||
- 2026.03
|
||||
- Support Flash-Attention: less memory usage, performance impact depends on LLM.
|
||||
|
||||
@@ -349,6 +355,12 @@ Choose one of following methods to run.
|
||||
./examples/sycl/test.sh
|
||||
```
|
||||
|
||||
- Run llama-server:
|
||||
|
||||
```sh
|
||||
./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
|
||||
```
|
||||
|
||||
2. Command line
|
||||
Launch inference
|
||||
|
||||
@@ -637,10 +649,18 @@ Choose one of following methods to run.
|
||||
|
||||
1. Script
|
||||
|
||||
- Run test:
|
||||
|
||||
```
|
||||
examples\sycl\win-test.bat
|
||||
```
|
||||
|
||||
- Run llama-server:
|
||||
|
||||
```
|
||||
examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
||||
Launch inference
|
||||
|
||||
124
examples/sycl/start-svr.sh
Executable file
124
examples/sycl/start-svr.sh
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/bin/bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
Help() {
|
||||
cat << EOF
|
||||
Usage: $(basename "$0") [OPTIONS]
|
||||
|
||||
This script processes files with specified options.
|
||||
|
||||
Options:
|
||||
-h, --help Display this help message and exit.
|
||||
-c, --context <value> Set context length. Bigger need more memory.
|
||||
-p, --promote <value> Prompt to start generation with.
|
||||
-m, --model <value> Full model file path.
|
||||
-mg,--main-gpu <value> Set main GPU ID (0 - n) for single GPU mode.
|
||||
-sm,--split-mode <value> How to split the model across multiple GPUs, one of:
|
||||
- none: use one GPU only
|
||||
- layer (default): split layers and KV across GPUs
|
||||
- row: split rows across GPUs
|
||||
-ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: -1)
|
||||
-lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
ignored. Values:
|
||||
- 0: generic output
|
||||
- 1: error
|
||||
- 2: warning
|
||||
- 3: info
|
||||
- 4: debug
|
||||
|
||||
|
||||
EOF
|
||||
}
|
||||
|
||||
BIN_FILE=./build/bin/llama-server
|
||||
SEED=0
|
||||
GPUS_SETTING=""
|
||||
|
||||
MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
|
||||
NGL=99
|
||||
CONTEXT=4096
|
||||
GGML_SYCL_DEVICE=-1
|
||||
SPLIT_MODE=layer
|
||||
LOG_VERBOSE=3
|
||||
while [[ $# -gt 0 ]]; do
|
||||
case "$1" in
|
||||
-c|--context)
|
||||
CONTEXT=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-m|--model)
|
||||
MODEL_FILE="$2"
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-mg|--main-gpu)
|
||||
GGML_SYCL_DEVICE=$2
|
||||
SPLIT_MODE=none
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-sm|--split-mode)
|
||||
SPLIT_MODE=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-ngl|--n-gpu-layers)
|
||||
NGL=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-lv|--log-verbosity)
|
||||
LOG_VERBOSE=$2
|
||||
# Shift twice to consume both the option flag and its value
|
||||
shift
|
||||
shift
|
||||
;;
|
||||
-h|--help)
|
||||
Help
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
# Handle unknown options or stop processing options
|
||||
echo "Invalid option: $1"
|
||||
# Optional: exit script or shift to treat remaining as positional args
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
|
||||
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
#support malloc device memory more than 4GB.
|
||||
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
|
||||
|
||||
if [ $GGML_SYCL_DEVICE -ne -1 ]; then
|
||||
echo "Use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
|
||||
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
|
||||
else
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
GPUS_SETTING="-sm ${SPLIT_MODE}"
|
||||
fi
|
||||
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
|
||||
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ SEED=0
|
||||
GPUS_SETTING=""
|
||||
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
|
||||
MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
|
||||
NGL=99
|
||||
CONTEXT=4096
|
||||
GGML_SYCL_DEVICE=-1
|
||||
@@ -122,9 +122,10 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
|
||||
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
|
||||
else
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
GPUS_SETTING="-sm ${SPLIT_MODE}"
|
||||
fi
|
||||
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
|
||||
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
|
||||
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
|
||||
|
||||
|
||||
179
examples/sycl/win-start-svr.bat
Normal file
179
examples/sycl/win-start-svr.bat
Normal file
@@ -0,0 +1,179 @@
|
||||
:: MIT license
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
@echo off
|
||||
setlocal EnableExtensions EnableDelayedExpansion
|
||||
|
||||
set "BIN_FILE=.\build\bin\llama-server.exe"
|
||||
set "SEED=0"
|
||||
set "GPUS_SETTING="
|
||||
|
||||
set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
|
||||
set "NGL=99"
|
||||
set "CONTEXT=4096"
|
||||
set "GGML_SYCL_DEVICE=-1"
|
||||
set "SPLIT_MODE=layer"
|
||||
set "LOG_VERBOSE=3"
|
||||
|
||||
if "%~1"=="" goto after_args
|
||||
|
||||
:parse_args
|
||||
if "%~1"=="" goto after_args
|
||||
|
||||
if /I "%~1"=="-c" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "CONTEXT=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--context" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "CONTEXT=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-m" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "MODEL_FILE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--model" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "MODEL_FILE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-mg" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "GGML_SYCL_DEVICE=%~2"
|
||||
set "SPLIT_MODE=none"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--main-gpu" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "GGML_SYCL_DEVICE=%~2"
|
||||
set "SPLIT_MODE=none"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-sm" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "SPLIT_MODE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--split-mode" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "SPLIT_MODE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-ngl" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "NGL=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--n-gpu-layers" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "NGL=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-lv" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "LOG_VERBOSE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--log-verbosity" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "LOG_VERBOSE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-h" goto help
|
||||
if /I "%~1"=="--help" goto help
|
||||
|
||||
echo Invalid option: %~1
|
||||
exit /b 1
|
||||
|
||||
:missing_value
|
||||
echo Missing value for option: %~1
|
||||
exit /b 1
|
||||
|
||||
:help
|
||||
echo Usage: %~n0 [OPTIONS]
|
||||
echo.
|
||||
echo This script processes files with specified options.
|
||||
echo.
|
||||
echo Options:
|
||||
echo -h, --help Display this help message and exit.
|
||||
echo -c, --context ^<value^> Set context length. Bigger need more memory.
|
||||
echo -m, --model ^<value^> Full model file path.
|
||||
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
|
||||
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
|
||||
echo - none: use one GPU only
|
||||
echo - layer (default): split layers and KV across GPUs
|
||||
echo - row: split rows across GPUs
|
||||
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
|
||||
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
echo ignored. Values:
|
||||
echo - 0: generic output
|
||||
echo - 1: error
|
||||
echo - 2: warning
|
||||
echo - 3: info
|
||||
echo - 4: debug
|
||||
exit /b 0
|
||||
|
||||
:after_args
|
||||
|
||||
REM In Windows CMD, source is not available; call oneAPI setvars if present.
|
||||
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
|
||||
) else (
|
||||
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
|
||||
)
|
||||
|
||||
REM Support malloc device memory more than 4GB.
|
||||
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
|
||||
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
|
||||
|
||||
if not "%GGML_SYCL_DEVICE%"=="-1" (
|
||||
echo Use %GGML_SYCL_DEVICE% as main GPU
|
||||
REM Use single GPU only.
|
||||
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
|
||||
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
|
||||
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
|
||||
) else (
|
||||
echo Use all Intel GPUs, including iGPU ^& dGPU
|
||||
set "GPUS_SETTING=-sm %SPLIT_MODE%"
|
||||
)
|
||||
|
||||
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
|
||||
set "ZES_ENABLE_SYSMAN=1"
|
||||
%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
|
||||
|
||||
endlocal
|
||||
|
||||
@@ -2,10 +2,200 @@
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
:: support malloc device memory more than 4GB.
|
||||
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
set LOAD_MODE="--mmap"
|
||||
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
|
||||
@echo off
|
||||
setlocal EnableExtensions EnableDelayedExpansion
|
||||
|
||||
REM MIT license
|
||||
REM Copyright (C) 2024 Intel Corporation
|
||||
REM SPDX-License-Identifier: MIT
|
||||
|
||||
set "BIN_FILE=.\build\bin\llama-completion.exe"
|
||||
set "SEED=0"
|
||||
set "GPUS_SETTING="
|
||||
|
||||
set "INPUT_PROMPT=Building a website can be done in 10 simple steps:^nStep 1:"
|
||||
set "MODEL_FILE=..\models\llama-2-7b.Q4_0.gguf"
|
||||
set "NGL=99"
|
||||
set "CONTEXT=4096"
|
||||
set "GGML_SYCL_DEVICE=-1"
|
||||
set "SPLIT_MODE=layer"
|
||||
set "LOG_VERBOSE=3"
|
||||
|
||||
if "%~1"=="" goto after_args
|
||||
|
||||
:parse_args
|
||||
if "%~1"=="" goto after_args
|
||||
|
||||
if /I "%~1"=="-c" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "CONTEXT=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--context" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "CONTEXT=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-p" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "INPUT_PROMPT=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--promote" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "INPUT_PROMPT=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-m" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "MODEL_FILE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--model" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "MODEL_FILE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-mg" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "GGML_SYCL_DEVICE=%~2"
|
||||
set "SPLIT_MODE=none"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--main-gpu" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "GGML_SYCL_DEVICE=%~2"
|
||||
set "SPLIT_MODE=none"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-sm" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "SPLIT_MODE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--split-mode" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "SPLIT_MODE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-ngl" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "NGL=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--n-gpu-layers" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "NGL=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-lv" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "LOG_VERBOSE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
if /I "%~1"=="--log-verbosity" (
|
||||
if "%~2"=="" goto missing_value
|
||||
set "LOG_VERBOSE=%~2"
|
||||
shift
|
||||
shift
|
||||
goto parse_args
|
||||
)
|
||||
|
||||
if /I "%~1"=="-h" goto help
|
||||
if /I "%~1"=="--help" goto help
|
||||
|
||||
echo Invalid option: %~1
|
||||
exit /b 1
|
||||
|
||||
:missing_value
|
||||
echo Missing value for option: %~1
|
||||
exit /b 1
|
||||
|
||||
:help
|
||||
echo Usage: %~n0 [OPTIONS]
|
||||
echo.
|
||||
echo This script processes files with specified options.
|
||||
echo.
|
||||
echo Options:
|
||||
echo -h, --help Display this help message and exit.
|
||||
echo -c, --context ^<value^> Set context length. Bigger need more memory.
|
||||
echo -p, --promote ^<value^> Prompt to start generation with.
|
||||
echo -m, --model ^<value^> Full model file path.
|
||||
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
|
||||
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
|
||||
echo - none: use one GPU only
|
||||
echo - layer (default): split layers and KV across GPUs
|
||||
echo - row: split rows across GPUs
|
||||
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
|
||||
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
echo ignored. Values:
|
||||
echo - 0: generic output
|
||||
echo - 1: error
|
||||
echo - 2: warning
|
||||
echo - 3: info
|
||||
echo - 4: debug
|
||||
exit /b 0
|
||||
|
||||
:after_args
|
||||
|
||||
REM In Windows CMD, source is not available; call oneAPI setvars if present.
|
||||
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
|
||||
) else (
|
||||
echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
|
||||
)
|
||||
|
||||
REM Support malloc device memory more than 4GB.
|
||||
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
|
||||
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%
|
||||
|
||||
if not "%GGML_SYCL_DEVICE%"=="-1" (
|
||||
echo Use %GGML_SYCL_DEVICE% as main GPU
|
||||
REM Use single GPU only.
|
||||
set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
|
||||
set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
|
||||
echo ONEAPI_DEVICE_SELECTOR=%ONEAPI_DEVICE_SELECTOR%
|
||||
) else (
|
||||
echo Use all Intel GPUs, including iGPU ^& dGPU
|
||||
set "GPUS_SETTING=-sm %SPLIT_MODE%"
|
||||
)
|
||||
|
||||
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m %MODEL_FILE% -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
|
||||
set "ZES_ENABLE_SYSMAN=1"
|
||||
%BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
|
||||
|
||||
endlocal
|
||||
|
||||
|
||||
@@ -3478,10 +3478,10 @@ template <ggml_type type, int mmq_x, bool need_check>
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
|
||||
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const uint3 ntx) {
|
||||
|
||||
// Skip unused template specializations for faster compilation:
|
||||
if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
|
||||
@@ -3495,8 +3495,7 @@ static __global__ void mul_mat_q(
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
const int ntx = (ncols_max + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
|
||||
// Initialize the ids for writing back data with just the index.
|
||||
// For regular matrix multiplications this is never changed.
|
||||
@@ -3517,8 +3516,9 @@ static __global__ void mul_mat_q(
|
||||
// On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
{
|
||||
const int wt = blockIdx.z / nchannels_y;
|
||||
const int zt = blockIdx.z - wt*nchannels_y;
|
||||
const uint2 tmp2 = fast_div_modulo(blockIdx.z, nchannels_y);
|
||||
const int wt = tmp2.x;
|
||||
const int zt = tmp2.y;
|
||||
const int jt = blockIdx.y;
|
||||
const int it = blockIdx.x;
|
||||
|
||||
@@ -3561,40 +3561,40 @@ static __global__ void mul_mat_q(
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
|
||||
|
||||
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
|
||||
tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
|
||||
return;
|
||||
}
|
||||
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
|
||||
const int64_t blocks_per_ne00 = ncols_x / qk;
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
|
||||
// kbc == k block continuous, current index in continuous ijk space.
|
||||
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int kbc = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
int kbc_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
|
||||
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
|
||||
kbc_stop -= fastmodulo(kbc_stop, blocks_per_ne00) % blocks_per_iter;
|
||||
|
||||
// kb0 == k index when doing the matrix multiplication for an output tile.
|
||||
int kb0_start = kbc % blocks_per_ne00;
|
||||
int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
|
||||
while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
|
||||
int tmp = kbc;
|
||||
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
|
||||
const int zt = tmp / (ntx*blocks_per_ne00);
|
||||
tmp -= zt * (ntx*blocks_per_ne00);
|
||||
const int jt = tmp / blocks_per_ne00;
|
||||
int kb0_start = fastmodulo(kbc, blocks_per_ne00);
|
||||
int kb0_stop = min(blocks_per_ne00.z, uint32_t(kb0_start + kbc_stop - kbc));
|
||||
while (kbc < kbc_stop && kb0_stop == int(blocks_per_ne00.z)) {
|
||||
int tmp = fastdiv(kbc, blocks_per_ne00);
|
||||
uint2 tmp2 = fast_div_modulo(tmp, ntx);
|
||||
const int jt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nchannels_y);
|
||||
const int zt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nsamples_y);
|
||||
const int wt = tmp2.y;
|
||||
const int it = tmp2.x;
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
@@ -3612,11 +3612,11 @@ static __global__ void mul_mat_q(
|
||||
offset_dst = 0;
|
||||
|
||||
if (jt*mmq_x >= col_diff) {
|
||||
kbc += blocks_per_ne00;
|
||||
kbc -= kbc % blocks_per_ne00;
|
||||
kbc += blocks_per_ne00.z;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00);
|
||||
|
||||
kb0_start = 0;
|
||||
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
|
||||
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -3641,32 +3641,34 @@ static __global__ void mul_mat_q(
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
|
||||
|
||||
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
|
||||
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
|
||||
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
|
||||
kbc += blocks_per_ne00;
|
||||
kbc -= kbc % blocks_per_ne00;
|
||||
kbc += blocks_per_ne00.z;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00);
|
||||
|
||||
kb0_start = 0;
|
||||
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
|
||||
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
|
||||
}
|
||||
|
||||
if (kbc >= kbc_stop) {
|
||||
return;
|
||||
}
|
||||
|
||||
int tmp = kbc;
|
||||
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
|
||||
const int zt = tmp / (ntx*blocks_per_ne00);
|
||||
tmp -= zt * (ntx*blocks_per_ne00);
|
||||
const int jt = tmp / blocks_per_ne00;
|
||||
int tmp = fastdiv(kbc, blocks_per_ne00);
|
||||
uint2 tmp2 = fast_div_modulo(tmp, ntx);
|
||||
const int jt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nchannels_y);
|
||||
const int zt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nsamples_y);
|
||||
const int wt = tmp2.y;
|
||||
const int it = tmp2.x;
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
@@ -3708,7 +3710,7 @@ static __global__ void mul_mat_q(
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
|
||||
|
||||
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
|
||||
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
|
||||
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
|
||||
@@ -3717,46 +3719,37 @@ static __global__ void mul_mat_q(
|
||||
}
|
||||
|
||||
template <ggml_type type, int mmq_x, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
const int32_t * expert_bounds,
|
||||
float * __restrict__ dst,
|
||||
const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x,
|
||||
const int nrows_x,
|
||||
const int ncols_dst,
|
||||
const size_t stride_col_dst,
|
||||
const int nchannels_y,
|
||||
const size_t stride_channel_dst,
|
||||
const int nsamples_y,
|
||||
const size_t stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device()/2, 1)
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst,
|
||||
float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
|
||||
const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
|
||||
const int stride_sample_dst, const uint3 ntx) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
const int64_t blocks_per_ne00 = ncols_x / qk;
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int nwarps = mmq_get_nwarps_device()/2;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
|
||||
float sum[mmq_x / nwarps] = {0.0f};
|
||||
const int i = blockIdx.y*warp_size + threadIdx.x;
|
||||
|
||||
const int ntx = (ncols_max + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
|
||||
const int bidx0 = blockIdx.x;
|
||||
|
||||
// kbc == k block continuous, current index in continuous ijk space.
|
||||
int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int kbc0 = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
int kbc0_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
|
||||
kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc0 -= fastmodulo(kbc0, blocks_per_ne00) % blocks_per_iter;
|
||||
kbc0_stop -= fastmodulo(kbc0_stop, blocks_per_ne00) % blocks_per_iter;
|
||||
|
||||
const bool did_not_have_any_data = kbc0 == kbc0_stop;
|
||||
const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
|
||||
const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
|
||||
const bool wrote_beginning_of_tile = fastmodulo(kbc0, blocks_per_ne00) == 0;
|
||||
const bool did_not_write_last = fastdiv(kbc0, blocks_per_ne00) == fastdiv(kbc0_stop, blocks_per_ne00) && fastmodulo(kbc0_stop, blocks_per_ne00) != 0;
|
||||
if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
|
||||
return;
|
||||
}
|
||||
@@ -3765,11 +3758,11 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
// Iterate over previous blocks and sum up partial sums written to fixup buffer.
|
||||
// All CUDA blocks that get here must have a previous block that needs a fixup.
|
||||
int64_t bidx = bidx0 - 1;
|
||||
int64_t kbc_stop = kbc0;
|
||||
int bidx = bidx0 - 1;
|
||||
int kbc_stop = kbc0;
|
||||
while(true) {
|
||||
int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
|
||||
int kbc = int64_t(bidx)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
|
||||
|
||||
if (kbc == kbc_stop) { // Did not have any data.
|
||||
bidx--;
|
||||
@@ -3779,20 +3772,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
any_fixup = true;
|
||||
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
|
||||
}
|
||||
sum[j0/nwarps] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
|
||||
}
|
||||
|
||||
// If this block started in a previous tile we are done and don't need to combine additional partial results.
|
||||
if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
|
||||
if (fastmodulo(kbc, blocks_per_ne00) == 0 || fastdiv(kbc, blocks_per_ne00) < fastdiv(kbc0, blocks_per_ne00)) {
|
||||
break;
|
||||
}
|
||||
bidx--;
|
||||
@@ -3803,14 +3792,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
return;
|
||||
}
|
||||
|
||||
int tmp = kbc0;
|
||||
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
|
||||
const int zt = tmp / (ntx*blocks_per_ne00);
|
||||
tmp -= zt * (ntx*blocks_per_ne00);
|
||||
const int jt = tmp / blocks_per_ne00;
|
||||
int tmp = fastdiv(kbc0, blocks_per_ne00);
|
||||
uint2 tmp2 = fast_div_modulo(tmp, ntx);
|
||||
const int jt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nchannels_y);
|
||||
const int zt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nsamples_y);
|
||||
const int wt = tmp2.y;
|
||||
const int it = tmp2.x;
|
||||
|
||||
if (!ids_dst) {
|
||||
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
|
||||
@@ -3818,6 +3809,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_dst - jt*mmq_x - 1;
|
||||
if (need_check && i > i_max) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
@@ -3827,16 +3821,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (need_check && i > i_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
|
||||
}
|
||||
dst[j*stride_col_dst + i] += sum[j0/nwarps];
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -3856,6 +3841,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = col_diff - jt*mmq_x - 1;
|
||||
if (need_check && i > i_max) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
@@ -3865,16 +3853,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (need_check && i > i_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
|
||||
}
|
||||
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[j0/nwarps];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3922,29 +3901,44 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
const int channel_ratio = args.nchannels_y / args.nchannels_x;
|
||||
const int sample_ratio = args.nsamples_y / args.nsamples_x;
|
||||
|
||||
const uint3 blocks_per_ne00_fd = init_fastdiv_values(args.ncols_x / ggml_cuda_type_traits<type>::qk);
|
||||
const uint3 ntx_fd = init_fastdiv_values(ntx);
|
||||
const uint3 nchannels_y_fd = init_fastdiv_values(args.nchannels_y);
|
||||
const uint3 nsamples_y_fd = init_fastdiv_values(args.nsamples_y);
|
||||
const uint3 channel_ratio_fd = init_fastdiv_values(channel_ratio);
|
||||
const uint3 sample_ratio_fd = init_fastdiv_values(sample_ratio);
|
||||
|
||||
if (!args.use_stream_k) {
|
||||
if (args.nrows_x % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_nums_stream_k(nsm, 1, 1);
|
||||
const bool fixup_needed = ntx*nty*ntzw % nsm != 0;
|
||||
// For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
|
||||
// This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
|
||||
const int ntiles_dst = ntx * nty * ntzw;
|
||||
const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
|
||||
const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
|
||||
const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);
|
||||
|
||||
GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.
|
||||
|
||||
const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool(id);
|
||||
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
|
||||
@@ -3952,40 +3946,45 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
|
||||
}
|
||||
|
||||
const dim3 block_nums_fixup(block_nums_stream_k.x, mmq_y/warp_size, 1);
|
||||
const dim3 block_dims_fixup(block_dims.x, block_dims.y/2, block_dims.z);
|
||||
|
||||
if (args.nrows_x % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
|
||||
if (!fixup_needed) {
|
||||
return;
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
|
||||
if (!fixup_needed) {
|
||||
return;
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -101,6 +101,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
}
|
||||
}
|
||||
|
||||
#if __HVX_ARCH__ >= 75
|
||||
{
|
||||
// Set HMX clock
|
||||
HAP_power_request_t request;
|
||||
@@ -118,6 +119,7 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -677,7 +677,15 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
|
||||
const ggml_type tsrc1 = op->src[1]->type;
|
||||
|
||||
const bool bc_inp = op->src[0]->ne[0] % 32 != 0;
|
||||
const bool bc_out = op->ne[0] % 64 != 0 || op->ne[1] % 32 != 0;
|
||||
|
||||
constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y;
|
||||
constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
|
||||
|
||||
const bool has_tensor = ggml_metal_device_get_props(ggml_metal_library_get_device(lib))->has_tensor;
|
||||
|
||||
const bool bc_out = has_tensor
|
||||
? (op->ne[0] % NRA != 0 || op->ne[1] % NRB != 0)
|
||||
: (op->ne[0] % 64 != 0 || op->ne[1] % 32 != 0);
|
||||
|
||||
snprintf(base, 256, "kernel_mul_mm_%s_%s", ggml_type_name(tsrc0), ggml_type_name(tsrc1));
|
||||
snprintf(name, 256, "%s_bci=%d_bco=%d", base, bc_inp, bc_out);
|
||||
@@ -694,8 +702,20 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm(ggml_meta
|
||||
ggml_metal_cv_free(cv);
|
||||
}
|
||||
|
||||
// when the output size is not multiple of 64x32, we need extra smem to prevent out-of-bounds writes
|
||||
res.smem = bc_out ? 8192 : 4096 + 2048;
|
||||
if (has_tensor) {
|
||||
res.nr0 = NRA;
|
||||
res.nr1 = NRB;
|
||||
|
||||
const size_t smem_a = NRA * N_MM_NK_TOTAL * sizeof(ggml_fp16_t);
|
||||
res.smem = smem_a;
|
||||
} else {
|
||||
res.nr0 = 64;
|
||||
res.nr1 = 32;
|
||||
|
||||
res.smem = bc_out ? 8192 : (4096 + 2048);
|
||||
}
|
||||
|
||||
res.nsg = N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -102,6 +102,8 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
|
||||
|
||||
void ggml_metal_library_free(ggml_metal_library_t lib);
|
||||
|
||||
ggml_metal_device_t ggml_metal_library_get_device(ggml_metal_library_t lib);
|
||||
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
|
||||
|
||||
|
||||
@@ -95,8 +95,8 @@ int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_wi
|
||||
|
||||
struct ggml_metal_library {
|
||||
id<MTLLibrary> obj;
|
||||
id<MTLDevice> device;
|
||||
|
||||
ggml_metal_device_t dev;
|
||||
ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
|
||||
|
||||
NSLock * lock;
|
||||
@@ -251,7 +251,7 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
||||
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
|
||||
|
||||
res->obj = library;
|
||||
res->device = device;
|
||||
res->dev = dev;
|
||||
res->pipelines = ggml_metal_pipelines_init();
|
||||
res->lock = [NSLock new];
|
||||
|
||||
@@ -318,7 +318,7 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
|
||||
}
|
||||
|
||||
res->obj = library;
|
||||
res->device = device;
|
||||
res->dev = dev;
|
||||
res->pipelines = ggml_metal_pipelines_init();
|
||||
res->lock = [NSLock new];
|
||||
|
||||
@@ -341,6 +341,10 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
|
||||
free(lib);
|
||||
}
|
||||
|
||||
ggml_metal_device_t ggml_metal_library_get_device(ggml_metal_library_t lib) {
|
||||
return lib->dev;
|
||||
}
|
||||
|
||||
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
|
||||
[lib->lock lock];
|
||||
|
||||
@@ -405,7 +409,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_
|
||||
return res;
|
||||
}
|
||||
|
||||
id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
|
||||
id<MTLDevice> device = ggml_metal_device_get_obj(lib->dev);
|
||||
id<MTLComputePipelineState> obj = [device newComputePipelineStateWithFunction:mtl_function error:&error];
|
||||
|
||||
[mtl_function release];
|
||||
|
||||
@@ -699,7 +704,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
" auto sB = tB.slice(0, 0); \n"
|
||||
" mm.run(sB, sA, cT); \n"
|
||||
" \n"
|
||||
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
|
||||
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(16, 16)); \n"
|
||||
" \n"
|
||||
" cT.store(tC); \n"
|
||||
"}";
|
||||
@@ -749,7 +754,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
|
||||
" auto sB = tB.slice(0, 0); \n"
|
||||
" mm.run(sB, sA, cT); \n"
|
||||
" \n"
|
||||
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
|
||||
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(16, 16)); \n"
|
||||
" \n"
|
||||
" cT.store(tC); \n"
|
||||
"}";
|
||||
|
||||
@@ -1,6 +1,19 @@
|
||||
#ifndef GGML_METAL_IMPL
|
||||
#define GGML_METAL_IMPL
|
||||
|
||||
// kernel parameters for mat-mat threadgroups
|
||||
//
|
||||
// TODO: become function constants
|
||||
|
||||
#define SZ_SIMDGROUP 16
|
||||
#define N_MM_NK 2
|
||||
#define N_MM_NK_TOTAL (SZ_SIMDGROUP * N_MM_NK)
|
||||
|
||||
#define N_MM_BLOCK_X 4
|
||||
#define N_MM_BLOCK_Y 2
|
||||
#define N_MM_SIMD_GROUP_X 2
|
||||
#define N_MM_SIMD_GROUP_Y 2
|
||||
|
||||
// kernel parameters for mat-vec threadgroups
|
||||
//
|
||||
// N_R0: number of src0 rows to process per simdgroup
|
||||
|
||||
@@ -2195,7 +2195,12 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
|
||||
const size_t smem = pipeline.smem;
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
|
||||
|
||||
const int nr0 = pipeline.nr0;
|
||||
const int nr1 = pipeline.nr1;
|
||||
const int nsg = pipeline.nsg;
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + nr1 - 1) / nr1), ((ne01 + nr0 - 1) / nr0), ne12 * ne13, 32, nsg, 1);
|
||||
} else {
|
||||
auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
|
||||
|
||||
|
||||
@@ -9306,7 +9306,137 @@ constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
|
||||
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
|
||||
|
||||
// each block_q contains 16*nl weights
|
||||
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
template<
|
||||
typename SA, typename SA_4x4, typename SA_8x8,
|
||||
typename SB, typename SB_2x4, typename SB_8x8,
|
||||
typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread SA_4x4 &),
|
||||
typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||
kernel void kernel_mul_mm(
|
||||
constant ggml_metal_kargs_mul_mm & args,
|
||||
device const char * srcA,
|
||||
device const char * srcB,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig [[threadgroup_position_in_grid]],
|
||||
ushort tiitg [[thread_index_in_threadgroup]],
|
||||
ushort sgitg [[simdgroup_index_in_threadgroup]]) {
|
||||
(void) sgitg;
|
||||
|
||||
// Matrix dimensions: A(M,K) x B(K,N) -> C(M,N)
|
||||
const int K = args.ne00;
|
||||
const int M = args.ne0;
|
||||
const int N = args.ne1;
|
||||
|
||||
// Batch dimension handling
|
||||
const int im = tgpig.z;
|
||||
const int i12 = im % args.ne12;
|
||||
const int i13 = im / args.ne12;
|
||||
|
||||
// Batch offsets for srcA and srcB
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
|
||||
// Tile dimensions
|
||||
constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
|
||||
constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y;
|
||||
|
||||
// Tile offsets in output matrix
|
||||
const int ra = tgpig.y * NRA;
|
||||
const int rb = tgpig.x * NRB;
|
||||
|
||||
// Threadgroup memory for dequantized A tile only
|
||||
threadgroup SA * sa = (threadgroup SA *)(shmem);
|
||||
|
||||
// Work-item count for A loading
|
||||
constexpr int A_WORK_ITEMS = NRA * N_MM_NK;
|
||||
constexpr int NUM_THREADS = N_SIMDWIDTH * N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y;
|
||||
|
||||
// tA wraps threadgroup memory
|
||||
auto tA = tensor(sa, dextents<int32_t, 2>(N_MM_NK_TOTAL, NRA));
|
||||
|
||||
// tB wraps device memory directly
|
||||
device T1 * ptrB = (device T1 *)(srcB + args.nb12*i12 + args.nb13*i13);
|
||||
const int strideB = args.nb11 / sizeof(T1);
|
||||
auto tB = tensor(ptrB, dextents<int32_t, 2>(K, N), array<int, 2>({1, strideB}));
|
||||
|
||||
// Configure matmul operation
|
||||
mpp::tensor_ops::matmul2d<
|
||||
mpp::tensor_ops::matmul2d_descriptor(
|
||||
NRB, NRA, N_MM_NK_TOTAL, false, true, true,
|
||||
mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
|
||||
execution_simdgroups<N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y>> mm;
|
||||
|
||||
auto cT = mm.get_destination_cooperative_tensor<decltype(tB), decltype(tA), float>();
|
||||
|
||||
// Accumulate partial results over K dimension
|
||||
for (int loop_k = 0; loop_k < K; loop_k += N_MM_NK_TOTAL) {
|
||||
// === PHASE 1: Dequantization of A into threadgroup memory ===
|
||||
for (int work = tiitg; work < A_WORK_ITEMS; work += NUM_THREADS) {
|
||||
const int row = work / N_MM_NK;
|
||||
const int k_chunk = work % N_MM_NK;
|
||||
const int k_pos = loop_k + k_chunk * 16;
|
||||
const short k_base = k_chunk * 16;
|
||||
|
||||
// Bounds check: skip device read if row is out of matrix bounds
|
||||
if (ra + row < M) {
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
// Element-wise reads when K is not aligned (nb01 not aligned for half4x4/float4x4).
|
||||
// MSL spec Table 2.5: half4x4 requires 8-byte alignment. When K is odd,
|
||||
// nb01 = K*2 is not 8-byte aligned, so odd-row pointers are misaligned.
|
||||
// Mirrors the legacy kernel's existing guard.
|
||||
device const T0 * row_ptr = (device const T0 *)(srcA + args.nb01 * (ra + row) + offset0);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? (SA) row_ptr[k_pos + i] : (SA)0;
|
||||
}
|
||||
} else {
|
||||
const int block_idx = k_pos / (16 * nl);
|
||||
const short il = (k_pos / 16) % nl;
|
||||
|
||||
device const block_q * row_ptr = (device const block_q *)(srcA + args.nb01 * (ra + row) + offset0);
|
||||
|
||||
SA_4x4 temp_a;
|
||||
dequantize_func(row_ptr + block_idx, il, temp_a);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
// Zero-pad A for K positions beyond valid range (handles partial K iterations)
|
||||
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? temp_a[i/4][i%4] : (SA)0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Zero-pad rows beyond matrix bounds
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (SA)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// === PHASE 2: Tensor matmul ===
|
||||
auto mA = tA.slice(0, 0);
|
||||
auto mB = tB.slice(loop_k, rb);
|
||||
|
||||
mm.run(mB, mA, cT);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
// Store result tile to output matrix (with batch offset)
|
||||
// cT.store handles bounds checking via tD's extents (M, N)
|
||||
device float * dstBatch = (device float *)dst + im * N * M;
|
||||
|
||||
auto tD = tensor(dstBatch, dextents<int32_t, 2>(M, N), array<int, 2>({1, M}));
|
||||
cT.store(tD.slice(ra, rb));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template<
|
||||
typename S0, typename S0_4x4, typename S0_8x8,
|
||||
typename S1, typename S1_2x4, typename S1_8x8,
|
||||
typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &),
|
||||
typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||
kernel void kernel_mul_mm(
|
||||
constant ggml_metal_kargs_mul_mm & args,
|
||||
device const char * src0,
|
||||
@@ -9320,10 +9450,6 @@ kernel void kernel_mul_mm(
|
||||
threadgroup S0 * sa = (threadgroup S0 *)(shmem);
|
||||
threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
|
||||
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
threadgroup float * sc = (threadgroup float *)(shmem);
|
||||
#endif
|
||||
|
||||
constexpr int NR0 = 64;
|
||||
constexpr int NR1 = 32;
|
||||
|
||||
@@ -9363,7 +9489,6 @@ kernel void kernel_mul_mm(
|
||||
+ args.nb11*(r1 + lr1)
|
||||
+ args.nb10*iy);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
S0_8x8 ma[4];
|
||||
S1_8x8 mb[2];
|
||||
|
||||
@@ -9372,19 +9497,8 @@ kernel void kernel_mul_mm(
|
||||
for (short i = 0; i < 8; i++){
|
||||
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
|
||||
}
|
||||
#else
|
||||
auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
|
||||
auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));
|
||||
|
||||
mpp::tensor_ops::matmul2d<
|
||||
mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
|
||||
execution_simdgroups<4>> mm;
|
||||
|
||||
auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
|
||||
#endif
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
@@ -9454,66 +9568,6 @@ kernel void kernel_mul_mm(
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#else
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
//const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#endif
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
@@ -9522,7 +9576,6 @@ kernel void kernel_mul_mm(
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
|
||||
threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
|
||||
@@ -9549,24 +9602,10 @@ kernel void kernel_mul_mm(
|
||||
lsma += 8*64;
|
||||
lsmb += 4*64;
|
||||
}
|
||||
#else
|
||||
auto sA = tA.slice(0, 0);
|
||||
auto sB = tB.slice(0, 0);
|
||||
|
||||
mm.run(sB, sA, cT);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
|
||||
// if no bounds checks on the output are needed, we can directly write to device memory
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
device float * C = (device float *) dst +
|
||||
r0 + \
|
||||
r1 * args.ne0 + im*args.ne1*args.ne0;
|
||||
|
||||
auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(args.ne0, NR1));
|
||||
cT.store(tC);
|
||||
#else
|
||||
device float * C = (device float *) dst +
|
||||
(r0 + 32*(sgitg & 1)) + \
|
||||
(r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
|
||||
@@ -9574,21 +9613,15 @@ kernel void kernel_mul_mm(
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
|
||||
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
|
||||
cT.store(tC);
|
||||
#else
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
|
||||
}
|
||||
#endif
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
@@ -9614,6 +9647,8 @@ kernel void kernel_mul_mm(
|
||||
}
|
||||
}
|
||||
|
||||
#endif // GGML_METAL_HAS_TENSOR
|
||||
|
||||
template<short ne20> // n_expert_used
|
||||
kernel void kernel_mul_mm_id_map0(
|
||||
constant ggml_metal_kargs_mul_mm_id_map0 & args,
|
||||
@@ -9789,7 +9824,7 @@ kernel void kernel_mul_mm_id(
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? (S0) *((device T0 *) x + i) : (S0) 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
|
||||
@@ -224,7 +224,7 @@ struct sycl_device_info {
|
||||
// cudaOccupancyMaxActiveBlocksPerMultiprocessor
|
||||
bool vmm; // virtual memory support
|
||||
size_t total_vram;
|
||||
//sycl_hw_info hw_info; \\ device id and aarch, currently not used
|
||||
sycl_hw_info hw_info;
|
||||
optimize_feature opt_feature;
|
||||
};
|
||||
|
||||
|
||||
@@ -104,6 +104,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
|
||||
|
||||
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
|
||||
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
|
||||
info.devices[i].hw_info = get_device_hw_info(&device);
|
||||
|
||||
}
|
||||
|
||||
@@ -3703,9 +3704,16 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
|
||||
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
|
||||
// is enabled takes precedence over DMMV, the current if-else implementation
|
||||
// requires disabling DMMV if both conditions are met
|
||||
|
||||
if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
|
||||
ggml_sycl_supports_reorder_mmvq(src0->type)))) {
|
||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||
// Arc770 get benefit with Q4_0 by skipping it.
|
||||
if (!(ggml_sycl_info().devices[ctx.device].hw_info.arch ==
|
||||
gpu_arch::intel_gpu_acm_g10 &&
|
||||
src0->type == GGML_TYPE_Q4_0)) {
|
||||
use_dequantize_mul_mat_vec =
|
||||
use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||
}
|
||||
}
|
||||
|
||||
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
|
||||
@@ -1,15 +1,67 @@
|
||||
#include "sycl_hw.hpp"
|
||||
|
||||
// TODO: currently not used
|
||||
/*
|
||||
sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
|
||||
sycl_hw_info res;
|
||||
int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
|
||||
res.device_id = id;
|
||||
using namespace std;
|
||||
|
||||
syclex::architecture arch = device_ptr->get_info<syclex::info::device::architecture>();
|
||||
res.arch = arch;
|
||||
|
||||
return res;
|
||||
}
|
||||
/*defined in
|
||||
* /opt/intel/oneapi/compiler/latest/include/sycl/ext/oneapi/experimental/device_architecture.def
|
||||
*/
|
||||
static map<gpu_arch, std::pair<const char*, sycl_intel_gpu_family>> arch2name = {
|
||||
{gpu_arch::intel_gpu_bdw, {"intel_gpu_bdw", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_skl, {"intel_gpu_skl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_kbl, {"intel_gpu_kbl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_cfl, {"intel_gpu_cfl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_apl, {"intel_gpu_apl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_glk, {"intel_gpu_glk", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_whl, {"intel_gpu_whl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_aml, {"intel_gpu_aml", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_cml, {"intel_gpu_cml", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_icllp, {"intel_gpu_icllp", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_ehl, {"intel_gpu_ehl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_tgllp, {"intel_gpu_tgllp", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_rkl, {"intel_gpu_rkl", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_adl_s, {"intel_gpu_adl_s", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_adl_p, {"intel_gpu_adl_p", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_adl_n, {"intel_gpu_adl_n", GPU_FAMILY_IGPU_NON_XE}},
|
||||
{gpu_arch::intel_gpu_dg1, {"intel_gpu_dg1", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_acm_g10, {"intel_gpu_acm_g10", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_acm_g11, {"intel_gpu_acm_g11", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_acm_g12, {"intel_gpu_acm_g12", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_pvc, {"intel_gpu_pvc", GPU_FAMILY_DGPU_CLOUD}},
|
||||
{gpu_arch::intel_gpu_pvc_vg, {"intel_gpu_pvc_vg", GPU_FAMILY_DGPU_CLOUD}},
|
||||
{gpu_arch::intel_gpu_mtl_u, {"intel_gpu_mtl_u", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_mtl_h, {"intel_gpu_mtl_h", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_arl_h, {"intel_gpu_arl_h", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_bmg_g21, {"intel_gpu_bmg_g21", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_bmg_g31, {"intel_gpu_bmg_g31", GPU_FAMILY_DGPU_CLIENT_GAME}},
|
||||
{gpu_arch::intel_gpu_lnl_m, {"intel_gpu_lnl_m", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_ptl_h, {"intel_gpu_ptl_h", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_ptl_u, {"intel_gpu_ptl_u", GPU_FAMILY_IGPU_XE}},
|
||||
{gpu_arch::intel_gpu_wcl, {"intel_gpu_wcl", GPU_FAMILY_IGPU_XE}}
|
||||
};
|
||||
|
||||
|
||||
sycl_hw_info get_device_hw_info(sycl::device* device_ptr) {
|
||||
sycl_hw_info res;
|
||||
int32_t id =
|
||||
device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
|
||||
res.device_id = id;
|
||||
|
||||
res.name = device_ptr->get_info<sycl::info::device::name>();
|
||||
|
||||
syclex::architecture arch =
|
||||
device_ptr->get_info<syclex::info::device::architecture>();
|
||||
res.arch = arch;
|
||||
|
||||
map<syclex::architecture,
|
||||
std::pair<const char*, sycl_intel_gpu_family>>::iterator it =
|
||||
arch2name.find(res.arch);
|
||||
if (it != arch2name.end()) {
|
||||
res.arch_name = it->second.first;
|
||||
res.gpu_family = it->second.second;
|
||||
} else {
|
||||
res.arch_name = "unknown";
|
||||
res.gpu_family = GPU_FAMILY_UKNOWN;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -9,18 +9,30 @@
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
namespace syclex = sycl::ext::oneapi::experimental;
|
||||
using gpu_arch = sycl::ext::oneapi::experimental::architecture;
|
||||
|
||||
// TODO: currently not used
|
||||
/*
|
||||
struct sycl_hw_info {
|
||||
syclex::architecture arch;
|
||||
int32_t device_id;
|
||||
// It's used to mark the GPU computing capacity
|
||||
// The value must flow the order of performance.
|
||||
enum sycl_intel_gpu_family {
|
||||
GPU_FAMILY_UKNOWN = -1,
|
||||
// iGPU without Xe core, before Meteor Lake iGPU(Xe)
|
||||
GPU_FAMILY_IGPU_NON_XE = 0,
|
||||
// iGPU with Xe core, Meteor Lake iGPU or newer.
|
||||
GPU_FAMILY_IGPU_XE = 1,
|
||||
// dGPU for gaming in client/data center (DG1/FLex 140 or newer).
|
||||
GPU_FAMILY_DGPU_CLIENT_GAME = 2,
|
||||
// dGPU for AI in cloud, PVC or newer.
|
||||
GPU_FAMILY_DGPU_CLOUD = 3
|
||||
};
|
||||
|
||||
bool is_in_vector(std::vector<int> &vec, int item);
|
||||
struct sycl_hw_info {
|
||||
syclex::architecture arch;
|
||||
const char* arch_name;
|
||||
int32_t device_id;
|
||||
std::string name;
|
||||
sycl_intel_gpu_family gpu_family;
|
||||
};
|
||||
|
||||
sycl_hw_info get_device_hw_info(sycl::device *device_ptr);
|
||||
*/
|
||||
|
||||
|
||||
#endif // SYCL_HW_HPP
|
||||
|
||||
@@ -1283,7 +1283,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
llama_model_quantize_params result = {
|
||||
/*.nthread =*/ 0,
|
||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0,
|
||||
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
|
||||
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
|
||||
/*.allow_requantize =*/ false,
|
||||
|
||||
@@ -1331,7 +1331,7 @@ static void test_nemotron_reasoning_detection(testing & t) {
|
||||
|
||||
// Check reasoning markers
|
||||
t.assert_equal("reasoning_start should be '<think>\\n'", "<think>\n", analysis.reasoning.start);
|
||||
t.assert_equal("reasoning_end should be '</think>'", "</think>", analysis.reasoning.end);
|
||||
t.assert_equal("reasoning_end should be '\\n</think>\\n'", "\n</think>\n", analysis.reasoning.end);
|
||||
|
||||
// Check reasoning mode detection
|
||||
// Nemotron uses tag-based reasoning; prefill handles the template's forced markers
|
||||
|
||||
@@ -1642,22 +1642,16 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Qwen3.5 (basically same as Nemotron, but keeping separate tests just in case)
|
||||
auto tst = peg_tester("models/templates/Qwen3.5-4B.jinja", detailed_debug);
|
||||
|
||||
tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?")
|
||||
tst.test("I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.expect(message_assist_thoughts)
|
||||
.run();
|
||||
|
||||
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
|
||||
tst.test("I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
|
||||
.expect_content("<think>\nI'm\nthinking\n</think>\nHello, world!\nWhat's up?")
|
||||
.run();
|
||||
|
||||
tst.test("I'm\nthinking\n</think>\nHello, world!\nWhat's up?")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.expect(message_assist_thoughts)
|
||||
.expect_content("<think>\nI'm\nthinking\n</think>\n\nHello, world!\nWhat's up?")
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -1673,7 +1667,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"I'm\nthinking\n</think>\n"
|
||||
"I'm\nthinking\n</think>\n\n"
|
||||
"<tool_call>\n"
|
||||
"<function=special_function>\n"
|
||||
"<parameter=arg1>\n1\n</parameter>\n"
|
||||
@@ -1731,7 +1725,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
tst.test(
|
||||
"I need to output the invoice details in JSON\n"
|
||||
"</think>\n"
|
||||
"</think>\n\n"
|
||||
R"({"amount": 123.45, "date": "2025-12-03"})")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
@@ -1751,7 +1745,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call></think>\n"
|
||||
"</tool_call>\n</think>\n\n"
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
@@ -1994,7 +1988,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call></think>\n"
|
||||
"</tool_call>\n</think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
@@ -3463,7 +3457,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.run();
|
||||
|
||||
// Tool call with reasoning (enable_thinking=true)
|
||||
tst.test("I'm\nthinking</think><tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
|
||||
tst.test("I'm\nthinking\n</think>\n\n<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
@@ -3487,7 +3481,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.run();
|
||||
|
||||
// Tool call with reasoning and content
|
||||
tst.test("I need to call a function</think>"
|
||||
tst.test("I need to call a function\n</think>\n\n"
|
||||
"Let me check the time.<tool_call>\n{\"name\": \"get_time\", \"arguments\": {\"city\": \"XYZCITY\"}}</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
@@ -3514,7 +3508,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
// fake tool call marker in reasoning
|
||||
tst.test(
|
||||
"Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm</think>"
|
||||
"Let me think about <tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 2}}</tool_call> hmm\n</think>\n\n"
|
||||
"<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
@@ -3542,11 +3536,11 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Format: <minimax:tool_call><invoke name="func"><parameter name="key">value</parameter></invoke></minimax:tool_call>
|
||||
{
|
||||
auto tst = peg_tester("models/templates/MiniMax-M2.jinja", detailed_debug);
|
||||
tst.test("</think>Hello, world!\nWhat's up?").enable_thinking(true).reasoning_format(COMMON_REASONING_FORMAT_AUTO).expect(message_assist).run();
|
||||
tst.test("\n</think>\n\nHello, world!\nWhat's up?").enable_thinking(true).reasoning_format(COMMON_REASONING_FORMAT_AUTO).expect(message_assist).run();
|
||||
|
||||
tst.test("I'm\nthinking</think>Hello, world!\nWhat's up?").enable_thinking(true).reasoning_format(COMMON_REASONING_FORMAT_AUTO).expect(message_assist_thoughts).run();
|
||||
tst.test("I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?").enable_thinking(true).reasoning_format(COMMON_REASONING_FORMAT_AUTO).expect(message_assist_thoughts).run();
|
||||
|
||||
tst.test("Let's call a tool:</think><minimax:tool_call>\n<invoke name=\"empty_args\">\n</invoke>\n</minimax:tool_call>").
|
||||
tst.test("Let's call a tool:\n</think>\n\n<minimax:tool_call>\n<invoke name=\"empty_args\">\n</invoke>\n</minimax:tool_call>").
|
||||
enable_thinking(true).
|
||||
reasoning_format(COMMON_REASONING_FORMAT_AUTO).
|
||||
tools({ empty_args_tool }).
|
||||
@@ -3554,7 +3548,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
run();
|
||||
|
||||
tst.test(
|
||||
"</think><minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
|
||||
"\n</think>\n\n<minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
|
||||
"name=\"arg1\">1</parameter>\n</invoke>\n</minimax:tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
@@ -3714,7 +3708,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.enable_thinking(false)
|
||||
.expect(message_assist)
|
||||
.run();
|
||||
tst.test("I'm\nthinking</think>\n\nHello, world!\nWhat's up?")
|
||||
tst.test("I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.expect(message_assist_thoughts)
|
||||
@@ -3729,7 +3723,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.run();
|
||||
tst.test("I'm\nthinking</think>\n\n<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>")
|
||||
tst.test("I'm\nthinking\n</think>\n\n<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.tools({ special_function_tool })
|
||||
@@ -4006,7 +4000,8 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
{
|
||||
auto tst = peg_tester("models/templates/StepFun3.5-Flash.jinja", detailed_debug);
|
||||
tst.test("I was thinking</think>\nNow I'm not.").
|
||||
|
||||
tst.test("I was thinking\n</think>\nNow I'm not.").
|
||||
enable_thinking(true).
|
||||
reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK).
|
||||
expect_reasoning("I was thinking").
|
||||
|
||||
Reference in New Issue
Block a user