Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2026-05-11 20:15:56 +02:00)

Compare commits: b9085 ... gg/scripts

75 commits (SHA1):
f49c636db0, d5165e8f2e, 85c6aa006d, e5ac6d1da6, 094554dbcc, f64d56bcd8, 43f14a0a46, d26b1ffcc9,
9f10d8d195, 4d5dedc569, 81a65cf035, 7d433f767b, 633a68d6c2, e0a2cf48ca, bad9565a1e, 752b703a5e,
fc571f3a1e, 6797d80dff, 3649793811, 7e8c88c5e0, 2e0b6766f3, f95f4dd1ca, 095c8ab655, d830acacc5,
f35b10f0a9, 802d85e26e, 91bd92c6b6, f20b5a72cf, 122dfe3eab, 8b94ab4f4a, f99d77f3bd, 55a7cf4a06,
6e7e1a5a63, 9f02fa6382, e7b8646098, 55ce1b4e2f, abec77e068, 65e3c5a928, 4f176f6a4d, 9578e83ac2,
530f38f9c3, cda8cae01a, 64720e1e01, 1a780f7c44, 940364e4c9, ee9b715eb6, d639ee52ea, fb40d1a04a,
2fe445cc60, 3732aea2df, edc766c919, d7d2c22909, 30ea5124de, 0ca458d892, de8eda468b, a2b96e0444,
deed078654, 05b8425bd6, 58bd57ba99, 5cbe95b6e5, c7f3ce25f5, 4db4497ca7, db8b09d6e8, 0b047287fe,
efbada936f, f3c3e0e9a0, 5755a100cd, 1e5ad35d56, 65d7a8bbf0, 00d56b11c3, 5757c4dcb1, e20b83930c,
fd89556567, 60489932ec, 4a4f819cb6
@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \

FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGC_VERSION=v2.32.7
ARG IGC_VERSION_FULL=2_2.32.7+21184
ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
ARG IGDGMM_VERSION=22.9.0

RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
    && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
@@ -103,6 +103,7 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
    spirv-headers
  ];
in

@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ninja
    pkg-config
    git
    spirv-headers
  ]
  ++ optionals useCuda [
    cudaPackages.cuda_nvcc
.gitignore (vendored, 1 change)

@@ -110,6 +110,7 @@ uv.lock

# Nix
flake.lock
/result

# Test binaries
@@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
        if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
            # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
            res = "f2llmv2"
        if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
            # ref: https://huggingface.co/sarvamai/sarvam-30b
            res = "sarvam-moe"

        if res is None:
            logger.warning("\n")

@@ -11591,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
            raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
class SarvamMoEModel(BailingMoeV2Model):
    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
    # Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
    # - full rotary (no partial_rotary_factor)
    # - expert bias is zero-mean normalized at load time

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
        self.gguf_writer.add_rope_dimension_count(rope_dim)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item
        if name.endswith(".expert_bias"):
            # Sarvam normalizes expert bias to zero mean
            inner = gen

            def gen():
                t = inner()
                return t - t.mean()
        return super().filter_tensors((name, gen))


@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GROVEMOE
@@ -155,6 +155,7 @@ models = [
    {"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
    {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
    {"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
    {"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
]

# some models are known to be broken upstream, so we will skip them as exceptions
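For context, entries in this list feed the tokenizer-hash regeneration step that produces the `chkhsh` cases above. A hedged sketch of the invocation, assuming a Hugging Face read token, is:

```bash
# Sketch: re-download the listed tokenizers and refresh the chkhsh entries used
# by convert_hf_to_gguf.py. <HF_TOKEN> is a placeholder for a HF read token.
python3 convert_hf_to_gguf_update.py <HF_TOKEN>
```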
@@ -737,6 +737,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|

## Compile-time Flags

Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spot.

| Name | Function |
|-----------------|----------------------------------------------------------------------------------|
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
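As an illustration of the `CXXFLAGS` route, a flag such as `DEBUG_SYCL_POOL` can be injected at configure time. The exact configure line depends on the SYCL toolchain in use; the command below is only a sketch:

```bash
# Sketch: build the SYCL backend with DEBUG_SYCL_POOL defined so the memory
# pool logs its state on teardown. Compilers and other options are placeholders.
cmake -B build -DGGML_SYCL=ON \
      -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
      -DCMAKE_CXX_FLAGS="-DDEBUG_SYCL_POOL"
cmake --build build --config Release -j
```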
## Design Rule

- Open to all contributors.
examples/llama-eval/README.md (new file, 26 lines)

@@ -0,0 +1,26 @@
# llama-eval

Simple evaluation tool for llama.cpp with support for multiple datasets.

For a full description, usage examples, and sample results, see:

- [PR 21152](https://github.com/ggml-org/llama.cpp/pull/21152)

## Quick start

```bash
# Single server
python3 llama-eval.py \
    --server http://localhost:8033 \
    --model my-model \
    --dataset gsm8k --n_cases 100 \
    --grader-type regex --threads 32

# Multiple servers (comma-separated URLs and thread counts)
python3 llama-eval.py \
    --server http://gpu1:8033,http://gpu2:8033 \
    --server-name gpu1,gpu2 \
    --threads 16,16 \
    --dataset aime2025 --n_cases 240 \
    --grader-type regex
```
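A convenient way to exercise `llama-eval.py` without a real model is to point it at the bundled test double, `llama-server-simulator.py` (shown further below). The flags mirror the quick start and the simulator's defaults; treat the exact combination as a sketch rather than documented usage:

```bash
# Start the offline simulator on the default port with an 80% success rate...
python3 llama-server-simulator.py --port 8033 --success-rate 0.8 &

# ...then run the evaluator against it, as in the single-server example above.
python3 llama-eval.py \
    --server http://localhost:8033 \
    --model llama \
    --dataset aime2025 --n_cases 30 \
    --grader-type regex --threads 4
```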
examples/llama-eval/llama-eval.py (new executable file, 1428 lines)

File diff suppressed because it is too large.

examples/llama-eval/llama-server-simulator.py (new executable file, 317 lines)
@@ -0,0 +1,317 @@
#!/usr/bin/env python3

import argparse
import json
import random
import re
import time
import sys
import os
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from typing import Dict, List, Optional
from dataclasses import dataclass
from pathlib import Path

import datasets

# Set cache directory for HuggingFace datasets
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
cache_dir.mkdir(parents=True, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)


def dice(s1: str, s2: str) -> float:
    """Calculate Dice coefficient between two strings based on bigram overlap."""
    if not s1 and not s2:
        return 1.0

    def _bigrams(s: str):
        return [s[i : i + 2] for i in range(len(s) - 1)]

    bigrams1 = _bigrams(s1)
    bigrams2 = _bigrams(s2)

    if not bigrams1 and not bigrams2:
        return 1.0

    from collections import Counter

    freq1 = Counter(bigrams1)
    freq2 = Counter(bigrams2)

    intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
    dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
    return dice_coeff


def debug_log(message: str):
    """Log debug messages to both stdout and a file"""
    print(message, file=sys.stderr)
    with open("/tmp/simulator-debug.log", "a") as f:
        f.write(message + "\n")


simulator: Optional["Simulator"] = None


@dataclass
class EvalState:
    id: str
    tasks: List[str]
    task_states: Dict[str, Dict]
    sampling_config: Dict


def normalize_number(s: str) -> Optional[int]:
    match = re.match(r"\d+", s)  # match digits from the start
    if not match:
        return None
    return int(match.group(0))


class AimeDataset:
    def __init__(self, split: str = "train"):
        self.split = split
        self.questions: List[Dict] = []
        self._load_dataset()

    def _load_dataset(self):
        print(f"Loading AIME dataset (split: {self.split})...")

        cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
        if cache_path.exists():
            print(f"Using cached dataset from {cache_path}")
            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
        else:
            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)

        self.questions = list(ds)
        print(f"AIME dataset loaded: {len(self.questions)} questions")

    def find_question(self, request_text: str) -> Optional[Dict]:
        best_match = None
        best_distance = -1
        best_index = -1

        for i, question in enumerate(self.questions):
            question_text = question["problem"]
            request_lower = request_text.lower()
            question_lower = question_text.lower()

            # Exact match
            if question_lower == request_lower:
                debug_log(f"DEBUG: Found exact match at index {i}")
                return question

            # Remove LaTeX formatting for more flexible matching
            question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
            if question_no_latex.lower() == request_lower:
                debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
                return question

            # Calculate Dice coefficient for partial matches
            # Only consider if request is at least 50% of question length
            if len(request_lower) >= len(question_lower) * 0.5:
                distance = dice(question_lower, request_lower)

                if distance > best_distance:
                    best_distance = distance
                    best_match = question
                    best_index = i

        if best_match and best_distance > 0.3:  # Threshold for partial match
            debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
            return best_match

        debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
        return None

    def get_answer(self, question: Dict) -> str:
        answer = question["answer"]
        if isinstance(answer, str):
            normalized = normalize_number(answer)
            return str(normalized) if normalized is not None else answer
        return str(answer)


class Simulator:
    def __init__(
        self,
        port: int = 8033,
        host: str = "localhost",
        success_rate: float = 0.8,
        dataset_split: str = "train"
    ):
        self.port = port
        self.host = host
        self.success_rate = success_rate
        self.dataset = AimeDataset(dataset_split)
        self.eval_state = EvalState(
            id="aime-2025",
            tasks=["aime"],
            task_states={},
            sampling_config={"temperature": 0, "max_tokens": 2048}
        )

    def _generate_response(
        self,
        question: Dict,
        should_be_correct: bool
    ) -> Dict:
        expected_answer = self.dataset.get_answer(question)

        if should_be_correct:
            response_text = expected_answer
        else:
            response_text = self._generate_wrong_answer(question)

        return {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "llama",
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response_text
                    },
                    "finish_reason": "stop"
                }
            ],
            "usage": {
                "prompt_tokens": 100,
                "completion_tokens": 50,
                "total_tokens": 150
            }
        }

    def _generate_wrong_answer(self, question: Dict) -> str:
        expected_answer = self.dataset.get_answer(question)

        if expected_answer.isdigit():
            wrong_answer = str(int(expected_answer) + 1)
        else:
            wrong_answer = expected_answer + " (wrong)"

        return wrong_answer

    def _process_request(self, request_data: Dict) -> Dict:
        messages = request_data.get("messages", [])
        if not messages:
            return {"error": "No messages in request"}

        request_text = messages[0].get("content", "")
        debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")

        question = self.dataset.find_question(request_text)
        if not question:
            debug_log(f"DEBUG: find_question returned None")
            return {"error": "No matching question found"}

        should_be_correct = random.random() < self.success_rate

        response = self._generate_response(question, should_be_correct)

        task_id = "aime"
        self.eval_state.task_states[task_id] = {
            "correct": should_be_correct,
            "expected": self.dataset.get_answer(question),
            "predicted": response["choices"][0]["message"]["content"]
        }

        return response


class RequestHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path != "/v1/chat/completions":
            self._send_json({"error": "Not found"}, 404)
            return

        try:
            content_length = int(self.headers.get("Content-Length", 0))
            body = self.rfile.read(content_length)
            request_data = json.loads(body) if body else None

            if not request_data:
                self._send_json({"error": "Invalid JSON"}, 400)
                return

            if simulator is None:
                self._send_json({"error": "Simulator not initialized"}, 500)
                return

            response = simulator._process_request(request_data)
            self._send_json(response, 200)

        except json.JSONDecodeError:
            self._send_json({"error": "Invalid JSON"}, 400)
        except Exception as e:
            print(f"Error processing request: {e}")
            self._send_json({"error": str(e)}, 500)

    def _send_json(self, data: dict, status: int = 200):
        body = json.dumps(data).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        # Suppress default request logging
        pass


def main():
    parser = argparse.ArgumentParser(
        description="llama-server simulator for testing eval scripts"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8033,
        help="Server port (default: 8033)"
    )
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="Server host (default: localhost)"
    )
    parser.add_argument(
        "--success-rate",
        type=float,
        default=0.8,
        help="Success rate 0-1 (default: 0.8)"
    )
    parser.add_argument(
        "--dataset-split",
        type=str,
        default="train",
        help="AIME dataset split to use (default: train)"
    )

    args = parser.parse_args()

    global simulator
    simulator = Simulator(
        port=args.port,
        host=args.host,
        success_rate=args.success_rate,
        dataset_split=args.dataset_split
    )

    server = HTTPServer((args.host, args.port), RequestHandler)
    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
    server_thread.start()

    print("\n=== llama-server-simulator ===")
    print(f"Server running on http://{args.host}:{args.port}")
    print(f"Success rate: {args.success_rate}")
    print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
    print("\nPress Ctrl+C to stop\n")

    try:
        server_thread.join()
    except KeyboardInterrupt:
        print("\nShutting down...")
        server.shutdown()


if __name__ == "__main__":
    main()
examples/llama-eval/test-simulator.sh (new executable file, 86 lines)

@@ -0,0 +1,86 @@
#!/bin/bash

set -e

# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "=== llama-server-simulator Test Script ==="
echo ""

PORT=8033
SUCCESS_RATE=0.8
TEST_PORT=8034

echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
source "$SCRIPT_DIR/venv/bin/activate"
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
SIMULATOR_PID=$!

echo "Waiting for simulator to start..."
sleep 5

# Helper function to make a request and extract the answer
make_request() {
    local question="$1"
    curl -s -X POST http://localhost:$PORT/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d "{
            \"model\": \"llama\",
            \"messages\": [
                {\"role\": \"user\", \"content\": \"$question\"}
            ],
            \"temperature\": 0,
            \"max_tokens\": 2048
        }" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
}

# Test question (repeated in multiple tests)
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."

echo ""
echo "=== Test 1: Correct Answer ==="
echo "Sending request with known question..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"

echo ""
echo "=== Test 2: Wrong Answer ==="
echo "Sending request with known question (success rate 0.0)..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"

echo ""
echo "=== Test 3: No Matching Question ==="
echo "Sending request with non-matching text..."
response=$(make_request "What is the capital of France?")
echo "Response: $response"
echo "Expected: No matching question found"
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"

echo ""
echo "=== Test 4: Success Rate Verification ==="
echo "Sending 10 requests to test success rate..."
correct_count=0
for i in {1..10}; do
    answer=$(make_request "$TEST_QUESTION")
    if [ "$answer" == "116" ]; then
        correct_count=$((correct_count + 1))
    fi
    echo "  Request $i: Answer = $answer"
done
echo "Correct answers: $correct_count/10"
echo "Expected: ~8/10 (80% success rate)"
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"

echo ""
echo "=== Test Complete ==="
echo "Stopping simulator..."
kill $SIMULATOR_PID 2>/dev/null
wait $SIMULATOR_PID 2>/dev/null || true

echo "Simulator stopped."
flake.lock (generated file, 58 lines deleted)

@@ -1,58 +0,0 @@
{
  "nodes": {
    "flake-parts": {
      "inputs": {
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
        "lastModified": 1730504689,
        "narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
        "rev": "506278e768c2a08bec68eb62932193e341f55c90",
        "type": "github"
      },
      "original": {
        "owner": "hercules-ci",
        "repo": "flake-parts",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1732014248,
        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
        "owner": "NixOS",
        "repo": "nixpkgs",
        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
        "ref": "nixos-unstable",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs-lib": {
      "locked": {
        "lastModified": 1730504152,
        "narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
        "type": "tarball",
        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
      },
      "original": {
        "type": "tarball",
        "url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
      }
    },
    "root": {
      "inputs": {
        "flake-parts": "flake-parts",
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 11)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_PATCH 1)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
ggml/src/ggml-cuda/allreduce.cu (new file, 968 lines)
@@ -0,0 +1,968 @@
#include "allreduce.cuh"

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

#include "convert.cuh"
#include "ggml-impl.h"

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <limits>

// ---------------------------------------------------------------------------
// CUDA AllReduce for tensor-parallel inference across two GPUs.
//
// Provides an in-place sum reduction over matching tensors on two CUDA
// devices in the same process. Used by the tensor-split path alongside
// NCCL; targets setups without NVLink, where data is exchanged between the
// GPUs by staging it through pinned host memory over PCIe.
//
// Two reduction strategies are selected per call by tensor size:
//
// * Chunked kernel path (small reductions): a single CUDA kernel both
//   stages data through pinned host memory and performs the local sum.
//   Cross-GPU synchronization happens *inside the kernel* (busy-wait on
//   a host-memory flag), which keeps launch overhead low for the
//   latency-sensitive token-generation case.
//
// * Copy-engine path (large reductions): the transfer is split into
//   D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
//   followed by a small device-side add kernel. Cross-GPU
//   synchronization happens *outside the kernel*, via CUDA events
//   between streams. This keeps the compute engine free while large
//   transfers are in flight, which matters for prefill-sized tensors.
//   Reductions larger than the per-call inner cap are processed by an
//   outer chunker that issues sequential inner calls.
// ---------------------------------------------------------------------------

// ---------------------------------------------------------------------------
// Cross-GPU signal mechanism
//
// One int per (slot, rank) pair in pinned host memory. Each AR call writes a
// strictly increasing token (= the AR call number) into its own arrival int.
// The peer spins until its read of the other's arrival int equals the token
// it expects for this call -- a mismatch means the peer hasn't arrived yet.
// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
// days at thousands of ARs/sec), so arrival ints don't need to be reset
// between calls; we initialize once at pipeline init and let the values
// accumulate.
//
// There is exactly one writer (the owning GPU) and one reader (the peer), so
// we don't need atomics. A volatile store paired with __threadfence_system()
// provides the release ordering that makes the D2H writes visible system-wide
// before the arrival token is observed.
//
// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
// portable choice.
// ---------------------------------------------------------------------------

static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
    *(volatile int *)p = token;
}
static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
    return *(const volatile int *)p;
}

// Byte spacing between adjacent arrival ints. 64 bytes (one cache line)
// ensures each GPU/block's arrival slot lives on its own line, preventing
// false-sharing stalls on the polling GPU.
static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;

// Number of blocks the chunked kernel launches with. Each block stripes a
// disjoint slice of the data and synchronizes through its own arrival-token
// slot so multiple SMs can pump PCIe stores in parallel.
static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;

// ---------------------------------------------------------------------------
// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
//
// Both GPUs run this kernel simultaneously on independent streams. sendbuf
// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When
// T_dst == T_wire the casts below are no-ops.
//
// Each GPU runs three phases:
//
// Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
//     single-instruction-width vectors into host_mine.
//     __threadfence_system() commits these writes to host
//     memory.
// Phase 2 (thread 0): write token to arrival_mine; spin until
//     arrival_other == token.
// Phase 3 (all threads): read T_wire vectors from host_other, cast
//     each element to T_dst, and sum with the local
//     sendbuf value (also rounded through T_wire so that
//     both GPUs truncate identically -- this guarantees
//     bit-equivalent results across the two devices).
//
// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
// threads to keep multiple SMs issuing PCIe stores in parallel. Each block
// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
// thread 0 of each block signals/spins on that slot independently of other
// blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are
// handled only by block 0 to avoid cross-block writes to the same slots.
// ---------------------------------------------------------------------------
template <typename T_dst, typename T_wire>
static __global__ void ggml_cuda_ar_kernel(
        const T_dst * sendbuf,
        T_dst * recvbuf,
        T_wire * __restrict__ host_mine,
        const T_wire * __restrict__ host_other,
        int count,
        int * arrival_mine,
        int * arrival_other,
        int token) {

    // Vector unit for the wire type, sized to the arch's widest single-instruction
    // copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory;
    // each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
    constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
    constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));

    const int tid = threadIdx.x;
    const int nt = blockDim.x;
    const int bid = blockIdx.x;
    const int gtid = bid * nt + tid;
    const int gnt = gridDim.x * nt;
    const int count_vec = count / ELEMS_PER_VEC;
    const int tail = count_vec * ELEMS_PER_VEC;

    // Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
    {
        for (int i = gtid; i < count_vec; i += gnt) {
            const int off = i * ELEMS_PER_VEC;
            T_wire wire[ELEMS_PER_VEC];
#pragma unroll
            for (int k = 0; k < ELEMS_PER_VEC; ++k) {
                wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
            }
            ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
        }
        if (bid == 0 && tid < count - tail) {
            host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
        }
    }

    // Commit this block's host writes before signalling.
    __threadfence_system();
    __syncthreads();

    // Phase 2: thread 0 of each block signals on its own arrival slot, then
    // spins for the matching slot from peer. Per-block tokens mean blocks
    // proceed independently -- no inter-block barrier needed.
    if (tid == 0) {
        int * my_slot = arrival_mine + bid * ARRIVAL_INTS;
        const int * other_slot = arrival_other + bid * ARRIVAL_INTS;

        ggml_cuda_ar_signal_set(my_slot, token);
        __threadfence_system(); // make our signal visible system-wide

        while (ggml_cuda_ar_signal_get(other_slot) != token) {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
            __nanosleep(100);
#else
            NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
        }
    }

    __syncthreads();

    // Acquire peer's host_other writes (this block's stripe of them).
    __threadfence_system();

    // Phase 3: read peer's T_wire vector, cast both sides through T_wire for
    // bit-equivalence, sum in T_dst precision, and write back to recvbuf.
    {
        for (int i = gtid; i < count_vec; i += gnt) {
            const int off = i * ELEMS_PER_VEC;
            T_wire wire[ELEMS_PER_VEC];
            ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
#pragma unroll
            for (int k = 0; k < ELEMS_PER_VEC; ++k) {
                const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
                recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
            }
        }
        if (bid == 0 && tid < count - tail) {
            const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
            recvbuf[tail + tid] =
                ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
        }
    }
}

// Combined load-convert-add kernel. The peer's contribution arrives as T_src
// (which may be a lower-precision type than T_dst when the BF16 round-trip is
// active). For bit-equivalence between the two GPUs, dst is first rounded
// through T_src's precision via ggml_cuda_cast -- peer already truncated its
// own value the same way before sending -- so both sides perform identical
// arithmetic. When T_dst == T_src the round-trip cast is a no-op.
template <typename T_dst, typename T_src>
static __global__ void ggml_cuda_ar_add_kernel(
        T_dst * __restrict__ dst,
        const T_src * __restrict__ src,
        int count) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int nt = gridDim.x * blockDim.x;
    for (int i = tid; i < count; i += nt) {
        const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
        dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
    }
}

// ---------------------------------------------------------------------------
// Pipeline structure
// ---------------------------------------------------------------------------

// Number of slots in the event / arrival ring. Two slots is sufficient:
// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
// from AR N-2 by the time we get to AR N. acquire_slot's
// cudaEventSynchronize on ev.ker for both devices makes that consumption
// explicit before we overwrite host_buf[slot] for the new AR.
static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;

// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
// Larger tensors are reduced by issuing multiple chunked launches.
static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB

// Copy-engine path: largest tensor accepted on this path; sets host_large /
// dev_tmp allocation size.
static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB

// AR wire size at which the copy-engine path takes over from the chunked-
// kernel path. Override via GGML_CUDA_AR_COPY_THRESHOLD.
static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
// the peer); the clamps cover the cases where nbytes/4 is too small (per-
// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024; // 512 KB
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024; // 2 MB
// Absolute floor that an env-var override is allowed to set; this caps the
// per-slot copy-event array. 256 KB -> up to 128 chunks per 32 MB tensor.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
    static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
                     GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
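Aside on the tuning knobs above: the `GGML_CUDA_AR_*` environment variables are read once at pipeline init (see `ggml_cuda_ar_env_u64` below). A hedged sketch of overriding them when launching a two-GPU tensor-split run; the binary name, model path, and the chosen values are placeholders, not part of this change:

```bash
# Sketch only: take the copy-engine path from 512 KB upward, pin the copy-engine
# chunk size to 1 MB, and disable the FP32->BF16 wire round-trip.
GGML_CUDA_AR_COPY_THRESHOLD=$((512*1024)) \
GGML_CUDA_AR_COPY_CHUNK_BYTES=$((1024*1024)) \
GGML_CUDA_AR_BF16_THRESHOLD=0 \
./llama-server -m model.gguf   # placeholder binary and flags
```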
struct ggml_cuda_ar_event_slot {
|
||||
cudaEvent_t app = nullptr; // upstream computation complete
|
||||
cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete
|
||||
cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream)
|
||||
cudaEvent_t ker = nullptr; // AllReduce kernel complete
|
||||
};
|
||||
|
||||
// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
|
||||
// in one place, with the host handle preserved for cudaFreeHost. Used where
|
||||
// the CPU never touches the buffer -- only the device reads/writes via the
|
||||
// mapped device pointer. Required on systems where cudaDevAttrCanUseHost-
|
||||
// PointerForRegisteredMem is 0 and the host pointer can't be used as a
|
||||
// device pointer.
|
||||
struct ggml_cuda_ar_host_mapping {
|
||||
uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
|
||||
uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset
|
||||
|
||||
cudaError_t alloc(size_t bytes) {
|
||||
cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
|
||||
cudaHostAllocPortable | cudaHostAllocMapped);
|
||||
if (rc != cudaSuccess) {
|
||||
host = nullptr;
|
||||
return rc;
|
||||
}
|
||||
rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
|
||||
if (rc != cudaSuccess) {
|
||||
cudaFreeHost(host);
|
||||
host = nullptr;
|
||||
dev = nullptr;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
void free() {
|
||||
if (host) {
|
||||
cudaFreeHost(host);
|
||||
host = nullptr;
|
||||
dev = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_cuda_ar_pipeline {
|
||||
int n_devices;
|
||||
int devices[GGML_CUDA_MAX_DEVICES];
|
||||
size_t buf_bytes; // bytes per device in host_buf[]
|
||||
size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[]
|
||||
size_t copy_threshold;
|
||||
size_t copy_chunk_bytes;
|
||||
size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
|
||||
uint64_t call_count;
|
||||
|
||||
// Per-device resources.
|
||||
ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel)
|
||||
ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
|
||||
char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path
|
||||
cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking
|
||||
ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
|
||||
|
||||
// Copy-engine: per-device "I finished reading my peer's host_large"
|
||||
// event. Indexed by RECORDER device. Recorded same-device on streams[i]
|
||||
// after stage 2's last H2D from host_large[peer]. Waited cross-device
|
||||
// by peer's stage-1 stream before the next AR overwrites host_large[peer].
|
||||
cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES];
|
||||
bool host_large_read_done_valid;
|
||||
|
||||
// Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
|
||||
// Recorded on the compute stream after each add_kernel; the AR stream
|
||||
// waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us
|
||||
// single-buffer dev_tmp despite add_kernel running on a separate stream.
|
||||
cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
|
||||
bool dev_tmp_kernel_done_valid;
|
||||
|
||||
// Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned
|
||||
// memory; CPU never reads/writes -- only the kernel and cudaMemset.
|
||||
// Use ggml_cuda_ar_arrival_ptr() to index.
|
||||
ggml_cuda_ar_host_mapping arrival;
|
||||
};
|
||||
|
||||
// Base pointer for the (slot, rank) per-block token block. The kernel adds
|
||||
// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
|
||||
static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
|
||||
const size_t offset = ((size_t)slot * p->n_devices + rank) *
|
||||
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
|
||||
return reinterpret_cast<int *>(p->arrival.dev + offset);
|
||||
}
|
||||
|
||||
static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
|
||||
const char * value = getenv(name);
|
||||
if (value == nullptr || value[0] == '\0') {
|
||||
return default_value;
|
||||
}
|
||||
|
||||
char * end = nullptr;
|
||||
const unsigned long long parsed = strtoull(value, &end, 10);
|
||||
return end != value ? (uint64_t) parsed : default_value;
|
||||
}
|
||||
|
||||
struct ggml_cuda_ar_slot_info {
|
||||
int slot;
|
||||
int token;
|
||||
};
|
||||
|
||||
static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
|
||||
const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
|
||||
const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
|
||||
p->call_count++;
|
||||
|
||||
if (pool_lapped) {
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
|
||||
}
|
||||
}
|
||||
|
||||
return { slot, (int) p->call_count };
|
||||
}
|
||||
|
||||
// Per-AR copy-engine chunk size: env-var override if set, else heuristic
|
||||
// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
|
||||
static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
|
||||
if (p->copy_chunk_bytes > 0) {
|
||||
return p->copy_chunk_bytes;
|
||||
}
|
||||
return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
|
||||
std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
|
||||
}
|
||||
|
||||
static void ggml_cuda_ar_wait_for_compute(
|
||||
ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
|
||||
ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
|
||||
CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Init / free
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
|
||||
|
||||
if (n_devices != 2) {
|
||||
GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
|
||||
"falling back\n", __func__, n_devices);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
const int cc = ggml_cuda_info().devices[devices[i]].cc;
|
||||
if (cc < GGML_CUDA_CC_VOLTA) {
|
||||
GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
|
||||
"(device %d has cc=%d); falling back\n",
|
||||
__func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto * p = new ggml_cuda_ar_pipeline{};
|
||||
p->n_devices = n_devices;
|
||||
p->copy_bytes = GGML_CUDA_AR_COPY_MAX_BYTES;
|
||||
p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
|
||||
// 0 = use the per-call heuristic (default). Non-zero env value forces a
|
||||
// fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
|
||||
p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
|
||||
if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
|
||||
GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
|
||||
__func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
|
||||
p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
|
||||
}
|
||||
// Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
|
||||
// ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
|
||||
// byte threshold to opt out for small tensors.
|
||||
p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
p->devices[i] = devices[i];
|
||||
}
|
||||
|
||||
// Per-device streams and event pools.
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
|
||||
cudaStream_t stream = nullptr;
|
||||
if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
|
||||
__func__, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
p->streams[i] = stream;
|
||||
|
||||
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
|
||||
bool ok =
|
||||
cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
|
||||
cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
|
||||
cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
|
||||
for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
|
||||
ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
|
||||
}
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
|
||||
__func__, p->devices[i], s);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
|
||||
__func__, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
|
||||
__func__, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Arrival ring: cache-line padded so each GPU's int is on its own line.
|
||||
const size_t arrival_bytes =
|
||||
(size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
|
||||
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
|
||||
if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
|
||||
__func__, arrival_bytes);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
ggml_cuda_set_device(p->devices[0]);
|
||||
if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
|
||||
__func__, arrival_bytes);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
|
||||
// kernel can write the next slot's data while the peer is still reading
|
||||
// the previous slot's. Indexed by (slot * buf_bytes) at the call site.
|
||||
p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
|
||||
const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
|
||||
__func__, host_buf_total);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy-engine path: pinned host staging + device scratch, sized for the
|
||||
// largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
|
||||
// dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
|
||||
// cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
|
||||
for (size_t i = 0; i < n_devices; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
|
||||
__func__, p->copy_bytes);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
|
||||
GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
|
||||
__func__, p->copy_bytes, p->devices[i]);
|
||||
ggml_cuda_ar_pipeline_free(p);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
|
||||
"%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
|
||||
__func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
|
||||
if (!p) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Drain all in-flight kernels before tearing down resources.
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
if (p->streams[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaStreamSynchronize(p->streams[i]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
p->host_buf[i].free();
|
||||
p->host_large[i].free();
|
||||
if (p->dev_tmp[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaFree(p->dev_tmp[i]);
|
||||
}
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
|
||||
if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); }
|
||||
for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
|
||||
if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); }
|
||||
}
|
||||
if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); }
|
||||
if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); }
|
||||
}
|
||||
if (p->host_large_read_done[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaEventDestroy(p->host_large_read_done[i]);
|
||||
}
|
||||
if (p->dev_tmp_kernel_done[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaEventDestroy(p->dev_tmp_kernel_done[i]);
|
||||
}
|
||||
if (p->streams[i]) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cudaStreamDestroy(p->streams[i]);
|
||||
}
|
||||
}
|
||||
p->arrival.free();
|
||||
delete p;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Dispatch
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
|
||||
// nbytes per ne element); accumulated locally into a T_dst buffer. When
|
||||
// T_src == T_dst this is the original homogeneous reduction. When they differ
|
||||
// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
|
||||
// for bit-equivalence between GPUs and we skip the otherwise-needed
|
||||
// post-conversion entirely.
|
||||
template <typename T_src, typename T_dst>
|
||||
static bool ggml_cuda_ar_allreduce_copy_impl(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
|
||||
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
|
||||
const bool compute[GGML_CUDA_MAX_DEVICES],
|
||||
int64_t ne,
|
||||
size_t nbytes) {
|
||||
GGML_ASSERT(p->n_devices == 2);
|
||||
GGML_ASSERT(nbytes <= p->copy_bytes);
|
||||
GGML_ASSERT(ne <= std::numeric_limits<int>::max());
|
||||
|
||||
const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
|
||||
GGML_ASSERT(chunk_bytes > 0);
|
||||
|
||||
const int slot = ggml_cuda_ar_acquire_slot(p).slot;
|
||||
const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
|
||||
GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
|
||||
|
||||
ggml_backend_cuda_context * cuda_ctx[2] = {};
|
||||
|
||||
// Stage 1: both GPUs copy their local contribution to pinned host memory.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
|
||||
|
||||
ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
|
||||
|
||||
// Wait for peer's H2D from our host_large[i] (recorded in the
|
||||
// previous AR's stage 2) to complete before we overwrite host_large[i].
|
||||
// host_large_read_done[peer] = peer finished reading host_large[i].
|
||||
// No-op on the first AR -- no prior record exists.
|
||||
if (p->host_large_read_done_valid) {
|
||||
const int peer = 1 - i;
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
|
||||
}
|
||||
|
||||
if (!compute[i]) {
|
||||
CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < copy_chunks; ++c) {
|
||||
const size_t offset = c * chunk_bytes;
|
||||
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
|
||||
(nbytes - offset) : chunk_bytes;
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
|
||||
cudaMemcpyDeviceToHost, p->streams[i]));
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
|
||||
}
|
||||
}
|
||||
|
||||
// Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
|
||||
// local device scratch (dev_tmp), then performs one device-local add over
|
||||
// the assembled peer tensor. The H2Ds run on the AR stream (copy engine)
|
||||
// and the add_kernel runs on the caller's compute stream, so the AR stream
|
||||
// stays pure-copy and avoids an in-stream copy->compute engine switch every
|
||||
// AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the
|
||||
// prior AR's add_kernel-done event before overwriting it.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
const int peer = 1 - i;
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
|
||||
// Wait for the previous AR's add_kernel (on the compute stream) to
|
||||
// finish reading dev_tmp before our H2D overwrites it. No-op on the
|
||||
// first copy_impl call.
|
||||
if (p->dev_tmp_kernel_done_valid) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < copy_chunks; ++c) {
|
||||
const size_t offset = c * chunk_bytes;
|
||||
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
|
||||
(nbytes - offset) : chunk_bytes;
|
||||
|
||||
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
|
||||
CUDA_CHECK(cudaMemcpyAsync(
|
||||
p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
|
||||
cudaMemcpyHostToDevice, p->streams[i]));
|
||||
}
|
||||
|
||||
// Mark our reads of host_large[peer] complete so peer's next AR can
|
||||
// safely overwrite it.
|
||||
CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
|
||||
|
||||
// Hand off from AR stream (copy engine) to compute stream: compute
|
||||
// stream waits for all H2Ds to finish, then runs the add_kernel.
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
|
||||
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
|
||||
|
||||
const int block_size = 256;
|
||||
int n_blocks = (int) ((ne + block_size - 1) / block_size);
|
||||
if (n_blocks > 1024) {
|
||||
n_blocks = 1024;
|
||||
}
|
||||
ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
|
||||
dst_buf[i],
|
||||
reinterpret_cast<const T_src *>(p->dev_tmp[i]),
|
||||
(int) ne);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
// Record dev_tmp-released on the compute stream so the next copy_impl
|
||||
// can wait for the kernel to finish before overwriting dev_tmp. Also
|
||||
// record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
|
||||
CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
|
||||
}
|
||||
p->host_large_read_done_valid = true;
|
||||
p->dev_tmp_kernel_done_valid = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
|
||||
// the host_large / dev_tmp allocation size). When the full AR exceeds that,
|
||||
// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
|
||||
// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
|
||||
// slot, so cross-AR fences and pool wraparound work the same way as for any
|
||||
// other sequence of small ARs.
|
||||
template <typename T_src, typename T_dst>
|
||||
static bool ggml_cuda_ar_allreduce_copy_outer(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
|
||||
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
|
||||
const bool compute[GGML_CUDA_MAX_DEVICES],
|
||||
int64_t ne) {
|
||||
const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
|
||||
GGML_ASSERT(outer_max_elems > 0);
|
||||
|
||||
bool ok = true;
|
||||
for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
|
||||
const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start);
|
||||
const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src);
|
||||
|
||||
T_src * src[GGML_CUDA_MAX_DEVICES] = {};
|
||||
T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < p->n_devices; ++i) {
|
||||
src[i] = src_buf[i] + outer_start;
|
||||
dst[i] = dst_buf[i] + outer_start;
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
|
||||
p, backends, src, dst, compute, outer_ne, outer_nbytes);
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
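Note (not part of the diff): to make the slicing arithmetic of the outer chunker concrete, here is a standalone sketch with invented numbers -- a hypothetical 64 MiB staging allocation and a 192 MiB F32 reduction -- showing how many copy_impl calls the loop above would issue.

// --- illustrative sketch, not from the source tree ---
// copy_bytes limits how much one copy_impl call can stage, so a larger
// reduction is cut into ceil(ne / outer_max_elems) slices.
#include <cstdint>
#include <cstdio>

int main() {
    const size_t  copy_bytes      = 64ull << 20;   // hypothetical 64 MiB staging buffer
    const int64_t ne              = 48ll  << 20;   // 48M F32 elements = 192 MiB
    const int64_t outer_max_elems = (int64_t) (copy_bytes / sizeof(float)); // 16M elements per slice

    int n_slices = 0;
    for (int64_t start = 0; start < ne; start += outer_max_elems) {
        ++n_slices;
    }
    std::printf("%d slices of at most %lld elements\n", n_slices, (long long) outer_max_elems);
    // -> 3 slices of at most 16777216 elements
    return 0;
}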
bool ggml_cuda_ar_allreduce(
|
||||
ggml_cuda_ar_pipeline * p,
|
||||
ggml_backend_t * backends,
|
||||
ggml_tensor ** tensors) {
|
||||
GGML_ASSERT(p != nullptr);
|
||||
|
||||
const int n = p->n_devices;
|
||||
GGML_ASSERT(n == 2);
|
||||
|
||||
const ggml_type input_type = tensors[0]->type;
|
||||
GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
|
||||
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
GGML_ASSERT(ne > 0);
|
||||
|
||||
const size_t input_nbytes = ggml_nbytes(tensors[0]);
|
||||
|
||||
// BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
|
||||
// the reduction (chunked or copy-engine), halving on-wire bytes. Matches
|
||||
// NCCL's behaviour. The pre-conversion zeroes inactive shards so the
|
||||
// inner paths see them as already-prepared compute tensors.
|
||||
const bool use_bf16 =
|
||||
input_type == GGML_TYPE_F32 &&
|
||||
p->bf16_threshold > 0 &&
|
||||
input_nbytes >= p->bf16_threshold;
|
||||
|
||||
const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
|
||||
const size_t type_size = ggml_type_size(kernel_type);
|
||||
GGML_ASSERT(p->buf_bytes >= type_size);
|
||||
const size_t nbytes = (size_t) ne * type_size;
|
||||
|
||||
bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
|
||||
}
|
||||
|
||||
// Decide between copy-engine and chunked kernel paths based on the working
|
||||
// type's actual byte count. No upper bound: copy_outer slices reductions
|
||||
// larger than copy_bytes into copy_bytes-sized pieces.
|
||||
const bool use_copy_engine =
|
||||
p->copy_threshold > 0 &&
|
||||
nbytes >= p->copy_threshold;
|
||||
|
||||
// BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
|
||||
// (chunked kernel path) and the combined add kernel (copy_engine path)
|
||||
// both accumulate into the F32 tensor data directly, so an inactive
|
||||
// shard's accumulator must start at zero.
|
||||
if (use_bf16) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
if (!compute_flag[i]) {
|
||||
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
|
||||
// path; the chunked kernel path's combined kernel does the conversion
|
||||
// inline as it writes to host_buf.
|
||||
ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
|
||||
void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
|
||||
|
||||
if (use_copy_engine && use_bf16) {
|
||||
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
|
||||
bf16_tmp[i].pool = &cuda_ctx->pool();
|
||||
bf16_tmp[i].alloc(ne);
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
if (compute_flag[i]) {
|
||||
to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
|
||||
}
|
||||
copy_src_ptr[i] = bf16_tmp[i].get();
|
||||
}
|
||||
}
|
||||
|
||||
bool ok = true;
|
||||
if (use_copy_engine) {
|
||||
// After up-front BF16 conversion, the tmp buffers already hold the
|
||||
// (possibly zeroed-for-inactive) data, so the inner path can treat
|
||||
// every shard as compute.
|
||||
bool inner_compute[GGML_CUDA_MAX_DEVICES];
|
||||
for (int i = 0; i < n; ++i) {
|
||||
inner_compute[i] = use_bf16 ? true : compute_flag[i];
|
||||
}
|
||||
|
||||
// Dispatch into copy_impl with explicit src/dst types. When use_bf16
|
||||
// is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
|
||||
// is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
|
||||
// through BF16 for bit-equivalence and writes F32 directly, so no
|
||||
// post-conversion is needed. Otherwise src == dst (same native type).
|
||||
if (use_bf16) {
|
||||
GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
|
||||
nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
|
||||
float * dst[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
|
||||
dst[i] = static_cast<float *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
|
||||
p, backends, src, dst, inner_compute, ne);
|
||||
} else {
|
||||
switch (kernel_type) {
|
||||
case GGML_TYPE_F32: {
|
||||
float * buf[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
buf[i] = static_cast<float *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
|
||||
p, backends, buf, buf, inner_compute, ne);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_BF16: {
|
||||
nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
|
||||
p, backends, buf, buf, inner_compute, ne);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
half * buf[GGML_CUDA_MAX_DEVICES] = {};
|
||||
for (int i = 0; i < n; ++i) {
|
||||
buf[i] = static_cast<half *>(tensors[i]->data);
|
||||
}
|
||||
ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
|
||||
p, backends, buf, buf, inner_compute, ne);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// host_buf carries T_wire-typed data; max_chunk_elems is the count that
|
||||
// fits in one host_buf at the wire size.
|
||||
const size_t max_chunk_elems = p->buf_bytes / type_size;
|
||||
const size_t input_type_size = ggml_type_size(input_type);
|
||||
|
||||
// Chunked kernel path runs entirely on the caller's compute stream:
|
||||
// since AR is a barrier here, same-stream ordering subsumes any
|
||||
// cross-stream event handshake that the copy-engine path needs, and
|
||||
// skips the cross-stream scheduling overhead that was hurting the
|
||||
// small-tensor (tg) latency on the AR-stream variant. Only ev.ker is
|
||||
// still recorded at end-of-AR for acquire_slot's pool-wraparound check.
|
||||
for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
|
||||
const size_t remaining_elems = (size_t) (ne - chunk_start);
|
||||
const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
|
||||
const size_t chunk_dst_bytes = chunk_elems * input_type_size;
|
||||
|
||||
const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
|
||||
const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
|
||||
|
||||
for (int i = 0; i < n; ++i) {
|
||||
const int peer = 1 - i; // valid for n == 2 only
|
||||
ggml_cuda_set_device(p->devices[i]);
|
||||
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
|
||||
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
|
||||
cudaStream_t stream = cuda_ctx->stream();
|
||||
|
||||
char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;
|
||||
|
||||
// Match NCCL/meta-backend semantics: inactive shards contribute
|
||||
// zeros. On the BF16 path the F32 tensor data was already
|
||||
// zeroed up-front (above), so per-chunk zeroing isn't needed.
|
||||
if (!compute_flag[i] && !use_bf16) {
|
||||
CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
|
||||
}
|
||||
|
||||
#define LAUNCH_AR_KERNEL(T_dst, T_wire) \
|
||||
ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
|
||||
reinterpret_cast<const T_dst *>(data), \
|
||||
reinterpret_cast<T_dst *>(data), \
|
||||
reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
|
||||
reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
|
||||
static_cast<int>(chunk_elems), \
|
||||
ggml_cuda_ar_arrival_ptr(p, slot, i), \
|
||||
ggml_cuda_ar_arrival_ptr(p, slot, peer), \
|
||||
token)
|
||||
|
||||
if (use_bf16) {
|
||||
GGML_ASSERT(input_type == GGML_TYPE_F32);
|
||||
LAUNCH_AR_KERNEL(float, nv_bfloat16);
|
||||
} else {
|
||||
switch (input_type) {
|
||||
case GGML_TYPE_F32: LAUNCH_AR_KERNEL(float, float); break;
|
||||
case GGML_TYPE_F16: LAUNCH_AR_KERNEL(half, half); break;
|
||||
case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
|
||||
default: GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
#undef LAUNCH_AR_KERNEL
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (last_chunk) {
|
||||
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ok;
|
||||
}

#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)

// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
// implementation relies on, so the internal AllReduce is a CUDA-only feature.
// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
// and silently falls back to the meta backend's generic AllReduce.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
    return nullptr;
}
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {
}
bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
    return false;
}

#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml/src/ggml-cuda/allreduce.cuh (new file, 29 lines)
@@ -0,0 +1,29 @@
#pragma once

#include "common.cuh"
#include "ggml-backend-impl.h"

#include <cstddef>

// Opaque pipeline context -- owns all pinned buffers, streams, and events.
struct ggml_cuda_ar_pipeline;

// Allocate a pipeline for n_devices GPUs.
// devices[] holds the CUDA device IDs in rank order.
// Returns nullptr on allocation failure.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
    const int * devices, size_t n_devices);

// Release all resources owned by the pipeline.
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);

// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
// tensors[i] must live on the device managed by backends[i] and be
// contiguous F32, F16, or BF16.
// Preconditions are checked by the CUDA comm dispatcher before calling this.
// Returns true once the reduction work has been enqueued successfully.
bool ggml_cuda_ar_allreduce(
    ggml_cuda_ar_pipeline * pipeline,
    ggml_backend_t * backends,
    ggml_tensor ** tensors);
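Note (not part of the diff): a minimal usage sketch of the header above, assuming two already-created CUDA backends and two matching contiguous F32 tensors; the function name and the `devices` values are illustrative, and in the tree the only real caller is the comm dispatcher in ggml-cuda.cu.

// --- hypothetical usage sketch, not from the source tree ---
#include "ggml-cuda/allreduce.cuh"

static void allreduce_example(ggml_backend_t * backends, ggml_tensor ** t) {
    const int devices[2] = { 0, 1 }; // illustrative device IDs, rank order

    ggml_cuda_ar_pipeline * p = ggml_cuda_ar_pipeline_init(devices, 2);
    if (p == nullptr) {
        // init failed (e.g. HIP/MUSA build, or an unsupported device count);
        // the caller falls back to another AllReduce implementation.
        return;
    }

    // Enqueues the in-place sum across t[0] and t[1]; returns false if the
    // inputs are unsupported and the caller must fall back.
    const bool ok = ggml_cuda_ar_allreduce(p, backends, t);
    GGML_UNUSED(ok);

    ggml_cuda_ar_pipeline_free(p);
}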
@@ -2,6 +2,7 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/allreduce.cuh"
|
||||
#include "ggml-cuda/common.cuh"
|
||||
#include "ggml-cuda/acc.cuh"
|
||||
#include "ggml-cuda/add-id.cuh"
|
||||
@@ -86,6 +87,9 @@
|
||||
|
||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
#define GGML_LOG_WARN_ONCE(str) \
|
||||
{ static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
|
||||
|
||||
[[noreturn]]
|
||||
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
||||
int id = -1; // in case cudaGetDevice fails
|
||||
@@ -1139,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
|
||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
// Communication context for multi-GPU AllReduce during tensor parallelism.
|
||||
//
|
||||
// Created once per meta backend instance. Resources for the selected mode
|
||||
// (NCCL communicators or the internal AllReduce pipeline) are initialised
|
||||
// eagerly during comm_init so any init failure surfaces at startup rather
|
||||
// than mid-run.
|
||||
struct ggml_backend_cuda_comm_context {
|
||||
using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
|
||||
|
||||
std::vector<ggml_backend_t> backends;
|
||||
std::vector<ncclComm_t> comms;
|
||||
std::vector<int> dev_ids;
|
||||
|
||||
// Set by the init chain (comm_init_{nccl, internal, none}) to one of
|
||||
// try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
|
||||
// internal needs `ar_pipeline`, butterfly needs nothing. Per-call
|
||||
// failures return false; the meta backend's generic implementation then
|
||||
// handles that call.
|
||||
try_allreduce_fn try_allreduce = nullptr;
|
||||
|
||||
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
std::vector<ncclComm_t> comms;
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
~ggml_backend_cuda_comm_context() {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (ncclComm_t comm : comms) {
|
||||
NCCL_CHECK(ncclCommDestroy(comm));
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
ggml_cuda_ar_pipeline_free(ar_pipeline);
|
||||
}
|
||||
};
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
delete comm_ctx;
|
||||
#else
|
||||
GGML_UNUSED(comm_ctx_v);
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
|
||||
std::vector<int> dev_ids;
|
||||
ret->backends.reserve(n_backends);
|
||||
dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->backends.push_back(backends[i]);
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
|
||||
dev_ids.push_back(cuda_ctx->device);
|
||||
}
|
||||
|
||||
ret->comms.resize(n_backends);
|
||||
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
|
||||
return ret;
|
||||
#else
|
||||
// If NCCL is installed it is used by default for optimal performance.
|
||||
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
|
||||
// RCCL is disabled by default, users are explicitly opting in.
|
||||
// Therefore print no warning for RCCL.
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
static bool warning_printed = false;
|
||||
if (!warning_printed) {
|
||||
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
|
||||
warning_printed = true;
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
GGML_UNUSED_VARS(backends, n_backends);
|
||||
return nullptr;
|
||||
#endif // GGML_USE_NCCL
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
|
||||
// tensors (bandwidth-bound), then converts back to FP32.
|
||||
static bool ggml_backend_cuda_comm_allreduce_nccl(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
|
||||
// This then causes a crash in this function
|
||||
@@ -1210,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_ASSERT(comm_ctx_v != nullptr);
|
||||
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
@@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
|
||||
}
|
||||
NCCL_CHECK(ncclGroupEnd());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
GGML_UNUSED_VARS(comm_ctx_v, tensors);
|
||||
return false;
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
// Run the internal AR pipeline. Returns false on unsupported / failed input
|
||||
// -- the caller decides whether to abort (env-forced) or fall back silently.
|
||||
static bool ggml_backend_cuda_comm_allreduce_internal(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
|
||||
|
||||
const size_t n_backends = comm_ctx->backends.size();
|
||||
GGML_ASSERT(n_backends == 2);
|
||||
GGML_ASSERT(tensors[0] != nullptr);
|
||||
|
||||
const int64_t ne = ggml_nelements(tensors[0]);
|
||||
const ggml_type type = tensors[0]->type;
|
||||
|
||||
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ne == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < n_backends; ++i) {
|
||||
if (tensors[i] == nullptr) {
|
||||
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
|
||||
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
|
||||
__func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
|
||||
return false;
|
||||
}
|
||||
if (!ggml_is_contiguously_allocated(tensors[i])) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
|
||||
__func__, i, ne, ggml_nbytes(tensors[i]),
|
||||
(size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
|
||||
return false;
|
||||
}
|
||||
if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
|
||||
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
|
||||
__func__, i, tensors[i]->data, (int) type, ne);
|
||||
return false;
|
||||
}
|
||||
GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
|
||||
}
|
||||
|
||||
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-call dispatch -- three variants, one per backend. Each is set as
|
||||
// comm_ctx->try_allreduce by the matching init step. Per-call failure
|
||||
// returns false; the meta backend's generic implementation handles that call.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#ifdef GGML_USE_NCCL
|
||||
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
|
||||
}
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
static bool ggml_backend_cuda_comm_try_allreduce_internal(
|
||||
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
|
||||
return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
|
||||
ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return;
|
||||
}
|
||||
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Init -- chained nccl -> internal -> none. Each step tries to bring up its
|
||||
// resource; on failure it warns and recurses into the next step.
|
||||
// ---------------------------------------------------------------------------
|
||||
static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
|
||||
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
|
||||
ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());
|
||||
if (ret->ar_pipeline) {
|
||||
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
|
||||
return;
|
||||
}
|
||||
|
||||
// Clear sticky CUDA error from the failed init.
|
||||
(void) cudaGetLastError();
|
||||
GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
|
||||
"falling back to meta-backend butterfly\n");
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
|
||||
#ifdef GGML_USE_NCCL
|
||||
const size_t n = ret->dev_ids.size();
|
||||
ret->comms.resize(n);
|
||||
ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data());
|
||||
if (rc == ncclSuccess) {
|
||||
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
|
||||
return;
|
||||
}
|
||||
|
||||
ret->comms.clear();
|
||||
GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
|
||||
ncclGetErrorString(rc));
|
||||
#else // GGML_USE_NCCL
|
||||
#ifndef GGML_USE_HIP
|
||||
GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. "
|
||||
"Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
|
||||
#endif // !GGML_USE_HIP
|
||||
#endif // GGML_USE_NCCL
|
||||
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
}
|
||||
|
||||
// Top-level init. Picks one of the three init paths based on
|
||||
// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
|
||||
// any fallback. Unrecognised env values warn and fall through to the
|
||||
// platform default.
|
||||
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
if (!ggml_backend_is_cuda(backends[i])) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto * ret = new ggml_backend_cuda_comm_context;
|
||||
ret->backends.assign(backends, backends + n_backends);
|
||||
ret->dev_ids.reserve(n_backends);
|
||||
for (size_t i = 0; i < n_backends; i++) {
|
||||
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
|
||||
}
|
||||
|
||||
const char * env = getenv("GGML_CUDA_ALLREDUCE");
|
||||
if (!env) {
|
||||
// Platform default: Linux uses NCCL, otherwise (generally Windows) internal
|
||||
#if defined(__linux__)
|
||||
ggml_backend_cuda_comm_init_nccl(ret);
|
||||
#else
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
#endif // defined(__linux__)
|
||||
} else {
|
||||
std::string env_str(env);
|
||||
if (env_str == "nccl") {
|
||||
ggml_backend_cuda_comm_init_nccl(ret);
|
||||
} else if (env_str == "internal") {
|
||||
ggml_backend_cuda_comm_init_internal(ret);
|
||||
} else if (env_str == "none") {
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
} else {
|
||||
GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
|
||||
ggml_backend_cuda_comm_init_none(ret);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Top-level dispatch -- calls the function pointer chosen by comm_init.
|
||||
// Returns false to let the meta-backend's butterfly run.
|
||||
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
|
||||
if (comm_ctx_v == nullptr) {
|
||||
return false;
|
||||
}
|
||||
auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
|
||||
return comm_ctx->try_allreduce(comm_ctx, tensors);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
|
||||
|
||||
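Usage note (not part of the diff, derived from the init chain above): the mode can be pinned at runtime with the GGML_CUDA_ALLREDUCE environment variable, which accepts "nccl", "internal", or "none". For example, setting GGML_CUDA_ALLREDUCE=internal forces the copy-engine pipeline even when NCCL is compiled in; "none" or an unrecognised value leaves AllReduce to the meta backend's butterfly implementation; with no value set, Linux defaults to NCCL and other platforms to the internal pipeline.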
@@ -135,7 +135,11 @@ endif()

if (GGML_SYCL_TARGET STREQUAL "INTEL")
    add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
    target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
    if (NOT GGML_SYCL_DEVICE_ARCH)
        target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
    else()
        message(STATUS "Skipping -ze-intel-greater-than-4GB-buffer-required for spir64_gen AOT")
    endif()

    # Link against Intel oneMKL
    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@@ -160,7 +164,15 @@ if (GGML_SYCL_HOST_MEM_FALLBACK)
endif()

if (GGML_SYCL_DEVICE_ARCH)
    target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
    target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
    message(STATUS "GGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} (AOT via spir64_gen)")
    target_compile_options(
        ggml-sycl PRIVATE
        -fsycl-targets=spir64_gen
        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
    )
    target_link_options(
        ggml-sycl PRIVATE
        -fsycl-targets=spir64_gen
        "SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
    )
endif()
@@ -25,6 +25,7 @@
|
||||
#include "presets.hpp"
|
||||
#include "type.hpp"
|
||||
#include "sycl_hw.hpp"
|
||||
#include "fattn-buffers.hpp"
|
||||
|
||||
namespace syclexp = sycl::ext::oneapi::experimental;
|
||||
|
||||
@@ -404,12 +405,16 @@ struct ggml_backend_sycl_context {
|
||||
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
|
||||
std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
|
||||
|
||||
std::unique_ptr<ggml_sycl_fattn_kv_buffers> fattn_bufs[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
|
||||
|
||||
static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
|
||||
|
||||
static std::unique_ptr<ggml_sycl_fattn_kv_buffers> new_fattn_kv_buffers(queue_ptr qptr, int device);
|
||||
|
||||
ggml_sycl_pool & pool(int device) {
|
||||
if (pools[device] == nullptr) {
|
||||
pools[device] = new_pool_for_device(stream(device,0), device);
|
||||
@@ -421,6 +426,17 @@ struct ggml_backend_sycl_context {
|
||||
return pool(device);
|
||||
}
|
||||
|
||||
ggml_sycl_fattn_kv_buffers & fattn_buffers(int device) {
|
||||
if (fattn_bufs[device] == nullptr) {
|
||||
fattn_bufs[device] = new_fattn_kv_buffers(stream(device, 0), device);
|
||||
}
|
||||
return *fattn_bufs[device];
|
||||
}
|
||||
|
||||
ggml_sycl_fattn_kv_buffers & fattn_buffers() {
|
||||
return fattn_buffers(device);
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_GRAPH
|
||||
std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
|
||||
#endif
|
||||
|
||||
@@ -252,6 +252,23 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q5_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
|
||||
const int64_t nb = k / QK_K;
|
||||
|
||||
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(K_SCALE_SIZE), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_q5_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
|
||||
dpct::queue_ptr stream) {
|
||||
@@ -643,7 +660,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
return dequantize_row_q4_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_sycl;
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q5_K_sycl_reorder;
|
||||
} else {
|
||||
return dequantize_row_q5_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q6_K:
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q6_K_sycl_reorder;
|
||||
@@ -718,7 +739,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
return dequantize_row_q4_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q5_K:
|
||||
return dequantize_row_q5_K_sycl;
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q5_K_sycl_reorder;
|
||||
} else {
|
||||
return dequantize_row_q5_K_sycl;
|
||||
}
|
||||
case GGML_TYPE_Q6_K:
|
||||
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
return dequantize_row_q6_K_sycl_reorder;
|
||||
|
||||
@@ -537,6 +537,63 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_block_q5_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
uint8_t * scales_local, const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
|
||||
const int64_t ib = item_ct1.get_group(2);
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 64 threads
|
||||
const int64_t tid = item_ct1.get_local_id(2);
|
||||
const int64_t il = tid / 16; // 0...3
|
||||
const int64_t ir = tid % 16; // 0...15
|
||||
const int64_t is = 2 * il;
|
||||
|
||||
dst_t * y = yy + ib * QK_K + 64 * il + 2 * ir;
|
||||
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vx);
|
||||
|
||||
// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales (K_SCALE_SIZE per block)] [dm (half2 per block)]
|
||||
const size_t qs_offset = ib * (QK_K / 2);
|
||||
const size_t qh_offset = n_blocks * (QK_K / 2) + ib * (QK_K / 8);
|
||||
const size_t scales_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + ib * K_SCALE_SIZE;
|
||||
const size_t dm_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + n_blocks * K_SCALE_SIZE + ib * sizeof(ggml_half2);
|
||||
|
||||
const uint8_t * qs_ptr = base + qs_offset;
|
||||
const uint8_t * qh_ptr = base + qh_offset;
|
||||
const uint8_t * scales_ptr = base + scales_offset;
|
||||
const ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
|
||||
|
||||
const float dall = dm_values.x();
|
||||
const float dmin = dm_values.y();
|
||||
|
||||
const uint8_t * ql = qs_ptr + 32 * il + 2 * ir;
|
||||
const uint8_t * qh = qh_ptr + 2 * ir;
|
||||
|
||||
if (tid < K_SCALE_SIZE) {
|
||||
scales_local[tid] = scales_ptr[tid];
|
||||
}
|
||||
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
uint8_t sc, m;
|
||||
get_scale_min_k4(is + 0, scales_local, sc, m);
|
||||
const float d1 = dall * sc; const float m1 = dmin * m;
|
||||
get_scale_min_k4(is + 1, scales_local, sc, m);
|
||||
const float d2 = dall * sc; const float m2 = dmin * m;
|
||||
|
||||
uint8_t hm = 1 << (2 * il);
|
||||
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
|
||||
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
|
||||
hm <<= 1;
|
||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
#else
|
||||
GGML_UNUSED(ib); GGML_UNUSED(tid); GGML_UNUSED(yy); GGML_UNUSED(scales_local); GGML_UNUSED(n_blocks);
|
||||
GGML_ABORT("Q5_K reorder dequantize not supported for QK_K != 256");
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
|
||||
ggml/src/ggml-sycl/fattn-buffers.cpp (new file, 56 lines)
@@ -0,0 +1,56 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2025 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
sycl::half * ggml_sycl_fattn_kv_buffers::kv_buffer::ensure_half(size_t n_elems) {
|
||||
const size_t need_bytes = n_elems * sizeof(sycl::half);
|
||||
|
||||
if (capacity >= need_bytes) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
if (ptr) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(qptr->wait()));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
|
||||
ptr = nullptr;
|
||||
capacity = 0;
|
||||
}
|
||||
|
||||
size_t cap = 0;
|
||||
while (cap < need_bytes) {
|
||||
cap += CHUNK_SIZE;
|
||||
}
|
||||
|
||||
void * dev_ptr;
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(dev_ptr = sycl::malloc_device(
|
||||
cap, *qptr)));
|
||||
|
||||
if (!dev_ptr) {
|
||||
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, cap);
|
||||
GGML_ABORT("fattn buffer alloc failed");
|
||||
}
|
||||
|
||||
ptr = static_cast<sycl::half *>(dev_ptr);
|
||||
capacity = cap;
|
||||
return ptr;
|
||||
}
|
||||
|
||||
ggml_sycl_fattn_kv_buffers::kv_buffer::~kv_buffer() {
|
||||
#ifdef DEBUG_SYCL_POOL
|
||||
GGML_LOG_INFO("ggml_sycl_fattn_kv_buffer[%d]: %.2f MiB\n", device, capacity / 1024.0 / 1024.0);
|
||||
#endif
|
||||
if (ptr) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
|
||||
}
|
||||
}
|
||||
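Note (not part of the diff): to make the growth policy in ensure_half concrete, here is a standalone sketch of the same round-up arithmetic; the element counts are made-up examples, CHUNK_SIZE is the 16 MiB constant declared in fattn-buffers.hpp below, and short is used as a 2-byte stand-in for sycl::half so the sketch compiles without SYCL headers.

// --- illustrative sketch, not from the source tree ---
// Capacity grows in whole 16 MiB chunks, so repeated small increases in KV
// length reuse the same allocation instead of reallocating on every call.
#include <cstddef>
#include <cstdio>

static size_t rounded_capacity(size_t n_elems) {
    constexpr size_t chunk_size = 16ull << 20;           // 16 MiB, as in fattn-buffers.hpp
    const size_t need_bytes = n_elems * sizeof(short);   // 2-byte stand-in for sycl::half

    size_t cap = 0;
    while (cap < need_bytes) {
        cap += chunk_size;
    }
    return cap;
}

int main() {
    // 1M half elements -> 2 MiB needed -> one 16 MiB chunk.
    std::printf("%zu\n", rounded_capacity(1u << 20));    // 16777216
    // 10M half elements -> ~20 MiB needed -> two chunks, 32 MiB.
    std::printf("%zu\n", rounded_capacity(10u << 20));   // 33554432
    return 0;
}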
ggml/src/ggml-sycl/fattn-buffers.hpp (new file, 63 lines)
@@ -0,0 +1,63 @@
|
||||
//
|
||||
// MIT license
|
||||
// Copyright (C) 2025 Intel Corporation
|
||||
// SPDX-License-Identifier: MIT
|
||||
//
|
||||
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
|
||||
#ifndef GGML_SYCL_FATTN_BUFFERS_HPP
|
||||
#define GGML_SYCL_FATTN_BUFFERS_HPP
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
|
||||
typedef sycl::queue *queue_ptr;
|
||||
|
||||
struct ggml_sycl_fattn_kv_buffers {
|
||||
// buffers grow in chunks of this size
|
||||
static constexpr size_t CHUNK_SIZE = 16ull << 20; // 16 MiB
|
||||
|
||||
struct kv_buffer {
|
||||
kv_buffer(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
|
||||
~kv_buffer();
|
||||
|
||||
kv_buffer(const kv_buffer &) = delete;
|
||||
kv_buffer & operator=(const kv_buffer &) = delete;
|
||||
|
||||
sycl::half * ensure_half(size_t n_elems);
|
||||
|
||||
private:
|
||||
sycl::half * ptr = nullptr;
|
||||
size_t capacity = 0;
|
||||
queue_ptr qptr = nullptr;
|
||||
[[maybe_unused]] int device = 0;
|
||||
};
|
||||
|
||||
kv_buffer K;
|
||||
kv_buffer V;
|
||||
|
||||
ggml_sycl_fattn_kv_buffers(queue_ptr qptr, int device) : K(qptr, device), V(qptr, device) {}
|
||||
|
||||
ggml_sycl_fattn_kv_buffers(const ggml_sycl_fattn_kv_buffers &) = delete;
|
||||
ggml_sycl_fattn_kv_buffers & operator=(const ggml_sycl_fattn_kv_buffers &) = delete;
|
||||
};
|
||||
|
||||
/**
|
||||
* Imitates `ggml_sycl_pool_alloc` to keep the code calling alloc unchanged.
|
||||
*/
|
||||
struct ggml_sycl_fattn_alloc {
|
||||
ggml_sycl_fattn_kv_buffers::kv_buffer & buf;
|
||||
sycl::half * ptr = nullptr;
|
||||
|
||||
explicit ggml_sycl_fattn_alloc(ggml_sycl_fattn_kv_buffers::kv_buffer & buf_) : buf(buf_) {}
|
||||
|
||||
sycl::half * alloc(size_t n_elems) {
|
||||
ptr = buf.ensure_half(n_elems);
|
||||
return ptr;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "common.hpp"
|
||||
#include "convert.hpp"
|
||||
#include "vecdotq.hpp"
|
||||
#include "fattn-buffers.hpp"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
@@ -918,12 +919,13 @@ void launch_fattn(
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
|
||||
ggml_sycl_pool & pool = ctx.pool();
|
||||
ggml_sycl_fattn_kv_buffers & fbuf = ctx.fattn_buffers();
|
||||
dpct::queue_ptr main_stream = ctx.stream();
|
||||
const int id = ggml_sycl_get_device();
|
||||
const int nsm = ggml_sycl_info().devices[id].nsm;
|
||||
|
||||
ggml_sycl_pool_alloc<sycl::half> K_f16(pool);
|
||||
ggml_sycl_pool_alloc<sycl::half> V_f16(pool);
|
||||
ggml_sycl_fattn_alloc K_f16(fbuf.K);
|
||||
ggml_sycl_fattn_alloc V_f16(fbuf.V);
|
||||
ggml_sycl_pool_alloc<int> KV_max(pool);
|
||||
ggml_sycl_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_sycl_pool_alloc<sycl::float2> dst_tmp_meta(pool);
|
||||
|
||||
@@ -183,6 +183,10 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
||||
get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data,
|
||||
src1_i32, (float *)dst->data, ctx.stream());
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::ext::oneapi::bfloat16 *)dst->src[0]->data,
|
||||
src1_i32, (float *)dst->data, ctx.stream());
|
||||
break;
|
||||
case GGML_TYPE_F32:
|
||||
get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
|
||||
src1_i32, (float *)dst->data, ctx.stream());
|
||||
|
||||
@@ -1286,6 +1286,23 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
|
||||
|
||||
~ggml_sycl_pool_leg() {
|
||||
#ifdef DEBUG_SYCL_POOL
|
||||
int n_cached = 0;
|
||||
size_t bytes_cached = 0;
|
||||
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
||||
if (buffer_pool[i].ptr != nullptr) {
|
||||
++n_cached;
|
||||
bytes_cached += buffer_pool[i].size;
|
||||
}
|
||||
}
|
||||
GGML_LOG_INFO("%s: %d buffers, cached = %.2f MiB\n", __func__,
|
||||
n_cached, bytes_cached / 1024.0 / 1024.0);
|
||||
const auto slots = format_slots_in_alloc_order();
|
||||
if (!slots.empty()) {
|
||||
GGML_LOG_INFO("%s: slots MiB: %s\n", __func__, slots.c_str());
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
||||
ggml_sycl_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
@@ -1296,6 +1313,26 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
GGML_ASSERT(pool_size == 0);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_SYCL_POOL
|
||||
std::string format_slots_in_alloc_order() const {
|
||||
std::string line;
|
||||
char buf[32];
|
||||
bool first = true;
|
||||
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
||||
if (buffer_pool[i].ptr == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (!first) {
|
||||
line += '/';
|
||||
}
|
||||
first = false;
|
||||
snprintf(buf, sizeof(buf), "%.2f", buffer_pool[i].size / 1024.0 / 1024.0);
|
||||
line += buf;
|
||||
}
|
||||
return line;
|
||||
}
|
||||
#endif
|
||||
|
||||
void * alloc(size_t size, size_t * actual_size) override {
|
||||
#ifdef DEBUG_sycl_MALLOC
|
||||
int nnz = 0;
|
||||
@@ -1459,6 +1496,10 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
|
||||
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_leg(qptr, device));
|
||||
}
|
||||
|
||||
std::unique_ptr<ggml_sycl_fattn_kv_buffers> ggml_backend_sycl_context::new_fattn_kv_buffers(queue_ptr qptr, int device) {
|
||||
return std::unique_ptr<ggml_sycl_fattn_kv_buffers>(new ggml_sycl_fattn_kv_buffers(qptr, device));
|
||||
}
|
||||
|
||||
// TBD pool with virtual memory management
|
||||
// struct ggml_sycl_pool_vmm : public ggml_sycl_pool
|
||||
|
||||
@@ -3303,6 +3344,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return !g_ggml_sycl_prioritize_dmmv;
|
||||
default:
|
||||
@@ -3325,6 +3367,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
return true;
|
||||
default:
|
||||
@@ -3541,6 +3584,54 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool reorder_qw_q5_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(size % sizeof(block_q5_K) == 0);
|
||||
GGML_ASSERT(offset % sizeof(block_q5_K) == 0);
|
||||
|
||||
const int nblocks = size / sizeof(block_q5_K);
|
||||
|
||||
sycl_reorder_temp_buffer tmp(stream, size);
|
||||
if (!tmp) {
|
||||
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, size);
|
||||
return false;
|
||||
}
|
||||
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
|
||||
|
||||
sycl::event copy_event;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
copy_event.wait();
|
||||
}
|
||||
|
||||
auto * qs_ptr = data_device;
|
||||
auto * qh_ptr = qs_ptr + (QK_K / 2) * nblocks;
|
||||
auto * scales_ptr = qh_ptr + (QK_K / 8) * nblocks;
|
||||
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
|
||||
|
||||
auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
|
||||
const block_q5_K * x = (const block_q5_K *) tmp_buf;
|
||||
const int ib = i;
|
||||
|
||||
for (int j = 0; j < QK_K / 2; ++j) {
|
||||
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < QK_K / 8; ++j) {
|
||||
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
|
||||
}
|
||||
|
||||
for (int j = 0; j < K_SCALE_SIZE; ++j) {
|
||||
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
|
||||
}
|
||||
|
||||
dm_ptr[ib] = x[ib].dm;
|
||||
});
|
||||
if (!g_ggml_sycl_use_async_mem_op) {
|
||||
reorder_event.wait_and_throw();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(size % sizeof(block_q6_K) == 0);
|
||||
GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
|
||||
@@ -3607,6 +3698,8 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
|
||||
return reorder_qw_q8_0(data_device, ncols, nrows, size, 0, stream);
|
||||
case GGML_TYPE_Q4_K:
|
||||
return reorder_qw_q4_k(data_device, size, 0, stream);
|
||||
case GGML_TYPE_Q5_K:
|
||||
return reorder_qw_q5_k(data_device, size, 0, stream);
|
||||
case GGML_TYPE_Q6_K:
|
||||
return reorder_qw_q6_k(data_device, size, 0, stream);
|
||||
default:
|
||||
@@ -4922,6 +5015,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
{
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
|
||||
@@ -839,6 +839,26 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
}
|
||||
}
|
||||
|
||||
static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
|
||||
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
|
||||
constexpr size_t num_subgroups = 16;
|
||||
GGML_ASSERT(block_num_y % num_subgroups == 0);
|
||||
|
||||
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
|
||||
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
|
||||
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
|
||||
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(vx, vy, dst, ncols,
|
||||
nrows, nd_item);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
|
||||
const int nrows, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
@@ -1125,6 +1145,7 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q8_0_q8_1_sycl\n");
|
||||
reorder_mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q8_0_q8_1_sycl\n");
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
@@ -1145,7 +1166,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
|
||||
GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q5_k_q8_1_sycl\n");
|
||||
reorder_mul_mat_vec_q5_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
} else {
|
||||
GGML_SYCL_DEBUG("Calling mul_mat_vec_q5_K_q8_1_sycl\n");
|
||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
|
||||
|
||||
@@ -79,6 +79,31 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
};
|
||||
|
||||
template <> struct block_q_t<GGML_TYPE_Q5_K> {
|
||||
struct traits {
|
||||
static constexpr uint32_t qk = QK_K;
|
||||
static constexpr uint32_t qi = QI5_K;
|
||||
static constexpr uint32_t qr = QR5_K;
|
||||
static constexpr uint32_t vdr_mmvq = 2;
|
||||
};
|
||||
|
||||
// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales] [dm]
|
||||
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
|
||||
auto qs_offset = block_index * (QK_K / 2);
|
||||
auto qh_offset = n_blocks * (QK_K / 2) + block_index * (QK_K / 8);
|
||||
return { qs_offset, qh_offset };
|
||||
}
|
||||
|
||||
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
|
||||
auto nblocks = (nrows * (ncols / QK_K));
|
||||
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 8);
|
||||
return { total_qs_bytes + block_index * K_SCALE_SIZE,
|
||||
total_qs_bytes + nblocks * K_SCALE_SIZE + block_index * sizeof(ggml_half2) };
|
||||
}
|
||||
|
||||
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
|
||||
};
|
||||
|
||||
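Note (not part of the diff): as a cross-check of the reordered Q5_K layout above in the QK_K == 256 case, a small sketch recomputes get_block_offset / get_d_offset by hand for one concrete tensor, using the standard K-quant constants (QK_K = 256, K_SCALE_SIZE = 12, 4-byte ggml_half2); the shape is an arbitrary example.

// --- illustrative sketch, not from the source tree ---
// Verifies the [qs (QK_K/2)] [qh (QK_K/8)] [scales] [dm] ordering by hand.
#include <cstdio>

int main() {
    constexpr int QK_K_         = 256;
    constexpr int K_SCALE_SIZE_ = 12;
    constexpr int HALF2_BYTES   = 4;

    const int nrows = 2, ncols = 512;               // example shape: 4 blocks total
    const int n_blocks = nrows * (ncols / QK_K_);   // = 4

    const int ib = 3;                                                         // last block
    const int qs_offset = ib * (QK_K_ / 2);                                   // 3 * 128 = 384
    const int qh_offset = n_blocks * (QK_K_ / 2) + ib * (QK_K_ / 8);          // 512 + 96 = 608
    const int total_qs  = n_blocks * (QK_K_ / 2) + n_blocks * (QK_K_ / 8);    // 512 + 128 = 640
    const int sc_offset = total_qs + ib * K_SCALE_SIZE_;                      // 640 + 36 = 676
    const int dm_offset = total_qs + n_blocks * K_SCALE_SIZE_ + ib * HALF2_BYTES; // 640 + 48 + 12 = 700

    std::printf("qs=%d qh=%d scales=%d dm=%d\n", qs_offset, qh_offset, sc_offset, dm_offset);
    return 0;
}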
template <> struct block_q_t<GGML_TYPE_Q6_K> {
|
||||
struct traits {
|
||||
static constexpr uint32_t qk = QK_K;
|
||||
|
||||
@@ -357,38 +357,31 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q8_0> {
|
||||
using q8_0_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q8_0>;
|
||||
using q8_0_traits = typename q8_0_block::traits;
|
||||
|
||||
__dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int * v, const int * u, const float & d8_0, const sycl::half2 & ds8) {
|
||||
int sumi = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
// Q8_0 values are signed int8, no nibble extraction needed
|
||||
// Direct dp4a: each int packs 4 int8 values
|
||||
sumi = dpct::dp4a(v[i], u[i], sumi);
|
||||
}
|
||||
|
||||
const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
|
||||
|
||||
// Q8_0 has no bias term (values are signed), so just scale
|
||||
return d8_0 * sumi * ds8f.x();
|
||||
}
|
||||
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
||||
const sycl::half2 * q8_1_ds, const int & iqs) {
|
||||
const int8_t * bq8_0 = static_cast<const int8_t *>(vbq) + ibx_offset.first;
|
||||
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
|
||||
int v[q8_0_traits::vdr_mmvq];
|
||||
int u[q8_0_traits::vdr_mmvq];
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
||||
const int8_t * qs = reinterpret_cast<const int8_t *>(base + ibx_offset.first);
|
||||
const ggml_half d = *reinterpret_cast<const ggml_half *>(base + d_offset.first);
|
||||
|
||||
int v[q8_0_traits::vdr_mmvq];
|
||||
int u[q8_0_traits::vdr_mmvq];
|
||||
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
v[i] = get_int_from_int8(bq8_0, iqs + i);
|
||||
v[i] = get_int_from_int8(qs, iqs + i);
|
||||
u[i] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
|
||||
}
|
||||
|
||||
return vec_dot_q8_0_q8_1_impl(v, u, d, *q8_1_ds);
|
||||
};
|
||||
int sumi = 0;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < q8_0_traits::vdr_mmvq; ++i) {
|
||||
sumi = dpct::dp4a(v[i], u[i], sumi);
|
||||
}
|
||||
|
||||
const sycl::half2 ds_values = *q8_1_ds;
|
||||
return static_cast<float>(d) * static_cast<float>(ds_values[0]) * sumi;
|
||||
}
|
||||
};
|
||||
|
||||
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
|
||||
@@ -481,6 +474,65 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K> {
|
||||
static constexpr ggml_type gtype = GGML_TYPE_Q5_K;
|
||||
|
||||
using q5_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q5_K>;
|
||||
using q5_k_traits = typename q5_k_block::traits;
|
||||
|
||||
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
|
||||
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
|
||||
const sycl::half2 * q8_1_ds, const int & iqs) {
|
||||
const uint8_t * base = static_cast<const uint8_t *>(vbq);
|
||||
const uint8_t * qs = base + ibx_offset.first; // low 4 bits
|
||||
const uint8_t * qh_base = base + ibx_offset.second; // high bit
|
||||
const uint8_t * scs = base + d_offset.first;
|
||||
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
|
||||
|
||||
const int bq8_offset = QR5_K * ((iqs / 2) / (QI8_1 / 2));
|
||||
const int * ql_ptr = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
|
||||
const int * qh_ptr = (const int *) (qh_base + 4 * ((iqs / 2) % 4));
|
||||
const uint16_t * scales = (const uint16_t *) scs;
|
||||
|
||||
int vl[2];
|
||||
int vh[2];
|
||||
int u[2 * QR5_K];
|
||||
float d8[QR5_K];
|
||||
|
||||
vl[0] = ql_ptr[0];
|
||||
vl[1] = ql_ptr[4];
|
||||
|
||||
vh[0] = qh_ptr[0] >> bq8_offset;
|
||||
vh[1] = qh_ptr[4] >> bq8_offset;
|
||||
|
||||
uint16_t aux[2];
|
||||
const int j = (QR5_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
|
||||
if (j < 2) {
|
||||
aux[0] = scales[j + 0] & 0x3f3f;
|
||||
aux[1] = scales[j + 2] & 0x3f3f;
|
||||
} else {
|
||||
aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
|
||||
aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
|
||||
}
|
||||
|
||||
const uint8_t * sc = (const uint8_t *) aux;
|
||||
const uint8_t * m = sc + 2;
|
||||
|
||||
for (int i = 0; i < QR5_K; ++i) {
|
||||
const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
|
||||
sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
|
||||
|
||||
d8[i] = ds_values[0];
|
||||
|
||||
const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
|
||||
u[2 * i + 0] = q8[0];
|
||||
u[2 * i + 1] = q8[4];
|
||||
}
|
||||
|
||||
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, *dms, d8);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
|
||||
static constexpr ggml_type gtype = GGML_TYPE_Q6_K;
|
||||
|
||||
|
||||
@@ -1 +1 @@
ac6f7b44f60fde0091f0b3d99afde48f8c99b13a
628249b398293fc8d2fa81a449ae2920a02c6523
@@ -1131,10 +1131,6 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
    }

    // for differentiating model types
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // for classifier models
    ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
    if (!classifier_labels.empty()) {
@@ -503,6 +503,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
                break;
            case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE:
                // Sarvam uses SPM-style BPE (same shape as Gemma4): spaces replaced with U+2581
                // by the normalizer, BPE merges over the whole text on raw UTF-8.
                regex_exprs = {
                    "[^\\n]+|[\\n]+",
                };
                byte_encode = false;
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
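Note (not part of the diff): a small illustration of what the "[^\\n]+|[\\n]+" pre-tokenizer split above does -- it only separates newline runs from everything else, leaving the rest to the SPM-style BPE merges. The sketch uses std::regex on a toy string rather than the tokenizer's own matcher.

// --- illustrative sketch, not from the source tree ---
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string text = "hello world\n\nnext line";
    const std::regex  re("[^\\n]+|[\\n]+");

    for (auto it = std::sregex_iterator(text.begin(), text.end(), re);
         it != std::sregex_iterator(); ++it) {
        std::cout << "[" << it->str() << "]\n";
    }
    // three matches: "hello world", the two-newline run as one chunk, "next line"
    return 0;
}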
@@ -2005,6 +2013,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            tokenizer_pre == "gemma4") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
            escape_whitespaces = true;
        } else if (
            tokenizer_pre == "sarvam-moe") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE;
            escape_whitespaces = true;
            clean_spaces = false;
        } else if (
            tokenizer_pre == "jina-v1-en" ||
            tokenizer_pre == "jina-v2-code" ||
@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
    LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
    LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
    LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
};

struct LLM_KV;
@@ -1,7 +1,8 @@
#include "models.h"

void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
    const auto n_vocab = vocab.n_tokens();
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
@@ -1,7 +1,8 @@
#include "models.h"

void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
    const auto n_vocab = vocab.n_tokens();
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
vendor/cpp-httplib/CMakeLists.txt (vendored, 2 lines changed)
@@ -41,7 +41,7 @@ if (LLAMA_BUILD_BORINGSSL)
    set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")

    set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
    set(BORINGSSL_VERSION "0.20260413.0" CACHE STRING "BoringSSL version")
    set(BORINGSSL_VERSION "0.20260508.0" CACHE STRING "BoringSSL version")

    message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")