Compare commits

...

70 Commits

Author SHA1 Message Date
Georgi Gerganov
f49c636db0 llama-eval : protect dump() with lock for thread safety
Assisted-by: llama.cpp:local pi
2026-05-10 21:52:43 +03:00
Georgi Gerganov
d5165e8f2e llama-eval : require --grader-model or --model when using --grader-type llm
Assisted-by: llama.cpp:local pi
2026-05-10 21:49:58 +03:00
Georgi Gerganov
85c6aa006d llama-server-simulator : fix comment - Dice coefficient, not Levenshtein
Assisted-by: llama.cpp:local pi
2026-05-10 21:49:02 +03:00
Georgi Gerganov
e5ac6d1da6 llama-eval : track model name in eval state and verify on resume
- Store model_name in EvalState and JSON output
- Display model in HTML summary table
- Verify --model matches stored model when resuming

Assisted-by: llama.cpp:local pi
2026-05-10 21:43:35 +03:00
Georgi Gerganov
094554dbcc llama-eval : update README with PR link and quick-start examples
Assisted-by: llama.cpp:local pi
2026-05-10 21:22:48 +03:00
Georgi Gerganov
f64d56bcd8 llama-server-simulator : replace Flask with stdlib http.server
- Use HTTPServer + BaseHTTPRequestHandler instead of Flask
- RequestHandler handles POST /v1/chat/completions
- Server runs in daemon thread with clean Ctrl+C shutdown
- Remove flask and unused asdict imports

Assisted-by: llama.cpp:local pi
2026-05-10 20:47:08 +03:00
ggerganov
43f14a0a46 llama-eval : support multiple evaluation endpoints with dynamic task distribution
- Add ServerConfig dataclass (url, threads, name)
- Accept comma-separated --server, --threads, --server-name CLI args
- Dynamic shared-queue task distribution across servers (fast servers do more work)
- One ThreadPoolExecutor per server, workers pull from shared Queue
- Track which server processed each task (server_name in results)
- Thread-safe EvalState with threading.Lock for concurrent mutations
- Server column in HTML report and console output
- Backward compatible: single server works as before

Assisted-by: llama.cpp:local pi
2026-05-10 20:42:23 +03:00
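
A minimal sketch of the shared-queue distribution described above (fast servers naturally pull more tasks). The ServerConfig fields match the commit message; the worker/run_task names are illustrative, not the actual llama-eval.py code:

```python
import queue, threading
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass

@dataclass
class ServerConfig:
    url: str
    threads: int
    name: str

def run_eval(servers: list[ServerConfig], tasks: list[dict], run_task) -> list[dict]:
    todo: "queue.Queue[dict]" = queue.Queue()
    for t in tasks:
        todo.put(t)

    results: list[dict] = []
    lock = threading.Lock()  # protects shared state mutated from multiple workers

    def worker(server: ServerConfig) -> None:
        while True:
            try:
                task = todo.get_nowait()  # dynamic distribution: whichever worker is free pulls next
            except queue.Empty:
                return
            res = run_task(server.url, task)
            res["server_name"] = server.name  # track which server processed the task
            with lock:
                results.append(res)

    # one executor per server, sized by that server's --threads value
    executors = [ThreadPoolExecutor(max_workers=s.threads) for s in servers]
    futures = [ex.submit(worker, s) for s, ex in zip(servers, executors) for _ in range(s.threads)]
    for f in futures:
        f.result()
    for ex in executors:
        ex.shutdown()
    return results
```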
Georgi Gerganov
d26b1ffcc9 llama-eval : rename display, escaped, and count variables to use prefix convention
- _display suffix → display_ prefix (answer, tokens, tps, t_gen)
- _escaped suffix → escaped_ prefix (response, prompt, reasoning)
- _count suffix → n_ prefix (correct, incorrect, pending)

Assisted-by: llama.cpp:local pi
2026-05-10 19:24:29 +03:00
Georgi Gerganov
9f10d8d195 llama-eval : add per-task generation time from server timings
Extract predicted_ms from the server timings response and store it as
t_gen_ms per task. Display in seconds with one decimal digit in console
progress, print_all_tasks, and HTML report.

Assisted-by: llama.cpp:local pi
2026-05-10 19:15:34 +03:00
Georgi Gerganov
4d5dedc569 llama-eval : add per-task generation speed from server timings
Extract predicted_per_second from the server timings response and store
it as tps_gen per task. Display in console progress, print_all_tasks,
and HTML report.

Assisted-by: llama.cpp:local pi
2026-05-10 19:05:20 +03:00
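
A minimal sketch of extracting both values from the server response, covering the two commits above. The timings keys are the ones llama-server reports; the task dict keys (t_gen_ms, tps_gen) just mirror the names in the commit messages:

```python
def record_timings(task: dict, response: dict) -> None:
    # llama-server attaches a "timings" object to its completion responses
    timings = response.get("timings", {})
    task["t_gen_ms"] = timings.get("predicted_ms")          # per-task generation time
    task["tps_gen"]  = timings.get("predicted_per_second")  # per-task generation speed
    if task["t_gen_ms"] is not None and task["tps_gen"] is not None:
        print(f"gen time: {task['t_gen_ms'] / 1000.0:.1f} s, speed: {task['tps_gen']:.1f} t/s")
```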
Georgi Gerganov
81a65cf035 eval : add Wilson score confidence interval to results
Compute 95% CI on-the-fly from completed cases. Displayed in
terminal output, HTML report, and JSON state.
2026-05-10 18:46:36 +03:00
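
For reference, a minimal sketch of the 95% Wilson score interval computed from the completed cases (standard formula; the function name is illustrative, not the one used in llama-eval.py):

```python
import math

def wilson_ci(n_correct: int, n_total: int, z: float = 1.96) -> tuple[float, float]:
    # 95% Wilson score interval for a binomial proportion (z = 1.96)
    if n_total == 0:
        return 0.0, 1.0
    p = n_correct / n_total
    denom  = 1.0 + z * z / n_total
    center = (p + z * z / (2 * n_total)) / denom
    margin = (z / denom) * math.sqrt(p * (1.0 - p) / n_total + z * z / (4 * n_total * n_total))
    return max(0.0, center - margin), min(1.0, center + margin)

# e.g. 42 correct out of 60 cases -> roughly (0.57, 0.80)
```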
Georgi Gerganov
7d433f767b eval : unify "judge" terminology to "grader"
Replace all occurrences of "judge" with "grader" for consistency
across the codebase (CLI args, Grader class fields, help text).

Assisted-by: llama.cpp:local pi
2026-05-10 18:23:28 +03:00
Georgi Gerganov
633a68d6c2 remove junk 2026-05-10 18:13:50 +03:00
Georgi Gerganov
e0a2cf48ca track total time 2026-05-10 18:13:50 +03:00
Georgi Gerganov
bad9565a1e refactor 2026-05-10 18:13:50 +03:00
Georgi Gerganov
752b703a5e reasoning and error handling 2026-05-10 18:13:50 +03:00
Georgi Gerganov
fc571f3a1e add tokens 2026-05-10 18:13:50 +03:00
Georgi Gerganov
6797d80dff store full response 2026-05-10 18:13:50 +03:00
Georgi Gerganov
3649793811 add html 2026-05-10 18:13:50 +03:00
Georgi Gerganov
7e8c88c5e0 fix prompts 2026-05-10 18:13:49 +03:00
Georgi Gerganov
2e0b6766f3 simplify 2026-05-10 18:13:49 +03:00
Georgi Gerganov
f95f4dd1ca fix counts 2026-05-10 18:13:49 +03:00
Georgi Gerganov
095c8ab655 cleanup 2026-05-10 18:13:49 +03:00
Georgi Gerganov
d830acacc5 resume eval 2026-05-10 18:13:49 +03:00
Georgi Gerganov
f35b10f0a9 ignore errors 2026-05-10 18:13:49 +03:00
Georgi Gerganov
802d85e26e add AGENTS.md 2026-05-10 18:13:49 +03:00
Georgi Gerganov
91bd92c6b6 cleanup 2026-05-10 18:13:48 +03:00
Georgi Gerganov
f20b5a72cf datasets : fix aime2025 2026-05-10 18:13:48 +03:00
Georgi Gerganov
122dfe3eab grade : improve regex + logs 2026-05-10 18:13:48 +03:00
Georgi Gerganov
8b94ab4f4a grader : update prompt 2026-05-10 18:13:48 +03:00
Georgi Gerganov
f99d77f3bd datasets : add aime2025 2026-05-10 18:13:48 +03:00
Georgi Gerganov
55a7cf4a06 cont 2026-05-10 18:13:48 +03:00
Georgi Gerganov
6e7e1a5a63 grader : improve example answers 2026-05-10 18:13:48 +03:00
Georgi Gerganov
9f02fa6382 rename 2026-05-10 18:13:47 +03:00
Georgi Gerganov
e7b8646098 add gpqa + sampling + docs 2026-05-10 18:13:47 +03:00
Georgi Gerganov
55ce1b4e2f datasets : add gsm8k 2026-05-10 18:13:47 +03:00
Georgi Gerganov
abec77e068 remove old files 2026-05-10 18:13:47 +03:00
Georgi Gerganov
65e3c5a928 docs 2026-05-10 18:13:47 +03:00
Georgi Gerganov
4f176f6a4d improve grader 2026-05-10 18:13:47 +03:00
Georgi Gerganov
9578e83ac2 minor 2026-05-10 18:13:47 +03:00
Georgi Gerganov
530f38f9c3 eval : support multiple dataset runs 2026-05-10 18:13:46 +03:00
Georgi Gerganov
cda8cae01a sim : fix answer matching 2026-05-10 18:13:46 +03:00
Georgi Gerganov
64720e1e01 test : fix path 2026-05-10 18:13:46 +03:00
Georgi Gerganov
1a780f7c44 eval : add prompts 2026-05-10 18:13:46 +03:00
Georgi Gerganov
940364e4c9 eval : print progress 2026-05-10 18:13:46 +03:00
Georgi Gerganov
ee9b715eb6 examples: add task summary table to llama-eval-new.py 2026-05-10 18:13:46 +03:00
Georgi Gerganov
d639ee52ea docs: update llama-eval-discussion.md with threading and model parameter updates
- Add threading support implementation details
- Document ThreadPoolExecutor usage and thread safety
- Add model parameter implementation details
- Include testing results for both features
2026-05-10 18:13:46 +03:00
Georgi Gerganov
fb40d1a04a examples: add threading support and model parameter to llama-eval-new.py
- Add ThreadPoolExecutor for parallel request processing controlled by --threads
- Add --model argument to specify model name in request data
- Refactor process() to use thread-safe _process_single_case() method
- Update progress tracking to work with concurrent execution
2026-05-10 18:13:45 +03:00
Georgi Gerganov
2fe445cc60 docs: update llama-eval-discussion.md with session work summary 2026-05-10 18:13:45 +03:00
Georgi Gerganov
3732aea2df examples: use cached dataset path in simulator to avoid HF Hub requests 2026-05-10 18:13:45 +03:00
Georgi Gerganov
edc766c919 examples: use cached dataset path to avoid HF Hub requests 2026-05-10 18:13:45 +03:00
Georgi Gerganov
d7d2c22909 examples: remove HF_HUB_OFFLINE to allow dataset download 2026-05-10 18:13:45 +03:00
Georgi Gerganov
30ea5124de examples: use HF_HUB_OFFLINE to avoid HF Hub warnings 2026-05-10 18:13:45 +03:00
Georgi Gerganov
0ca458d892 examples: implement flexible grader system for answer validation
- Add Grader class supporting regex and CLI-based grading
- Implement built-in regex patterns for AIME, GSM8K, MMLU, HellaSwag, ARC, WinoGrande
- Add CLI grader interface: python script.py --answer <pred> --expected <gold>
- Add HF telemetry disable to avoid warnings
- Support exact match requirement for regex patterns
- Add 30-second timeout for CLI grader
- Handle both boxed and plain text formats for AIME answers
2026-05-10 18:13:45 +03:00
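
A minimal sketch of the two grading modes described above. The boxed-answer regex and the exit-code convention of the CLI grader are illustrative assumptions, not the exact patterns shipped in the tool:

```python
import re, subprocess

def grade_regex(response: str, expected: str) -> bool:
    # AIME-style: prefer a \boxed{...} answer, otherwise fall back to the last integer
    m = re.search(r"\\boxed\{([^}]*)\}", response)
    pred = m.group(1).strip() if m else (re.findall(r"-?\d+", response) or [""])[-1]
    return pred == expected.strip()  # exact match requirement

def grade_cli(grader_cmd: list[str], pred: str, expected: str) -> bool:
    # external grader: python script.py --answer <pred> --expected <gold>
    # (assumes exit code 0 means correct; 30-second timeout as in the commit message)
    proc = subprocess.run(grader_cmd + ["--answer", pred, "--expected", expected], timeout=30)
    return proc.returncode == 0
```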
Georgi Gerganov
de8eda468b docs: remove README.md from llama-eval 2026-05-10 18:13:44 +03:00
Georgi Gerganov
a2b96e0444 examples: add simplified llama-eval-new.py for AIME evaluation
- Create new simplified evaluation script focused only on AIME
- Implement EvalState and Processor dataclasses for structured state management
- Add real-time feedback showing correct/incorrect status per case
- Abstract grading interface for external grader support
- Use structured JSON output for eval state
- Apply HuggingFace dataset caching to avoid repeated downloads
- Remove Levenshtein matching - eval script only sends requests and validates answers
2026-05-10 18:13:44 +03:00
Georgi Gerganov
deed078654 docs: update llama-eval-discussion.md with session work summary
Add summary of llama-server-simulator implementation work including
features, testing results, technical decisions, and refactoring.
2026-05-10 18:13:44 +03:00
Georgi Gerganov
05b8425bd6 examples: refactor test-simulator.sh for better readability
Extract repeating question string into TEST_QUESTION variable and
create make_request() helper function to reduce code duplication.
Add proper error handling for error responses.
2026-05-10 18:13:44 +03:00
Georgi Gerganov
58bd57ba99 examples: add llama-server simulator for testing eval scripts
Add a standalone Python script that simulates a llama-server HTTP endpoint
for testing the eval script. The simulator:

- Implements /v1/chat/completions endpoint with OpenAI-compatible format
- Loads AIME dataset from HuggingFace with local caching
- Uses Levenshtein distance for intelligent question matching
- Supports configurable success rate for correct/wrong answer generation
- Provides debug logging for troubleshooting

Also includes test scripts and documentation for testing and understanding
the simulator functionality.
2026-05-10 18:13:44 +03:00
gatbontonpc
5cbe95b6e5 add checkpointing 2026-05-10 18:13:44 +03:00
gatbontonpc
c7f3ce25f5 Add readme 2026-05-10 18:13:44 +03:00
gatbontonpc
4db4497ca7 multi source llama-eval 2026-05-10 18:13:43 +03:00
gatbontonpc
db8b09d6e8 working llama-eval mc and math suite 2026-05-10 18:13:42 +03:00
Georgi Gerganov
0b047287fe sync : ggml 2026-05-10 17:00:11 +03:00
Georgi Gerganov
efbada936f ggml : bump version to 0.11.1 (ggml/1484) 2026-05-10 17:00:11 +03:00
scutler-nv
f3c3e0e9a0 internal AllReduce kernel for CUDA provider (#22299)
* ggml-cuda: add internal AllReduce provider for tensor parallelism

Introduces a NCCL-free AllReduce implementation for LLAMA_SPLIT_MODE_TENSOR
using a single-phase CUDA kernel that pipelines D2H copy, cross-GPU
handshake via pinned-memory volatile flags, and the reduction in one
kernel launch per GPU.

New files:
- ggml/src/ggml-cuda/comm.cuh        — ggml_cuda_allreduce_provider enum
- ggml/src/ggml-cuda/allreduce.cuh   — pipeline API declarations
- ggml/src/ggml-cuda/allreduce.cu    — kernel + pipeline init/dispatch

ggml-cuda.cu changes:
- ggml_backend_cuda_comm_context gains ar_pipeline field
- Provider selection via GGML_CUDA_ALLREDUCE env var ("nccl" / "internal")
- INTERNAL provider initialises the pipeline at comm_init time
- Dispatch routes to ggml_cuda_ar_allreduce(); falls back to meta-backend
  CPU reduce for unsupported sizes or GPU counts (> 2)

Current scope: 2 GPUs, FP32, tensors <= 256 KB. Notes in NOTES-allreduce.md.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* llama-bench: add --allreduce flag to select AllReduce provider

Adds --allreduce <auto|nccl|internal> to llama-bench (and via the shared
field pattern, consistent with other multi-value flags).  Useful for
isolating hangs or regressions in tensor-parallel mode: pass --allreduce nccl
to force NCCL and bypass the internal provider.

Also fixes ggml_cuda_select_allreduce_provider() to treat an empty
GGML_CUDA_ALLREDUCE env var the same as unset (avoids spurious warning when
llama-bench sets it to "" for the "auto" case).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* llama-bench: rename --allreduce to --reduction-provider / -rp

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* llama-bench: pass WARN/ERROR log messages through in non-verbose mode

The null log callback was silently dropping all messages. WARN and ERROR
should always be visible since they indicate legitimate issues (e.g. a
requested reduction provider not being available).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* cmake: improve NCCL detection for source-tree builds, add static/dynamic switch

FindNCCL.cmake now searches the cmake source-build layout used by the Windows
NCCL port (cmake/lib/Release for static, cmake/src/Release for dynamic import
lib) and also checks src/include for the generated nccl.h header.

New option GGML_CUDA_NCCL_STATIC (default OFF) selects static vs dynamic
linking and controls which paths and library names are searched.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: add AllReduce hang watchdog (GGML_CUDA_AR_WATCHDOG)

When compiled with -DGGML_CUDA_AR_WATCHDOG=ON, uses a debug kernel
variant that writes per-GPU spin diagnostics to pinned host memory.
A host-side blocking poll (cudaEventQuery + volatile reads) detects
hangs and logs WARN with the last observed arrival counters and spin
counts, controlled by GGML_CUDA_AR_WATCHDOG (ms timeout) and
GGML_CUDA_AR_MAX_SPIN (kernel bailout) env vars at runtime.

Zero overhead on the production path — all debug code is behind #ifdef.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: fix intermittent AllReduce hang on Blackwell PCIe

Add __threadfence_system() before the arrival signal write in
signal_set to ensure D2H data is globally visible before the peer
observes the arrival flag.  Without this fence, the peer could enter
Phase 3 host reads before the data had fully landed, causing an
intermittent deadlock on RTX 5090 (Blackwell, PCIe-only).

Also redesign the watchdog from a blocking dispatch-thread poll to a
non-blocking background thread, eliminating the ~20ms per-slot
latency the old design added.

Verified: 30/30 soak test runs clean at ~50 t/s (previously ~1-in-15
hang rate).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: fix watchdog shutdown ordering and pipeline_free drain

- Stop watchdog thread BEFORE destroying GPU resources (events, streams)
  to prevent polling destroyed handles → spurious "busy" readings
- Add cudaStreamSynchronize in pipeline_free to drain in-flight kernels
  before freeing pinned host buffers they may still be reading
- Sleep-first watchdog polling: no +0ms noise, only logs when a kernel
  is genuinely stuck past the poll interval
- Check wdog_stop in both outer and inner loops so join() returns
  promptly instead of draining the entire queue
- Add Phase 3 breadcrumbs to debug[3] for hang localization

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: replace event-based watchdog with per-GPU ring buffer

Completely rework the GGML_CUDA_AR_WATCHDOG system:

- Replace the shared debug_buf + event-polling + queue design with
  per-GPU ring buffers in pinned host memory
- Kernel writes a debug record only on spin-limit bailout: claims a
  ring slot via atomicAdd (single-GPU host atomics work on RTX 5090),
  writes fields, fences, sets completion flag, then all threads exit
- Watchdog thread simply polls ring head counters every 1ms and prints
  any new complete records — no CUDA event queries, no mutex, no queue
- Zero overhead on the dispatch path (no queue posting, no memset)
- Watchdog shutdown returns within ~1ms (atomic bool, no drain)
- On bailout the kernel skips Phase 3 entirely and exits cleanly

Verified: 20/20 prefill soak test clean at ~1112 t/s, no hangs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: normalize line endings to LF (undo Windows CRLF conversion)

Five files were inadvertently converted to CRLF by the Windows
development environment, causing every line to show as changed in
diffs against master.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* .gitattributes: force LF line endings to prevent Windows CRLF conversion

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: move GGML_CUDA_AR_WATCHDOG from CMake option to local define

The watchdog is development-only; a global CMake option is overkill.
Move the toggle to a #define at the top of allreduce.cu (set to 0 by
default) and remove the option from ggml/CMakeLists.txt and the CUDA
CMakeLists.txt add_compile_definitions block.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* unify kernel debug paths

* use __threadfence_system explicitly (not in ggml_cuda_ar_signal_set)

* preferentially use internal reduction for <=2 GPUs

* templatize the main kernel to support fp16/bf16

* restore llama-bench.cpp changes

* revert CMakeLists changes

* remove notes from repo

* remove dead warmup code

* fix comments

* improve reduction provider fallback code

* add messages for allreduce fallback

* rework reduction provider init to not call ncclCommInitAll if using the internal provider

* fix case where a given tensor has not been computed

* add chunked mode to the kernel for unlimited vector size

* rework a few checks/fallbacks

* various small cleanups

* allow disabling CUDA reductions completely (falling back to the non-CUDA butterfly mode)

* simplify reduction provider selection

* minor simplifications

* more cleanups/fixes

* prototype alternate path for large reductions

* chunked version of large reduction path

* use bf16 for large reductions

* experimental reduction using cudaMemcpyPeerAsync (slightly slower)

* revert experimental change

* add combined conversion/reduction kernel

* add bf16 wire format for single kernel mode

* experimental on-stream small reduction kernel

* double buffer arrival slots, use token (incrementing) method

* double buffer host_buf for small reductions

* put in waits for use of host_mem in large reduction case (prevents stomping on in-use memory)

* remove watchdog code

* various cleanups / dead code removal

* fix fp16 mode

* fix some comments/logging statements

* use increasing token scheme for arrival signals

* add top-level comment to allreduce.cu

* improve top-level comment in allreduce.cu

* fix comments in ggml_cuda_ar_kernel

* improve event handling for hostmem buffer usage tracking

* change ev_pool to fixed 2D array

* add chunked memcpy fallback for extra-large reductions (>32 MB)

* change thresholds for copy-engine path and bf16 demotion

* multi-block kernel test

* more fine-tuning for chunk size, etc.

* various fixes for PR review

* more PR fixes

* fix semantics of all host mappings

* require ampere+

* small cleanups

* properly use host pointer for src/dst in cudaMemcpy calls

* allreduce: lazy-init the internal pipeline on first use

A config that lives entirely on NCCL never needs the chunked-kernel
pipeline (host_buf, host_large, dev_tmp, streams, events, arrival ring).
Defer pipeline creation to the first try_allreduce_internal call using the
same std::call_once pattern as ensure_nccl, so those resources stay
unallocated when only NCCL is in use.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: assert n_backends == 2 instead of soft-fallback

ar_pipeline_init already requires n_devices == 2 and bails before any AR can
get here, so by the time we reach try_allreduce_internal we know we have
exactly two backends.  Replace the runtime-debug-log fallback with a hard
assert.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* rework reduction provider selection. internal/nccl is OS dependent; most fallbacks are removed

* remove unneeded Turing arch check (llama.cpp doesn't even compile pre-Turing anyway)

* allreduce: ASCII-only comments and ggml_cuda_cast for value conversions

Replace non-ASCII characters in comments (em dashes, right arrows) with
ASCII equivalents (--, ->) so the source stays in the ggml/upstream norm.

In the kernel-side code, replace static_cast<Twire>/static_cast<Tdst>
with ggml_cuda_cast<...> so the BF16 conversions go through the fast
__float2bfloat16 / __bfloat162float intrinsics from convert.cuh.  Pure
pointer and integer casts stay as static_cast.

Also drops two stray garbage tokens that snuck in from earlier merges
(a duplicated 'return ok; }' tail in allreduce.cu and a leftover '_reg)'
fragment in ggml-cuda.cu).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: use ggml_cuda_memcpy_1 for the chunked-kernel vector copies

The chunked kernel's two 16-byte register<->host transfers (Phase 1 store
and Phase 3 load) used reinterpret_cast<float4 *> on both sides.  Replace
with ggml_cuda_memcpy_1<sizeof(wire)>, which is the canonical helper for
this pattern and emits the same int4 LD/ST under the hood.

Conformance passes; 5x reruns of 70b internal pp512 show 1832-1836 t/s,
matching the prior matrix value of 1831 t/s -- no perf change as expected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: assert cuda_ctx->device matches the pipeline's device

Both ggml_cuda_ar_pipeline and ggml_backend_cuda_context carry the device
they were created for; if they ever disagree, every cuda call that follows
runs on the wrong device.  Add GGML_ASSERT at each cuda_ctx retrieval site
in the AR path so the misuse fails fast rather than silently corrupting.

Also: rename __nv_bfloat16 -> nv_bfloat16 (typedef alias) for consistency
with the rest of the file, and tighten one cudaGetLastError check to fire
only after the to_bf16 call that can actually fail.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: expand one-liner for loops to braced bodies

Code-style preference -- match the rest of the file by writing every for
loop with the body on its own braced line.  Three sites in the copy-engine
typed dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: rename template parameters Tdst/Twire/Tsrc -> T_dst/T_wire/T_src

Code-style preference per PR review -- T_dst/T_wire/T_src is more
consistent with surrounding code.  Whole-word rename across all 58 sites
in allreduce.cu (kernel definitions, internal uses, and comment text).

Realigned the parameter columns in three function signatures whose
T_src/T_dst lines shifted by 1 char relative to their non-templated
neighbors.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: drop hyphen in 'chunked-kernel' across comments

Per PR review feedback -- 'chunked kernel' (no hyphen) reads more naturally
in running prose, especially for ESL readers.  Pure comment-only change;
all 10 occurrences in allreduce.cu updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: use ggml_cuda_get_max_cpy_bytes() instead of hardcoded 16

The chunked kernel hardcoded a 16-byte vector unit; replace with the
ggml_cuda_get_max_cpy_bytes() helper that fattn-common.cuh uses for the
same purpose, so ELEMS_PER_VEC self-adjusts to the arch's widest
single-instruction copy.

Perf-neutral on supported targets (Volta+ returns 16).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* ggml-cuda: PR review fixes -- annotate #endif, fix stale comment, assert nbytes alignment

Three separate but minor changes from PR #22299 review feedback:

1. Annotate the five GGML_USE_NCCL #endif lines with the matching condition
   so the pairing is visible without scrolling back.

2. The comment block on ggml_backend_cuda_comm_context claimed NCCL is
   lazy-initialised; that was true at one point but the dispatch refactor
   (727b141c0) made both NCCL and the internal pipeline eager.  Rewrite
   the comment to match current behaviour.

3. Assert in ggml_backend_cuda_comm_allreduce_internal that the tensor's
   byte size is a 16-byte multiple.  The chunked-kernel issues full-width
   vector loads/stores, so this is a precondition; tensor-parallel splits
   of hidden-dim-multiples satisfy it trivially, but a hard assert turns
   any caller-side bug into a clear failure rather than UB.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
 device's new AR
records its ev.ker -- otherwise the second device's wait sees the first
device's just-recorded event (the in-flight new AR) and creates a circular
dependency with the in-kernel peer signal.  Two-pass dispatch (all waits,
then all launches) avoids this.

Bump POOL_SIZE 2 -> 8 (small memory cost, more breathing room for the
GPU's view of the event chain) and add a runtime env override for the
hybrid kernel chunk size (GGML_CUDA_AR_HYBRID_CHUNK_BYTES) for tuning.
One-shot stderr diagnostic at first AR prints the chosen path + sizing.

Result on 2x RTX 5090 Linux, 70b ub_sweep:

    ub=64   (1 MB AR): 913 -> 1036 t/s  (+13.5% vs old, +1.8% vs NCCL)
    ub=128  (2 MB AR): 1056 -> 1181     (+11.9%, +3.7% vs NCCL)
    ub=256  (4 MB AR): 1212 -> 1424     (+17.5%, +3.5% vs NCCL)

Internal now beats NCCL at every size (+1.8% to +15.6%), recovering all
ground in the 1-4 MB regime that was previously a 10-12% loss.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* simplify the init logic

* address some other PR requests

* ggml-cuda: stub internal AllReduce on HIP/MUSA, drop pre-Ampere mention, gate NCCL fallback warning on !HIP

The internal AllReduce relies on cudaHostAllocPortable/Mapped,
cudaHostGetDevicePointer, and __nanosleep -- none of which the HIP or
MUSA shims expose -- so wrap the implementation in
!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) and provide
nullptr/no-op/false stubs in the #else branch.  The dispatcher already
treats a null pipeline as init failure and silently falls back to the
meta backend's generic AllReduce, so HIP/MUSA builds compile clean and
behave correctly without further call-site changes.

PR review follow-ups:
 - drop "or pre-Ampere?" from the internal-init failure warning -- the
   kernel doesn't require Ampere or newer.
 - guard the "NCCL not compiled in" fallback warning behind
   !defined(GGML_USE_HIP); the suggestion to install NCCL only makes
   sense on NVIDIA builds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: guard __nanosleep on Volta+ and reject pre-Volta devices at init

__nanosleep is the only Volta-specific intrinsic in the kernel; wrap it
in #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA / NO_DEVICE_CODE so the file
still compiles cleanly when targeting older arches (the dispatcher's
init check below ensures the kernel is never actually launched on
pre-Volta).

Add a per-device compute-capability check in pipeline_init that returns
nullptr if any device is below sm70.  The dispatcher already treats
nullptr as init failure and silently falls back to the meta backend's
generic AllReduce.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: fix CI -Werror warnings (sign-compare, format, restrict alias, maybe-uninitialized)

The CUDA CI builds with -Werror -Wsign-compare -Wformat -Wrestrict
-Wmaybe-uninitialized.  Address each:

 - n_devices is size_t; change `int i; i < n_devices` to size_t in the
   three init loops, and the matching GGML_LOG_INFO format from %d to %zu.
 - ggml_cuda_ar_kernel was launched with sendbuf == recvbuf (in-place
   reduction), so the __restrict__ qualifiers on those parameters were
   technically UB.  Drop __restrict__ from sendbuf and recvbuf; an A/B
   sweep showed <0.6% perf delta (within noise) on Linux.
 - The buf/src/dst pointer arrays in ggml_cuda_ar_allreduce and the
   per-iteration arrays in ggml_cuda_ar_allreduce_copy_outer were
   declared with size GGML_CUDA_MAX_DEVICES but the loop only writes
   indices [0, n_devices); zero-initialise so the compiler sees the
   tail elements as defined.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* ggml-cuda: drop unused-function warning by guarding try_allreduce_nccl behind GGML_USE_NCCL

The only call site (in init_nccl) is already inside #ifdef GGML_USE_NCCL,
so the function is unreferenced in non-NCCL builds and trips
nvcc's -Werror=unused-function check.  Move the guard from inside the
function body to around the entire definition.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-10 11:05:22 +02:00
Sigbjørn Skjæret
5755a100cd model : fix model type check for granite/llama3 and deepseek2/glm4.7 lite (#22870) 2026-05-10 08:44:29 +02:00
Sumit Chatterjee
1e5ad35d56 model : add sarvam_moe architecture support (#20275) 2026-05-09 16:31:50 +02:00
Yuannan
65d7a8bbf0 devops : updated Nix systems (#22869) 2026-05-09 17:15:03 +03:00
Davi Henrique Linhares
00d56b11c3 docker : upgraded the default intel compute-runtime version (#22567) 2026-05-09 10:22:23 +02:00
20 changed files with 3122 additions and 130 deletions

View File

@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGC_VERSION=v2.32.7
ARG IGC_VERSION_FULL=2_2.32.7+21184
ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
ARG IGDGMM_VERSION=22.9.0
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \

View File

@@ -103,6 +103,7 @@ let
vulkan-headers
vulkan-loader
shaderc
spirv-headers
];
in
@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
ninja
pkg-config
git
spirv-headers
]
++ optionals useCuda [
cudaPackages.cuda_nvcc

1
.gitignore vendored
View File

@@ -110,6 +110,7 @@ uv.lock
# Nix
flake.lock
/result
# Test binaries

View File

@@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
res = "f2llmv2"
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
# ref: https://huggingface.co/sarvamai/sarvam-30b
res = "sarvam-moe"
if res is None:
logger.warning("\n")
@@ -11591,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
class SarvamMoEModel(BailingMoeV2Model):
model_arch = gguf.MODEL_ARCH.BAILINGMOE2
# Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
# - full rotary (no partial_rotary_factor)
# - expert bias is zero-mean normalized at load time
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
# Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
self.gguf_writer.add_rope_dimension_count(rope_dim)
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
if name.endswith(".expert_bias"):
# Sarvam normalizes expert bias to zero mean
inner = gen
def gen():
t = inner()
return t - t.mean()
return super().filter_tensors((name, gen))
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GROVEMOE

View File

@@ -155,6 +155,7 @@ models = [
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View File

@@ -0,0 +1,26 @@
# llama-eval
Simple evaluation tool for llama.cpp with support for multiple datasets.
For a full description, usage examples, and sample results, see:
- [PR 21152](https://github.com/ggml-org/llama.cpp/pull/21152)
## Quick start
```bash
# Single server
python3 llama-eval.py \
--server http://localhost:8033 \
--model my-model \
--dataset gsm8k --n_cases 100 \
--grader-type regex --threads 32
# Multiple servers (comma-separated URLs and thread counts)
python3 llama-eval.py \
--server http://gpu1:8033,http://gpu2:8033 \
--server-name gpu1,gpu2 \
--threads 16,16 \
--dataset aime2025 --n_cases 240 \
--grader-type regex
```

1428
examples/llama-eval/llama-eval.py Executable file

File diff suppressed because it is too large

View File

@@ -0,0 +1,317 @@
#!/usr/bin/env python3
import argparse
import json
import random
import re
import time
import sys
import os
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from typing import Dict, List, Optional
from dataclasses import dataclass
from pathlib import Path
import datasets
# Set cache directory for HuggingFace datasets
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
cache_dir.mkdir(parents=True, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
def dice(s1: str, s2: str) -> float:
"""Calculate Dice coefficient between two strings based on bigram overlap."""
if not s1 and not s2:
return 1.0
def _bigrams(s: str):
return [s[i : i + 2] for i in range(len(s) - 1)]
bigrams1 = _bigrams(s1)
bigrams2 = _bigrams(s2)
if not bigrams1 and not bigrams2:
return 1.0
from collections import Counter
freq1 = Counter(bigrams1)
freq2 = Counter(bigrams2)
intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
return dice_coeff
def debug_log(message: str):
"""Log debug messages to both stdout and a file"""
print(message, file=sys.stderr)
with open("/tmp/simulator-debug.log", "a") as f:
f.write(message + "\n")
simulator: Optional["Simulator"] = None
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict]
sampling_config: Dict
def normalize_number(s: str) -> Optional[int]:
match = re.match(r"\d+", s) # match digits from the start
if not match:
return None
return int(match.group(0))
class AimeDataset:
def __init__(self, split: str = "train"):
self.split = split
self.questions: List[Dict] = []
self._load_dataset()
def _load_dataset(self):
print(f"Loading AIME dataset (split: {self.split})...")
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
else:
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
self.questions = list(ds)
print(f"AIME dataset loaded: {len(self.questions)} questions")
def find_question(self, request_text: str) -> Optional[Dict]:
best_match = None
best_distance = -1
best_index = -1
for i, question in enumerate(self.questions):
question_text = question["problem"]
request_lower = request_text.lower()
question_lower = question_text.lower()
# Exact match
if question_lower == request_lower:
debug_log(f"DEBUG: Found exact match at index {i}")
return question
# Remove LaTeX formatting for more flexible matching
question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
if question_no_latex.lower() == request_lower:
debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
return question
# Calculate Dice coefficient for partial matches
# Only consider if request is at least 50% of question length
if len(request_lower) >= len(question_lower) * 0.5:
distance = dice(question_lower, request_lower)
if distance > best_distance:
best_distance = distance
best_match = question
best_index = i
if best_match and best_distance > 0.3: # Threshold for partial match
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
return best_match
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
return None
def get_answer(self, question: Dict) -> str:
answer = question["answer"]
if isinstance(answer, str):
normalized = normalize_number(answer)
return str(normalized) if normalized is not None else answer
return str(answer)
class Simulator:
def __init__(
self,
port: int = 8033,
host: str = "localhost",
success_rate: float = 0.8,
dataset_split: str = "train"
):
self.port = port
self.host = host
self.success_rate = success_rate
self.dataset = AimeDataset(dataset_split)
self.eval_state = EvalState(
id="aime-2025",
tasks=["aime"],
task_states={},
sampling_config={"temperature": 0, "max_tokens": 2048}
)
def _generate_response(
self,
question: Dict,
should_be_correct: bool
) -> Dict:
expected_answer = self.dataset.get_answer(question)
if should_be_correct:
response_text = expected_answer
else:
response_text = self._generate_wrong_answer(question)
return {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": "llama",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
}
}
def _generate_wrong_answer(self, question: Dict) -> str:
expected_answer = self.dataset.get_answer(question)
if expected_answer.isdigit():
wrong_answer = str(int(expected_answer) + 1)
else:
wrong_answer = expected_answer + " (wrong)"
return wrong_answer
def _process_request(self, request_data: Dict) -> Dict:
messages = request_data.get("messages", [])
if not messages:
return {"error": "No messages in request"}
request_text = messages[0].get("content", "")
debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
question = self.dataset.find_question(request_text)
if not question:
debug_log(f"DEBUG: find_question returned None")
return {"error": "No matching question found"}
should_be_correct = random.random() < self.success_rate
response = self._generate_response(question, should_be_correct)
task_id = "aime"
self.eval_state.task_states[task_id] = {
"correct": should_be_correct,
"expected": self.dataset.get_answer(question),
"predicted": response["choices"][0]["message"]["content"]
}
return response
class RequestHandler(BaseHTTPRequestHandler):
def do_POST(self):
if self.path != "/v1/chat/completions":
self._send_json({"error": "Not found"}, 404)
return
try:
content_length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(content_length)
request_data = json.loads(body) if body else None
if not request_data:
self._send_json({"error": "Invalid JSON"}, 400)
return
if simulator is None:
self._send_json({"error": "Simulator not initialized"}, 500)
return
response = simulator._process_request(request_data)
self._send_json(response, 200)
except json.JSONDecodeError:
self._send_json({"error": "Invalid JSON"}, 400)
except Exception as e:
print(f"Error processing request: {e}")
self._send_json({"error": str(e)}, 500)
def _send_json(self, data: dict, status: int = 200):
body = json.dumps(data).encode("utf-8")
self.send_response(status)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", str(len(body)))
self.end_headers()
self.wfile.write(body)
def log_message(self, format, *args):
# Suppress default request logging
pass
def main():
parser = argparse.ArgumentParser(
description="llama-server simulator for testing eval scripts"
)
parser.add_argument(
"--port",
type=int,
default=8033,
help="Server port (default: 8033)"
)
parser.add_argument(
"--host",
type=str,
default="localhost",
help="Server host (default: localhost)"
)
parser.add_argument(
"--success-rate",
type=float,
default=0.8,
help="Success rate 0-1 (default: 0.8)"
)
parser.add_argument(
"--dataset-split",
type=str,
default="train",
help="AIME dataset split to use (default: train)"
)
args = parser.parse_args()
global simulator
simulator = Simulator(
port=args.port,
host=args.host,
success_rate=args.success_rate,
dataset_split=args.dataset_split
)
server = HTTPServer((args.host, args.port), RequestHandler)
server_thread = threading.Thread(target=server.serve_forever, daemon=True)
server_thread.start()
print("\n=== llama-server-simulator ===")
print(f"Server running on http://{args.host}:{args.port}")
print(f"Success rate: {args.success_rate}")
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
print("\nPress Ctrl+C to stop\n")
try:
server_thread.join()
except KeyboardInterrupt:
print("\nShutting down...")
server.shutdown()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/bin/bash
set -e
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "=== llama-server-simulator Test Script ==="
echo ""
PORT=8033
SUCCESS_RATE=0.8
TEST_PORT=8034
echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
source "$SCRIPT_DIR/venv/bin/activate"
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
SIMULATOR_PID=$!
echo "Waiting for simulator to start..."
sleep 5
# Helper function to make a request and extract the answer
make_request() {
local question="$1"
curl -s -X POST http://localhost:$PORT/v1/chat/completions \
-H "Content-Type: application/json" \
-d "{
\"model\": \"llama\",
\"messages\": [
{\"role\": \"user\", \"content\": \"$question\"}
],
\"temperature\": 0,
\"max_tokens\": 2048
}" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
}
# Test question (repeated in multiple tests)
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
echo ""
echo "=== Test 1: Correct Answer ==="
echo "Sending request with known question..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 2: Wrong Answer ==="
echo "Sending request with known question (success rate 0.0)..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 3: No Matching Question ==="
echo "Sending request with non-matching text..."
response=$(make_request "What is the capital of France?")
echo "Response: $response"
echo "Expected: No matching question found"
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 4: Success Rate Verification ==="
echo "Sending 10 requests to test success rate..."
correct_count=0
for i in {1..10}; do
answer=$(make_request "$TEST_QUESTION")
if [ "$answer" == "116" ]; then
correct_count=$((correct_count + 1))
fi
echo " Request $i: Answer = $answer"
done
echo "Correct answers: $correct_count/10"
echo "Expected: ~8/10 (80% success rate)"
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"
echo ""
echo "=== Test Complete ==="
echo "Stopping simulator..."
kill $SIMULATOR_PID 2>/dev/null
wait $SIMULATOR_PID 2>/dev/null || true
echo "Simulator stopped."

58
flake.lock generated
View File

@@ -1,58 +0,0 @@
{
"nodes": {
"flake-parts": {
"inputs": {
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1730504689,
"narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "506278e768c2a08bec68eb62932193e341f55c90",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "flake-parts",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1732014248,
"narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1730504152,
"narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
},
"original": {
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
}
},
"root": {
"inputs": {
"flake-parts": "flake-parts",
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

View File

@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 11)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_PATCH 1)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

View File

@@ -0,0 +1,968 @@
#include "allreduce.cuh"
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#include "convert.cuh"
#include "ggml-impl.h"
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <limits>
// ---------------------------------------------------------------------------
// CUDA AllReduce for tensor-parallel inference across two GPUs.
//
// Provides an in-place sum reduction over matching tensors on two CUDA
// devices in the same process. Used by the tensor-split path alongside
// NCCL; targets setups without NVLink, where data is exchanged between the
// GPUs by staging it through pinned host memory over PCIe.
//
// Two reduction strategies are selected per call by tensor size:
//
// * Chunked kernel path (small reductions): a single CUDA kernel both
// stages data through pinned host memory and performs the local sum.
// Cross-GPU synchronization happens *inside the kernel* (busy-wait on
// a host-memory flag), which keeps launch overhead low for the
// latency-sensitive token-generation case.
//
// * Copy-engine path (large reductions): the transfer is split into
// D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
// followed by a small device-side add kernel. Cross-GPU
// synchronization happens *outside the kernel*, via CUDA events
// between streams. This keeps the compute engine free while large
// transfers are in flight, which matters for prefill-sized tensors.
// Reductions larger than the per-call inner cap are processed by an
// outer chunker that issues sequential inner calls.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Cross-GPU signal mechanism
//
// One int per (slot, rank) pair in pinned host memory. Each AR call writes a
// strictly increasing token (= the AR call number) into its own arrival int.
// The peer spins until its read of the other's arrival int equals the token
// it expects for this call -- a mismatch means the peer hasn't arrived yet.
// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
// days at thousands of ARs/sec), so arrival ints don't need to be reset
// between calls; we initialize once at pipeline init and let the values
// accumulate.
//
// There is exactly one writer (the owning GPU) and one reader (the peer), so
// we don't need atomics. A volatile store paired with __threadfence_system()
// provides the release ordering that makes the D2H writes visible system-wide
// before the arrival token is observed.
//
// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
// portable choice.
// ---------------------------------------------------------------------------
static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
*(volatile int *)p = token;
}
static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
return *(const volatile int *)p;
}
// Byte spacing between adjacent arrival ints. 64 bytes (one cache line)
// ensures each GPU/block's arrival slot lives on its own line, preventing
// false-sharing stalls on the polling GPU.
static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;
// Number of blocks the chunked kernel launches with. Each block stripes a
// disjoint slice of the data and synchronizes through its own arrival-token
// slot so multiple SMs can pump PCIe stores in parallel.
static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;
// ---------------------------------------------------------------------------
// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
//
// Both GPUs run this kernel simultaneously on independent streams. sendbuf
// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When
// T_dst == T_wire the casts below are no-ops.
//
// Each GPU runs three phases:
//
// Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
// single-instruction-width vectors into host_mine.
// __threadfence_system() commits these writes to host
// memory.
// Phase 2 (thread 0): write token to arrival_mine; spin until
// arrival_other == token.
// Phase 3 (all threads): read T_wire vectors from host_other, cast
// each element to T_dst, and sum with the local
// sendbuf value (also rounded through T_wire so that
// both GPUs truncate identically -- this guarantees
// bit-equivalent results across the two devices).
//
// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
// threads to keep multiple SMs issuing PCIe stores in parallel. Each block
// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
// thread 0 of each block signals/spins on that slot independently of other
// blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are
// handled only by block 0 so that no two blocks write the same tail elements.
// ---------------------------------------------------------------------------
template <typename T_dst, typename T_wire>
static __global__ void ggml_cuda_ar_kernel(
const T_dst * sendbuf,
T_dst * recvbuf,
T_wire * __restrict__ host_mine,
const T_wire * __restrict__ host_other,
int count,
int * arrival_mine,
int * arrival_other,
int token) {
// Vector unit for the wire type, sized to the arch's widest single-instruction
// copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory;
// each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
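    // e.g. with the 16-byte single-instruction copies available on Volta+,
    // ELEMS_PER_VEC is 4 for a float wire and 8 for a half/bfloat16 wire;
    // ARRIVAL_INTS is 64/4 = 16.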
const int tid = threadIdx.x;
const int nt = blockDim.x;
const int bid = blockIdx.x;
const int gtid = bid * nt + tid;
const int gnt = gridDim.x * nt;
const int count_vec = count / ELEMS_PER_VEC;
const int tail = count_vec * ELEMS_PER_VEC;
// Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
{
for (int i = gtid; i < count_vec; i += gnt) {
const int off = i * ELEMS_PER_VEC;
T_wire wire[ELEMS_PER_VEC];
#pragma unroll
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
}
ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
}
if (bid == 0 && tid < count - tail) {
host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
}
}
// Commit this block's host writes before signalling.
__threadfence_system();
__syncthreads();
// Phase 2: thread 0 of each block signals on its own arrival slot, then
// spins for the matching slot from peer. Per-block tokens mean blocks
// proceed independently -- no inter-block barrier needed.
if (tid == 0) {
int * my_slot = arrival_mine + bid * ARRIVAL_INTS;
const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
ggml_cuda_ar_signal_set(my_slot, token);
__threadfence_system(); // make our signal visible system-wide
while (ggml_cuda_ar_signal_get(other_slot) != token) {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__nanosleep(100);
#else
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
}
}
__syncthreads();
// Acquire peer's host_other writes (this block's stripe of them).
__threadfence_system();
// Phase 3: read peer's T_wire vector, cast both sides through T_wire for
// bit-equivalence, sum in T_dst precision, and write back to recvbuf.
{
for (int i = gtid; i < count_vec; i += gnt) {
const int off = i * ELEMS_PER_VEC;
T_wire wire[ELEMS_PER_VEC];
ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
#pragma unroll
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
}
}
if (bid == 0 && tid < count - tail) {
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
recvbuf[tail + tid] =
ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
}
}
}
// Combined load-convert-add kernel. The peer's contribution arrives as T_src
// (which may be a lower-precision type than T_dst when the BF16 round-trip is
// active). For bit-equivalence between the two GPUs, dst is first rounded
// through T_src's precision via ggml_cuda_cast -- peer already truncated its
// own value the same way before sending -- so both sides perform identical
// arithmetic. When T_dst == T_src the round-trip cast is a no-op.
template <typename T_dst, typename T_src>
static __global__ void ggml_cuda_ar_add_kernel(
T_dst * __restrict__ dst,
const T_src * __restrict__ src,
int count) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int nt = gridDim.x * blockDim.x;
for (int i = tid; i < count; i += nt) {
const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
}
}
// ---------------------------------------------------------------------------
// Pipeline structure
// ---------------------------------------------------------------------------
// Number of slots in the event / arrival ring. Two slots is sufficient:
// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
// from AR N-2 by the time we get to AR N. acquire_slot's
// cudaEventSynchronize on ev.ker for both devices makes that consumption
// explicit before we overwrite host_buf[slot] for the new AR.
static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;
// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
// Larger tensors are reduced by issuing multiple chunked launches.
static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB
// Copy-engine path: largest tensor accepted on this path; sets host_large /
// dev_tmp allocation size.
static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB
// AR wire size at which the copy-engine path takes over from the chunked-
// kernel path. Override via GGML_CUDA_AR_COPY_THRESHOLD.
static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
// the peer); the clamps cover the cases where nbytes/4 is too small (per-
// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024; // 512 KB
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024; // 2 MB
// Absolute floor that an env-var override is allowed to set; this caps the
// per-slot copy-event array. 256 KB -> up to 128 chunks per 32 MB tensor.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
struct ggml_cuda_ar_event_slot {
cudaEvent_t app = nullptr; // upstream computation complete
cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete
cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream)
cudaEvent_t ker = nullptr; // AllReduce kernel complete
};
// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
// in one place, with the host handle preserved for cudaFreeHost. Used where
// the CPU never touches the buffer -- only the device reads/writes via the
// mapped device pointer. Required on systems where cudaDevAttrCanUseHost-
// PointerForRegisteredMem is 0 and the host pointer can't be used as a
// device pointer.
struct ggml_cuda_ar_host_mapping {
uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset
cudaError_t alloc(size_t bytes) {
cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
cudaHostAllocPortable | cudaHostAllocMapped);
if (rc != cudaSuccess) {
host = nullptr;
return rc;
}
rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
if (rc != cudaSuccess) {
cudaFreeHost(host);
host = nullptr;
dev = nullptr;
}
return rc;
}
void free() {
if (host) {
cudaFreeHost(host);
host = nullptr;
dev = nullptr;
}
}
};
struct ggml_cuda_ar_pipeline {
int n_devices;
int devices[GGML_CUDA_MAX_DEVICES];
size_t buf_bytes; // bytes per device in host_buf[]
size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[]
size_t copy_threshold;
size_t copy_chunk_bytes;
size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
uint64_t call_count;
// Per-device resources.
ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel)
ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path
cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking
ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
// Copy-engine: per-device "I finished reading my peer's host_large"
// event. Indexed by RECORDER device. Recorded same-device on streams[i]
// after stage 2's last H2D from host_large[peer]. Waited cross-device
// by peer's stage-1 stream before the next AR overwrites host_large[peer].
cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES];
bool host_large_read_done_valid;
// Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
// Recorded on the compute stream after each add_kernel; the AR stream
// waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us
// single-buffer dev_tmp despite add_kernel running on a separate stream.
cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
bool dev_tmp_kernel_done_valid;
// Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned
// memory; CPU never reads/writes -- only the kernel and cudaMemset.
// Use ggml_cuda_ar_arrival_ptr() to index.
ggml_cuda_ar_host_mapping arrival;
};
// Base pointer for the (slot, rank) per-block token block. The kernel adds
// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
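// Layout example: with 2 slots, 2 ranks, 8 blocks and a 64-byte stride the
// ring occupies 2*2*8*64 = 2048 bytes; (slot=1, rank=0) starts at byte offset
// (1*2 + 0)*8*64 = 1024.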
static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
const size_t offset = ((size_t)slot * p->n_devices + rank) *
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
return reinterpret_cast<int *>(p->arrival.dev + offset);
}
static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
const char * value = getenv(name);
if (value == nullptr || value[0] == '\0') {
return default_value;
}
char * end = nullptr;
const unsigned long long parsed = strtoull(value, &end, 10);
return end != value ? (uint64_t) parsed : default_value;
}
struct ggml_cuda_ar_slot_info {
int slot;
int token;
};
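// Example sequence: AR call 0 -> slot 0, token 1; call 1 -> slot 1, token 2;
// call 2 wraps back to slot 0 and first cudaEventSynchronize()s both devices'
// ev.ker from call 0 before that slot's buffers are reused.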
static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
p->call_count++;
if (pool_lapped) {
for (int i = 0; i < p->n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
}
}
return { slot, (int) p->call_count };
}
// Per-AR copy-engine chunk size: env-var override if set, else heuristic
// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
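// e.g. a 4 MB reduction gets 1 MB chunks, a 512 KB reduction is clamped up to
// a single 512 KB chunk, and a 32 MB reduction is clamped down to 2 MB chunks
// (16 chunks total).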
static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
if (p->copy_chunk_bytes > 0) {
return p->copy_chunk_bytes;
}
return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
}
static void ggml_cuda_ar_wait_for_compute(
ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
}
// ---------------------------------------------------------------------------
// Init / free
// ---------------------------------------------------------------------------
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
if (n_devices != 2) {
GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
"falling back\n", __func__, n_devices);
return nullptr;
}
// The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
for (size_t i = 0; i < n_devices; ++i) {
const int cc = ggml_cuda_info().devices[devices[i]].cc;
if (cc < GGML_CUDA_CC_VOLTA) {
GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
"(device %d has cc=%d); falling back\n",
__func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
return nullptr;
}
}
auto * p = new ggml_cuda_ar_pipeline{};
p->n_devices = n_devices;
p->copy_bytes = GGML_CUDA_AR_COPY_MAX_BYTES;
p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
// 0 = use the per-call heuristic (default). Non-zero env value forces a
// fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
__func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
}
// Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
// ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
// byte threshold to opt out for small tensors.
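    // e.g. GGML_CUDA_AR_BF16_THRESHOLD=1048576 keeps F32 tensors smaller than
    // 1 MB in full F32 on the wire while still rounding larger ones to BF16.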
p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
for (size_t i = 0; i < n_devices; ++i) {
p->devices[i] = devices[i];
}
// Per-device streams and event pools.
for (size_t i = 0; i < n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
cudaStream_t stream = nullptr;
if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
p->streams[i] = stream;
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
bool ok =
cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
}
if (!ok) {
GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
__func__, p->devices[i], s);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
// Arrival ring: cache-line padded so each GPU's int is on its own line.
const size_t arrival_bytes =
(size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
__func__, arrival_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
ggml_cuda_set_device(p->devices[0]);
if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
__func__, arrival_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
// Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
// kernel can write the next slot's data while the peer is still reading
// the previous slot's. Indexed by (slot * buf_bytes) at the call site.
p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
for (size_t i = 0; i < n_devices; ++i) {
if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
__func__, host_buf_total);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
// Copy-engine path: pinned host staging + device scratch, sized for the
// largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
// dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
// cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
for (size_t i = 0; i < n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
__func__, p->copy_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
__func__, p->copy_bytes, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
"%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
__func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
return p;
}
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
if (!p) {
return;
}
// Drain all in-flight kernels before tearing down resources.
for (int i = 0; i < p->n_devices; ++i) {
if (p->streams[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaStreamSynchronize(p->streams[i]);
}
}
for (int i = 0; i < p->n_devices; ++i) {
p->host_buf[i].free();
p->host_large[i].free();
if (p->dev_tmp[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaFree(p->dev_tmp[i]);
}
ggml_cuda_set_device(p->devices[i]);
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); }
for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); }
}
if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); }
if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); }
}
if (p->host_large_read_done[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaEventDestroy(p->host_large_read_done[i]);
}
if (p->dev_tmp_kernel_done[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaEventDestroy(p->dev_tmp_kernel_done[i]);
}
if (p->streams[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaStreamDestroy(p->streams[i]);
}
}
p->arrival.free();
delete p;
}
// ---------------------------------------------------------------------------
// Dispatch
// ---------------------------------------------------------------------------
// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
// nbytes per ne element); accumulated locally into a T_dst buffer. When
// T_src == T_dst this is the original homogeneous reduction. When they differ
// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
// for bit-equivalence between GPUs and we skip the otherwise-needed
// post-conversion entirely.
template <typename T_src, typename T_dst>
static bool ggml_cuda_ar_allreduce_copy_impl(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
const bool compute[GGML_CUDA_MAX_DEVICES],
int64_t ne,
size_t nbytes) {
GGML_ASSERT(p->n_devices == 2);
GGML_ASSERT(nbytes <= p->copy_bytes);
GGML_ASSERT(ne <= std::numeric_limits<int>::max());
const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
GGML_ASSERT(chunk_bytes > 0);
const int slot = ggml_cuda_ar_acquire_slot(p).slot;
const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
ggml_backend_cuda_context * cuda_ctx[2] = {};
// Stage 1: both GPUs copy their local contribution to pinned host memory.
for (int i = 0; i < 2; ++i) {
ggml_cuda_set_device(p->devices[i]);
cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
// Wait for peer's H2D from our host_large[i] (recorded in the
// previous AR's stage 2) to complete before we overwrite host_large[i].
// host_large_read_done[peer] = peer finished reading host_large[i].
// No-op on the first AR -- no prior record exists.
if (p->host_large_read_done_valid) {
const int peer = 1 - i;
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
}
if (!compute[i]) {
CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
}
for (size_t c = 0; c < copy_chunks; ++c) {
const size_t offset = c * chunk_bytes;
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
(nbytes - offset) : chunk_bytes;
CUDA_CHECK(cudaMemcpyAsync(
p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
cudaMemcpyDeviceToHost, p->streams[i]));
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
}
}
// Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
// local device scratch (dev_tmp), then performs one device-local add over
// the assembled peer tensor. The H2Ds run on the AR stream (copy engine)
// and the add_kernel runs on the caller's compute stream, so the AR stream
// stays pure-copy and avoids an in-stream copy->compute engine switch every
// AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the
// prior AR's add_kernel-done event before overwriting it.
for (int i = 0; i < 2; ++i) {
const int peer = 1 - i;
ggml_cuda_set_device(p->devices[i]);
// Wait for the previous AR's add_kernel (on the compute stream) to
// finish reading dev_tmp before our H2D overwrites it. No-op on the
// first copy_impl call.
if (p->dev_tmp_kernel_done_valid) {
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
}
for (size_t c = 0; c < copy_chunks; ++c) {
const size_t offset = c * chunk_bytes;
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
(nbytes - offset) : chunk_bytes;
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
CUDA_CHECK(cudaMemcpyAsync(
p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
cudaMemcpyHostToDevice, p->streams[i]));
}
// Mark our reads of host_large[peer] complete so peer's next AR can
// safely overwrite it.
CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
// Hand off from AR stream (copy engine) to compute stream: compute
// stream waits for all H2Ds to finish, then runs the add_kernel.
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
const int block_size = 256;
int n_blocks = (int) ((ne + block_size - 1) / block_size);
if (n_blocks > 1024) {
n_blocks = 1024;
}
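        // e.g. ne = 2^20 elements -> 4096 blocks, capped to 1024; the
        // grid-stride loop inside the add kernel covers the remainder.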
ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
dst_buf[i],
reinterpret_cast<const T_src *>(p->dev_tmp[i]),
(int) ne);
CUDA_CHECK(cudaGetLastError());
// Record dev_tmp-released on the compute stream so the next copy_impl
// can wait for the kernel to finish before overwriting dev_tmp. Also
// record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
}
p->host_large_read_done_valid = true;
p->dev_tmp_kernel_done_valid = true;
return true;
}
// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
// the host_large / dev_tmp allocation size). When the full AR exceeds that,
// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
// slot, so cross-AR fences and pool wraparound work the same way as for any
// other sequence of small ARs.
template <typename T_src, typename T_dst>
static bool ggml_cuda_ar_allreduce_copy_outer(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
const bool compute[GGML_CUDA_MAX_DEVICES],
int64_t ne) {
const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
GGML_ASSERT(outer_max_elems > 0);
bool ok = true;
for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start);
const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src);
T_src * src[GGML_CUDA_MAX_DEVICES] = {};
T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < p->n_devices; ++i) {
src[i] = src_buf[i] + outer_start;
dst[i] = dst_buf[i] + outer_start;
}
ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
p, backends, src, dst, compute, outer_ne, outer_nbytes);
}
return ok;
}
bool ggml_cuda_ar_allreduce(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
ggml_tensor ** tensors) {
GGML_ASSERT(p != nullptr);
const int n = p->n_devices;
GGML_ASSERT(n == 2);
const ggml_type input_type = tensors[0]->type;
GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
const int64_t ne = ggml_nelements(tensors[0]);
GGML_ASSERT(ne > 0);
const size_t input_nbytes = ggml_nbytes(tensors[0]);
// BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
// the reduction (chunked or copy-engine), halving on-wire bytes. Matches
// NCCL's behaviour. The pre-conversion zeroes inactive shards so the
// inner paths see them as already-prepared compute tensors.
const bool use_bf16 =
input_type == GGML_TYPE_F32 &&
p->bf16_threshold > 0 &&
input_nbytes >= p->bf16_threshold;
const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
const size_t type_size = ggml_type_size(kernel_type);
GGML_ASSERT(p->buf_bytes >= type_size);
const size_t nbytes = (size_t) ne * type_size;
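    // e.g. a 4 MB F32 tensor (2^20 elements) reduced over a BF16 wire stages
    // 2 MB per GPU through pinned host memory instead of 4 MB.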
bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
}
// Decide between copy-engine and chunked kernel paths based on the working
// type's actual byte count. No upper bound: copy_outer slices reductions
// larger than copy_bytes into copy_bytes-sized pieces.
const bool use_copy_engine =
p->copy_threshold > 0 &&
nbytes >= p->copy_threshold;
// BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
// (chunked kernel path) and the combined add kernel (copy_engine path)
// both accumulate into the F32 tensor data directly, so an inactive
// shard's accumulator must start at zero.
if (use_bf16) {
for (int i = 0; i < n; ++i) {
if (!compute_flag[i]) {
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
ggml_cuda_set_device(p->devices[i]);
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
}
}
}
// Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
// path; the chunked kernel path's combined kernel does the conversion
// inline as it writes to host_buf.
ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
if (use_copy_engine && use_bf16) {
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
for (int i = 0; i < n; ++i) {
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
bf16_tmp[i].pool = &cuda_ctx->pool();
bf16_tmp[i].alloc(ne);
ggml_cuda_set_device(p->devices[i]);
if (compute_flag[i]) {
to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
} else {
CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
}
copy_src_ptr[i] = bf16_tmp[i].get();
}
}
bool ok = true;
if (use_copy_engine) {
// After up-front BF16 conversion, the tmp buffers already hold the
// (possibly zeroed-for-inactive) data, so the inner path can treat
// every shard as compute.
bool inner_compute[GGML_CUDA_MAX_DEVICES];
for (int i = 0; i < n; ++i) {
inner_compute[i] = use_bf16 ? true : compute_flag[i];
}
// Dispatch into copy_impl with explicit src/dst types. When use_bf16
// is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
// is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
// through BF16 for bit-equivalence and writes F32 directly, so no
// post-conversion is needed. Otherwise src == dst (same native type).
if (use_bf16) {
GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
float * dst[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
dst[i] = static_cast<float *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
p, backends, src, dst, inner_compute, ne);
} else {
switch (kernel_type) {
case GGML_TYPE_F32: {
float * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<float *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
p, backends, buf, buf, inner_compute, ne);
break;
}
case GGML_TYPE_BF16: {
nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
p, backends, buf, buf, inner_compute, ne);
break;
}
case GGML_TYPE_F16: {
half * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<half *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
p, backends, buf, buf, inner_compute, ne);
break;
}
default:
GGML_ASSERT(false);
}
}
} else {
// host_buf carries T_wire-typed data; max_chunk_elems is the count that
// fits in one host_buf at the wire size.
const size_t max_chunk_elems = p->buf_bytes / type_size;
const size_t input_type_size = ggml_type_size(input_type);
// Chunked kernel path runs entirely on the caller's compute stream:
// since AR is a barrier here, same-stream ordering subsumes any
// cross-stream event handshake that the copy-engine path needs, and
// skips the cross-stream scheduling overhead that was hurting the
// small-tensor (tg) latency on the AR-stream variant. Only ev.ker is
// still recorded at end-of-AR for acquire_slot's pool-wraparound check.
for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
const size_t remaining_elems = (size_t) (ne - chunk_start);
const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
const size_t chunk_dst_bytes = chunk_elems * input_type_size;
const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
for (int i = 0; i < n; ++i) {
const int peer = 1 - i; // valid for n == 2 only
ggml_cuda_set_device(p->devices[i]);
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
cudaStream_t stream = cuda_ctx->stream();
char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;
// Match NCCL/meta-backend semantics: inactive shards contribute
// zeros. On the BF16 path the F32 tensor data was already
// zeroed up-front (above), so per-chunk zeroing isn't needed.
if (!compute_flag[i] && !use_bf16) {
CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
}
#define LAUNCH_AR_KERNEL(T_dst, T_wire) \
ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
reinterpret_cast<const T_dst *>(data), \
reinterpret_cast<T_dst *>(data), \
reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
static_cast<int>(chunk_elems), \
ggml_cuda_ar_arrival_ptr(p, slot, i), \
ggml_cuda_ar_arrival_ptr(p, slot, peer), \
token)
if (use_bf16) {
GGML_ASSERT(input_type == GGML_TYPE_F32);
LAUNCH_AR_KERNEL(float, nv_bfloat16);
} else {
switch (input_type) {
case GGML_TYPE_F32: LAUNCH_AR_KERNEL(float, float); break;
case GGML_TYPE_F16: LAUNCH_AR_KERNEL(half, half); break;
case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
default: GGML_ASSERT(false);
}
}
#undef LAUNCH_AR_KERNEL
CUDA_CHECK(cudaGetLastError());
if (last_chunk) {
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
}
}
}
}
return ok;
}
#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
// implementation relies on, so the internal AllReduce is a CUDA-only feature.
// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
// and silently falls back to the meta backend's generic AllReduce.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
return nullptr;
}
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {
}
bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
return false;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

View File

@@ -0,0 +1,29 @@
#pragma once
#include "common.cuh"
#include "ggml-backend-impl.h"
#include <cstddef>
// Opaque pipeline context -- owns all pinned buffers, streams, and events.
struct ggml_cuda_ar_pipeline;
// Allocate a pipeline for n_devices GPUs.
// devices[] holds the CUDA device IDs in rank order.
// Returns nullptr on allocation failure.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
const int * devices, size_t n_devices);
// Release all resources owned by the pipeline.
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
// tensors[i] must live on the device managed by backends[i] and be
// contiguous F32, F16, or BF16.
// Preconditions are checked by the CUDA comm dispatcher before calling this.
// Returns true once the reduction work has been enqueued successfully.
bool ggml_cuda_ar_allreduce(
ggml_cuda_ar_pipeline * pipeline,
ggml_backend_t * backends,
ggml_tensor ** tensors);
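
For orientation, a minimal usage sketch of this API (illustrative only: backends and tensors stand for two already-initialized CUDA backends and their matching tensor shards, which the sketch assumes exist; in-tree these calls are driven by the CUDA comm dispatcher in ggml-cuda.cu):

    // hypothetical caller -- not part of this change
    const int devices[2] = { 0, 1 };
    ggml_cuda_ar_pipeline * pipeline = ggml_cuda_ar_pipeline_init(devices, 2);
    if (pipeline != nullptr) {
        // tensors[0]/tensors[1]: same shape, contiguous F32/F16/BF16, one per backend
        if (!ggml_cuda_ar_allreduce(pipeline, backends, tensors)) {
            // false -> the caller falls back to the generic AllReduce
        }
    }
    ggml_cuda_ar_pipeline_free(pipeline); // safe to call with nullptr as well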

View File

@@ -2,6 +2,7 @@
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-cuda/allreduce.cuh"
#include "ggml-cuda/common.cuh"
#include "ggml-cuda/acc.cuh"
#include "ggml-cuda/add-id.cuh"
@@ -86,6 +87,9 @@
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define GGML_LOG_WARN_ONCE(str) \
{ static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails
@@ -1139,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
#ifdef GGML_USE_NCCL
// Communication context for multi-GPU AllReduce during tensor parallelism.
//
// Created once per meta backend instance. Resources for the selected mode
// (NCCL communicators or the internal AllReduce pipeline) are initialised
// eagerly during comm_init so any init failure surfaces at startup rather
// than mid-run.
struct ggml_backend_cuda_comm_context {
using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
std::vector<ggml_backend_t> backends;
std::vector<ncclComm_t> comms;
std::vector<int> dev_ids;
// Set by the init chain (comm_init_{nccl, internal, none}) to one of
// try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
// internal needs `ar_pipeline`, butterfly needs nothing. Per-call
// failures return false; the meta backend's generic implementation then
// handles that call.
try_allreduce_fn try_allreduce = nullptr;
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
#ifdef GGML_USE_NCCL
std::vector<ncclComm_t> comms;
#endif // GGML_USE_NCCL
~ggml_backend_cuda_comm_context() {
#ifdef GGML_USE_NCCL
for (ncclComm_t comm : comms) {
NCCL_CHECK(ncclCommDestroy(comm));
}
#endif // GGML_USE_NCCL
ggml_cuda_ar_pipeline_free(ar_pipeline);
}
};
#endif // GGML_USE_NCCL
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
#ifdef GGML_USE_NCCL
if (comm_ctx_v == nullptr) {
return;
}
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
delete comm_ctx;
#else
GGML_UNUSED(comm_ctx_v);
#endif // GGML_USE_NCCL
}
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
#ifdef GGML_USE_NCCL
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
return nullptr;
}
}
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
std::vector<int> dev_ids;
ret->backends.reserve(n_backends);
dev_ids.reserve(n_backends);
for (size_t i = 0; i < n_backends; i++) {
ret->backends.push_back(backends[i]);
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
dev_ids.push_back(cuda_ctx->device);
}
ret->comms.resize(n_backends);
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
return ret;
#else
// If NCCL is installed it is used by default for optimal performance.
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
// RCCL is disabled by default, users are explicitly opting in.
// Therefore print no warning for RCCL.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool warning_printed = false;
if (!warning_printed) {
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
warning_printed = true;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
GGML_UNUSED_VARS(backends, n_backends);
return nullptr;
#endif // GGML_USE_NCCL
}
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
#ifdef GGML_USE_NCCL
// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
// tensors (bandwidth-bound), then converts back to FP32.
static bool ggml_backend_cuda_comm_allreduce_nccl(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
@@ -1210,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
return true;
}
GGML_ASSERT(comm_ctx_v != nullptr);
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
const size_t n_backends = comm_ctx->backends.size();
for (size_t i = 0; i < n_backends; ++i) {
@@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
}
return true;
#else
GGML_UNUSED_VARS(comm_ctx_v, tensors);
return false;
}
#endif // GGML_USE_NCCL
// Run the internal AR pipeline. Returns false on unsupported / failed input
// -- the caller decides whether to abort (env-forced) or fall back silently.
static bool ggml_backend_cuda_comm_allreduce_internal(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
const size_t n_backends = comm_ctx->backends.size();
GGML_ASSERT(n_backends == 2);
GGML_ASSERT(tensors[0] != nullptr);
const int64_t ne = ggml_nelements(tensors[0]);
const ggml_type type = tensors[0]->type;
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
return false;
}
if (ne == 0) {
return true;
}
for (size_t i = 0; i < n_backends; ++i) {
if (tensors[i] == nullptr) {
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
return false;
}
if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
__func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
return false;
}
if (!ggml_is_contiguously_allocated(tensors[i])) {
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
__func__, i, ne, ggml_nbytes(tensors[i]),
(size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
return false;
}
if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
__func__, i, tensors[i]->data, (int) type, ne);
return false;
}
GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
}
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
}
// ---------------------------------------------------------------------------
// Per-call dispatch -- three variants, one per backend. Each is set as
// comm_ctx->try_allreduce by the matching init step. Per-call failure
// returns false; the meta backend's generic implementation handles that call.
// ---------------------------------------------------------------------------
#ifdef GGML_USE_NCCL
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
}
#endif // GGML_USE_NCCL
static bool ggml_backend_cuda_comm_try_allreduce_internal(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
}
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
return false;
}
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
if (comm_ctx_v == nullptr) {
return;
}
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
}
// ---------------------------------------------------------------------------
// Init -- chained nccl -> internal -> none. Each step tries to bring up its
// resource; on failure it warns and recurses into the next step.
// ---------------------------------------------------------------------------
static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
}
static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());
if (ret->ar_pipeline) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
return;
}
// Clear sticky CUDA error from the failed init.
(void) cudaGetLastError();
GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
"falling back to meta-backend butterfly\n");
ggml_backend_cuda_comm_init_none(ret);
}
static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
#ifdef GGML_USE_NCCL
const size_t n = ret->dev_ids.size();
ret->comms.resize(n);
ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data());
if (rc == ncclSuccess) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
return;
}
ret->comms.clear();
GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
ncclGetErrorString(rc));
#else // GGML_USE_NCCL
#ifndef GGML_USE_HIP
GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. "
"Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
#endif // !GGML_USE_HIP
#endif // GGML_USE_NCCL
ggml_backend_cuda_comm_init_internal(ret);
}
// Top-level init. Picks one of the three init paths based on
// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
// any fallback. Unrecognised env values warn and fall back to "none"
// (the meta-backend butterfly).
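// e.g. GGML_CUDA_ALLREDUCE=internal forces the internal pipeline even when
// NCCL is compiled in; =nccl and =none select the other two paths explicitly.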
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
return nullptr;
}
}
auto * ret = new ggml_backend_cuda_comm_context;
ret->backends.assign(backends, backends + n_backends);
ret->dev_ids.reserve(n_backends);
for (size_t i = 0; i < n_backends; i++) {
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
}
const char * env = getenv("GGML_CUDA_ALLREDUCE");
if (!env) {
// Platform default: Linux uses NCCL, otherwise (generally Windows) internal
#if defined(__linux__)
ggml_backend_cuda_comm_init_nccl(ret);
#else
ggml_backend_cuda_comm_init_internal(ret);
#endif // defined(__linux__)
} else {
std::string env_str(env);
if (env_str == "nccl") {
ggml_backend_cuda_comm_init_nccl(ret);
} else if (env_str == "internal") {
ggml_backend_cuda_comm_init_internal(ret);
} else if (env_str == "none") {
ggml_backend_cuda_comm_init_none(ret);
} else {
GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
ggml_backend_cuda_comm_init_none(ret);
}
}
return ret;
}
// Top-level dispatch -- calls the function pointer chosen by comm_init.
// Returns false to let the meta-backend's butterfly run.
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
if (comm_ctx_v == nullptr) {
return false;
}
auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
return comm_ctx->try_allreduce(comm_ctx, tensors);
}
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {

View File

@@ -1 +1 @@
-ac6f7b44f60fde0091f0b3d99afde48f8c99b13a
+628249b398293fc8d2fa81a449ae2920a02c6523

View File

@@ -1131,10 +1131,6 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
}
// for differentiating model types
uint32_t n_vocab = 0;
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
// for classifier models
ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
if (!classifier_labels.empty()) {

View File

@@ -503,6 +503,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
};
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
break;
case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE:
// Sarvam uses SPM-style BPE (same shape as Gemma4): spaces replaced with U+2581
// by the normalizer, BPE merges over the whole text on raw UTF-8.
regex_exprs = {
"[^\\n]+|[\\n]+",
};
byte_encode = false;
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -2005,6 +2013,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "gemma4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
escape_whitespaces = true;
} else if (
tokenizer_pre == "sarvam-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE;
escape_whitespaces = true;
clean_spaces = false;
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||

View File

@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
};
struct LLM_KV;

View File

@@ -1,7 +1,8 @@
#include "models.h"
void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
-const auto n_vocab = vocab.n_tokens();
+uint32_t n_vocab = 0;
+ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));

View File

@@ -1,7 +1,8 @@
#include "models.h"
void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
-const auto n_vocab = vocab.n_tokens();
+uint32_t n_vocab = 0;
+ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);