Compare commits

...

70 Commits

Author SHA1 Message Date
Georgi Gerganov
f49c636db0 llama-eval : protect dump() with lock for thread safety
Assisted-by: llama.cpp:local pi
2026-05-10 21:52:43 +03:00
Georgi Gerganov
d5165e8f2e llama-eval : require --grader-model or --model when using --grader-type llm
Assisted-by: llama.cpp:local pi
2026-05-10 21:49:58 +03:00
Georgi Gerganov
85c6aa006d llama-server-simulator : fix comment - Dice coefficient, not Levenshtein
Assisted-by: llama.cpp:local pi
2026-05-10 21:49:02 +03:00
Georgi Gerganov
e5ac6d1da6 llama-eval : track model name in eval state and verify on resume
- Store model_name in EvalState and JSON output
- Display model in HTML summary table
- Verify --model matches stored model when resuming

Assisted-by: llama.cpp:local pi
2026-05-10 21:43:35 +03:00
Georgi Gerganov
094554dbcc llama-eval : update README with PR link and quick-start examples
Assisted-by: llama.cpp:local pi
2026-05-10 21:22:48 +03:00
Georgi Gerganov
f64d56bcd8 llama-server-simulator : replace Flask with stdlib http.server
- Use HTTPServer + BaseHTTPRequestHandler instead of Flask
- RequestHandler handles POST /v1/chat/completions
- Server runs in daemon thread with clean Ctrl+C shutdown
- Remove flask and unused asdict imports

Assisted-by: llama.cpp:local pi
2026-05-10 20:47:08 +03:00
ggerganov
43f14a0a46 llama-eval : support multiple evaluation endpoints with dynamic task distribution
- Add ServerConfig dataclass (url, threads, name)
- Accept comma-separated --server, --threads, --server-name CLI args
- Dynamic shared-queue task distribution across servers (fast servers do more work)
- One ThreadPoolExecutor per server, workers pull from shared Queue
- Track which server processed each task (server_name in results)
- Thread-safe EvalState with threading.Lock for concurrent mutations
- Server column in HTML report and console output
- Backward compatible: single server works as before

Assisted-by: llama.cpp:local pi
2026-05-10 20:42:23 +03:00
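
A minimal sketch of the shared-queue distribution described above (fast servers naturally pull more tasks). The ServerConfig fields match the commit message; the worker/run_task names are illustrative, not the actual llama-eval.py code:

```python
import queue, threading
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass

@dataclass
class ServerConfig:
    url: str
    threads: int
    name: str

def run_eval(servers: list[ServerConfig], tasks: list[dict], run_task) -> list[dict]:
    todo: "queue.Queue[dict]" = queue.Queue()
    for t in tasks:
        todo.put(t)

    results: list[dict] = []
    lock = threading.Lock()  # protects shared state mutated from multiple workers

    def worker(server: ServerConfig) -> None:
        while True:
            try:
                task = todo.get_nowait()  # dynamic distribution: whichever worker is free pulls next
            except queue.Empty:
                return
            res = run_task(server.url, task)
            res["server_name"] = server.name  # track which server processed the task
            with lock:
                results.append(res)

    # one executor per server, sized by that server's --threads value
    executors = [ThreadPoolExecutor(max_workers=s.threads) for s in servers]
    futures = [ex.submit(worker, s) for s, ex in zip(servers, executors) for _ in range(s.threads)]
    for f in futures:
        f.result()
    for ex in executors:
        ex.shutdown()
    return results
```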
Georgi Gerganov
d26b1ffcc9 llama-eval : rename display, escaped, and count variables to use prefix convention
- _display suffix → display_ prefix (answer, tokens, tps, t_gen)
- _escaped suffix → escaped_ prefix (response, prompt, reasoning)
- _count suffix → n_ prefix (correct, incorrect, pending)

Assisted-by: llama.cpp:local pi
2026-05-10 19:24:29 +03:00
Georgi Gerganov
9f10d8d195 llama-eval : add per-task generation time from server timings
Extract predicted_ms from the server timings response and store it as
t_gen_ms per task. Display in seconds with one decimal digit in console
progress, print_all_tasks, and HTML report.

Assisted-by: llama.cpp:local pi
2026-05-10 19:15:34 +03:00
Georgi Gerganov
4d5dedc569 llama-eval : add per-task generation speed from server timings
Extract predicted_per_second from the server timings response and store
it as tps_gen per task. Display in console progress, print_all_tasks,
and HTML report.

Assisted-by: llama.cpp:local pi
2026-05-10 19:05:20 +03:00
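
A minimal sketch of extracting both values from the server response, covering the two commits above. The timings keys are the ones llama-server reports; the task dict keys (t_gen_ms, tps_gen) just mirror the names in the commit messages:

```python
def record_timings(task: dict, response: dict) -> None:
    # llama-server attaches a "timings" object to its completion responses
    timings = response.get("timings", {})
    task["t_gen_ms"] = timings.get("predicted_ms")          # per-task generation time
    task["tps_gen"]  = timings.get("predicted_per_second")  # per-task generation speed
    if task["t_gen_ms"] is not None and task["tps_gen"] is not None:
        print(f"gen time: {task['t_gen_ms'] / 1000.0:.1f} s, speed: {task['tps_gen']:.1f} t/s")
```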
Georgi Gerganov
81a65cf035 eval : add Wilson score confidence interval to results
Compute 95% CI on-the-fly from completed cases. Displayed in
terminal output, HTML report, and JSON state.
2026-05-10 18:46:36 +03:00
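
For reference, a minimal sketch of the 95% Wilson score interval computed from the completed cases (standard formula; the function name is illustrative, not the one used in llama-eval.py):

```python
import math

def wilson_ci(n_correct: int, n_total: int, z: float = 1.96) -> tuple[float, float]:
    # 95% Wilson score interval for a binomial proportion (z = 1.96)
    if n_total == 0:
        return 0.0, 1.0
    p = n_correct / n_total
    denom  = 1.0 + z * z / n_total
    center = (p + z * z / (2 * n_total)) / denom
    margin = (z / denom) * math.sqrt(p * (1.0 - p) / n_total + z * z / (4 * n_total * n_total))
    return max(0.0, center - margin), min(1.0, center + margin)

# e.g. 42 correct out of 60 cases -> roughly (0.57, 0.80)
```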
Georgi Gerganov
7d433f767b eval : unify "judge" terminology to "grader"
Replace all occurrences of "judge" with "grader" for consistency
across the codebase (CLI args, Grader class fields, help text).

Assisted-by: llama.cpp:local pi
2026-05-10 18:23:28 +03:00
Georgi Gerganov
633a68d6c2 remove junk 2026-05-10 18:13:50 +03:00
Georgi Gerganov
e0a2cf48ca track total time 2026-05-10 18:13:50 +03:00
Georgi Gerganov
bad9565a1e refactor 2026-05-10 18:13:50 +03:00
Georgi Gerganov
752b703a5e reasoning and error handling 2026-05-10 18:13:50 +03:00
Georgi Gerganov
fc571f3a1e add tokens 2026-05-10 18:13:50 +03:00
Georgi Gerganov
6797d80dff store full response 2026-05-10 18:13:50 +03:00
Georgi Gerganov
3649793811 add html 2026-05-10 18:13:50 +03:00
Georgi Gerganov
7e8c88c5e0 fix prompts 2026-05-10 18:13:49 +03:00
Georgi Gerganov
2e0b6766f3 simplify 2026-05-10 18:13:49 +03:00
Georgi Gerganov
f95f4dd1ca fix counts 2026-05-10 18:13:49 +03:00
Georgi Gerganov
095c8ab655 cleanup 2026-05-10 18:13:49 +03:00
Georgi Gerganov
d830acacc5 resume eval 2026-05-10 18:13:49 +03:00
Georgi Gerganov
f35b10f0a9 ignore errors 2026-05-10 18:13:49 +03:00
Georgi Gerganov
802d85e26e add AGENTS.md 2026-05-10 18:13:49 +03:00
Georgi Gerganov
91bd92c6b6 cleanup 2026-05-10 18:13:48 +03:00
Georgi Gerganov
f20b5a72cf datasets : fix aime2025 2026-05-10 18:13:48 +03:00
Georgi Gerganov
122dfe3eab grade : improve regex + logs 2026-05-10 18:13:48 +03:00
Georgi Gerganov
8b94ab4f4a grader : update prompt 2026-05-10 18:13:48 +03:00
Georgi Gerganov
f99d77f3bd datasets : add aime2025 2026-05-10 18:13:48 +03:00
Georgi Gerganov
55a7cf4a06 cont 2026-05-10 18:13:48 +03:00
Georgi Gerganov
6e7e1a5a63 grader : improve example answers 2026-05-10 18:13:48 +03:00
Georgi Gerganov
9f02fa6382 rename 2026-05-10 18:13:47 +03:00
Georgi Gerganov
e7b8646098 add gpqa + sampling + docs 2026-05-10 18:13:47 +03:00
Georgi Gerganov
55ce1b4e2f datasets : add gsm8k 2026-05-10 18:13:47 +03:00
Georgi Gerganov
abec77e068 remove old files 2026-05-10 18:13:47 +03:00
Georgi Gerganov
65e3c5a928 docs 2026-05-10 18:13:47 +03:00
Georgi Gerganov
4f176f6a4d improve grader 2026-05-10 18:13:47 +03:00
Georgi Gerganov
9578e83ac2 minor 2026-05-10 18:13:47 +03:00
Georgi Gerganov
530f38f9c3 eval : support multiple dataset runs 2026-05-10 18:13:46 +03:00
Georgi Gerganov
cda8cae01a sim : fix answer matching 2026-05-10 18:13:46 +03:00
Georgi Gerganov
64720e1e01 test : fix path 2026-05-10 18:13:46 +03:00
Georgi Gerganov
1a780f7c44 eval : add prompts 2026-05-10 18:13:46 +03:00
Georgi Gerganov
940364e4c9 eval : print progress 2026-05-10 18:13:46 +03:00
Georgi Gerganov
ee9b715eb6 examples: add task summary table to llama-eval-new.py 2026-05-10 18:13:46 +03:00
Georgi Gerganov
d639ee52ea docs: update llama-eval-discussion.md with threading and model parameter updates
- Add threading support implementation details
- Document ThreadPoolExecutor usage and thread safety
- Add model parameter implementation details
- Include testing results for both features
2026-05-10 18:13:46 +03:00
Georgi Gerganov
fb40d1a04a examples: add threading support and model parameter to llama-eval-new.py
- Add ThreadPoolExecutor for parallel request processing controlled by --threads
- Add --model argument to specify model name in request data
- Refactor process() to use thread-safe _process_single_case() method
- Update progress tracking to work with concurrent execution
2026-05-10 18:13:45 +03:00
Georgi Gerganov
2fe445cc60 docs: update llama-eval-discussion.md with session work summary 2026-05-10 18:13:45 +03:00
Georgi Gerganov
3732aea2df examples: use cached dataset path in simulator to avoid HF Hub requests 2026-05-10 18:13:45 +03:00
Georgi Gerganov
edc766c919 examples: use cached dataset path to avoid HF Hub requests 2026-05-10 18:13:45 +03:00
Georgi Gerganov
d7d2c22909 examples: remove HF_HUB_OFFLINE to allow dataset download 2026-05-10 18:13:45 +03:00
Georgi Gerganov
30ea5124de examples: use HF_HUB_OFFLINE to avoid HF Hub warnings 2026-05-10 18:13:45 +03:00
Georgi Gerganov
0ca458d892 examples: implement flexible grader system for answer validation
- Add Grader class supporting regex and CLI-based grading
- Implement built-in regex patterns for AIME, GSM8K, MMLU, HellaSwag, ARC, WinoGrande
- Add CLI grader interface: python script.py --answer <pred> --expected <gold>
- Add HF telemetry disable to avoid warnings
- Support exact match requirement for regex patterns
- Add 30-second timeout for CLI grader
- Handle both boxed and plain text formats for AIME answers
2026-05-10 18:13:45 +03:00
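
A minimal sketch of the two grading modes described above. The boxed-answer regex and the exit-code convention of the CLI grader are illustrative assumptions, not the exact patterns shipped in the tool:

```python
import re, subprocess

def grade_regex(response: str, expected: str) -> bool:
    # AIME-style: prefer a \boxed{...} answer, otherwise fall back to the last integer
    m = re.search(r"\\boxed\{([^}]*)\}", response)
    pred = m.group(1).strip() if m else (re.findall(r"-?\d+", response) or [""])[-1]
    return pred == expected.strip()  # exact match requirement

def grade_cli(grader_cmd: list[str], pred: str, expected: str) -> bool:
    # external grader: python script.py --answer <pred> --expected <gold>
    # (assumes exit code 0 means correct; 30-second timeout as in the commit message)
    proc = subprocess.run(grader_cmd + ["--answer", pred, "--expected", expected], timeout=30)
    return proc.returncode == 0
```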
Georgi Gerganov
de8eda468b docs: remove README.md from llama-eval 2026-05-10 18:13:44 +03:00
Georgi Gerganov
a2b96e0444 examples: add simplified llama-eval-new.py for AIME evaluation
- Create new simplified evaluation script focused only on AIME
- Implement EvalState and Processor dataclasses for structured state management
- Add real-time feedback showing correct/incorrect status per case
- Abstract grading interface for external grader support
- Use structured JSON output for eval state
- Apply HuggingFace dataset caching to avoid repeated downloads
- Remove Levenshtein matching - eval script only sends requests and validates answers
2026-05-10 18:13:44 +03:00
Georgi Gerganov
deed078654 docs: update llama-eval-discussion.md with session work summary
Add summary of llama-server-simulator implementation work including
features, testing results, technical decisions, and refactoring.
2026-05-10 18:13:44 +03:00
Georgi Gerganov
05b8425bd6 examples: refactor test-simulator.sh for better readability
Extract repeating question string into TEST_QUESTION variable and
create make_request() helper function to reduce code duplication.
Add proper error handling for error responses.
2026-05-10 18:13:44 +03:00
Georgi Gerganov
58bd57ba99 examples: add llama-server simulator for testing eval scripts
Add a standalone Python script that simulates a llama-server HTTP endpoint
for testing the eval script. The simulator:

- Implements /v1/chat/completions endpoint with OpenAI-compatible format
- Loads AIME dataset from HuggingFace with local caching
- Uses Levenshtein distance for intelligent question matching
- Supports configurable success rate for correct/wrong answer generation
- Provides debug logging for troubleshooting

Also includes test scripts and documentation for testing and understanding
the simulator functionality.
2026-05-10 18:13:44 +03:00
gatbontonpc
5cbe95b6e5 add checkpointing 2026-05-10 18:13:44 +03:00
gatbontonpc
c7f3ce25f5 Add readme 2026-05-10 18:13:44 +03:00
gatbontonpc
4db4497ca7 multi source llama-eval 2026-05-10 18:13:43 +03:00
gatbontonpc
db8b09d6e8 working llama-eval mc and math suite 2026-05-10 18:13:42 +03:00
Georgi Gerganov
0b047287fe sync : ggml 2026-05-10 17:00:11 +03:00
Georgi Gerganov
efbada936f ggml : bump version to 0.11.1 (ggml/1484) 2026-05-10 17:00:11 +03:00
scutler-nv
f3c3e0e9a0 internal AllReduce kernel for CUDA provider (#22299)
* ggml-cuda: add internal AllReduce provider for tensor parallelism

Introduces a NCCL-free AllReduce implementation for LLAMA_SPLIT_MODE_TENSOR
using a single-phase CUDA kernel that pipelines D2H copy, cross-GPU
handshake via pinned-memory volatile flags, and the reduction in one
kernel launch per GPU.

New files:
- ggml/src/ggml-cuda/comm.cuh        — ggml_cuda_allreduce_provider enum
- ggml/src/ggml-cuda/allreduce.cuh   — pipeline API declarations
- ggml/src/ggml-cuda/allreduce.cu    — kernel + pipeline init/dispatch

ggml-cuda.cu changes:
- ggml_backend_cuda_comm_context gains ar_pipeline field
- Provider selection via GGML_CUDA_ALLREDUCE env var ("nccl" / "internal")
- INTERNAL provider initialises the pipeline at comm_init time
- Dispatch routes to ggml_cuda_ar_allreduce(); falls back to meta-backend
  CPU reduce for unsupported sizes or GPU counts (> 2)

Current scope: 2 GPUs, FP32, tensors <= 256 KB. Notes in NOTES-allreduce.md.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* llama-bench: add --allreduce flag to select AllReduce provider

Adds --allreduce <auto|nccl|internal> to llama-bench (and via the shared
field pattern, consistent with other multi-value flags).  Useful for
isolating hangs or regressions in tensor-parallel mode: pass --allreduce nccl
to force NCCL and bypass the internal provider.

Also fixes ggml_cuda_select_allreduce_provider() to treat an empty
GGML_CUDA_ALLREDUCE env var the same as unset (avoids spurious warning when
llama-bench sets it to "" for the "auto" case).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* llama-bench: rename --allreduce to --reduction-provider / -rp

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* llama-bench: pass WARN/ERROR log messages through in non-verbose mode

The null log callback was silently dropping all messages. WARN and ERROR
should always be visible since they indicate legitimate issues (e.g. a
requested reduction provider not being available).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* cmake: improve NCCL detection for source-tree builds, add static/dynamic switch

FindNCCL.cmake now searches the cmake source-build layout used by the Windows
NCCL port (cmake/lib/Release for static, cmake/src/Release for dynamic import
lib) and also checks src/include for the generated nccl.h header.

New option GGML_CUDA_NCCL_STATIC (default OFF) selects static vs dynamic
linking and controls which paths and library names are searched.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: add AllReduce hang watchdog (GGML_CUDA_AR_WATCHDOG)

When compiled with -DGGML_CUDA_AR_WATCHDOG=ON, uses a debug kernel
variant that writes per-GPU spin diagnostics to pinned host memory.
A host-side blocking poll (cudaEventQuery + volatile reads) detects
hangs and logs WARN with the last observed arrival counters and spin
counts, controlled by GGML_CUDA_AR_WATCHDOG (ms timeout) and
GGML_CUDA_AR_MAX_SPIN (kernel bailout) env vars at runtime.

Zero overhead on the production path — all debug code is behind #ifdef.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: fix intermittent AllReduce hang on Blackwell PCIe

Add __threadfence_system() before the arrival signal write in
signal_set to ensure D2H data is globally visible before the peer
observes the arrival flag.  Without this fence, the peer could enter
Phase 3 host reads before the data had fully landed, causing an
intermittent deadlock on RTX 5090 (Blackwell, PCIe-only).

Also redesign the watchdog from a blocking dispatch-thread poll to a
non-blocking background thread, eliminating the ~20ms per-slot
latency the old design added.

Verified: 30/30 soak test runs clean at ~50 t/s (previously ~1-in-15
hang rate).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: fix watchdog shutdown ordering and pipeline_free drain

- Stop watchdog thread BEFORE destroying GPU resources (events, streams)
  to prevent polling destroyed handles → spurious "busy" readings
- Add cudaStreamSynchronize in pipeline_free to drain in-flight kernels
  before freeing pinned host buffers they may still be reading
- Sleep-first watchdog polling: no +0ms noise, only logs when a kernel
  is genuinely stuck past the poll interval
- Check wdog_stop in both outer and inner loops so join() returns
  promptly instead of draining the entire queue
- Add Phase 3 breadcrumbs to debug[3] for hang localization

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: replace event-based watchdog with per-GPU ring buffer

Completely rework the GGML_CUDA_AR_WATCHDOG system:

- Replace the shared debug_buf + event-polling + queue design with
  per-GPU ring buffers in pinned host memory
- Kernel writes a debug record only on spin-limit bailout: claims a
  ring slot via atomicAdd (single-GPU host atomics work on RTX 5090),
  writes fields, fences, sets completion flag, then all threads exit
- Watchdog thread simply polls ring head counters every 1ms and prints
  any new complete records — no CUDA event queries, no mutex, no queue
- Zero overhead on the dispatch path (no queue posting, no memset)
- Watchdog shutdown returns within ~1ms (atomic bool, no drain)
- On bailout the kernel skips Phase 3 entirely and exits cleanly

Verified: 20/20 prefill soak test clean at ~1112 t/s, no hangs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: normalize line endings to LF (undo Windows CRLF conversion)

Five files were inadvertently converted to CRLF by the Windows
development environment, causing every line to show as changed in
diffs against master.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* .gitattributes: force LF line endings to prevent Windows CRLF conversion

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ggml-cuda: move GGML_CUDA_AR_WATCHDOG from CMake option to local define

The watchdog is development-only; a global CMake option is overkill.
Move the toggle to a #define at the top of allreduce.cu (set to 0 by
default) and remove the option from ggml/CMakeLists.txt and the CUDA
CMakeLists.txt add_compile_definitions block.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* unify kernel debug paths

* use __threadfence_system explicitly (not in ggml_cuda_ar_signal_set)

* preferentially use internal reduction for <=2 GPUs

* templatize the main kernel to support fp16/bf16

* restore llama-bench.cpp changes

* revert CMakeLists changes

* remove notes from repo

* remove dead warmup code

* fix comments

* improve reduction provider fallback code

* add messages for allreduce fallback

* rework reduction provider init to not call ncclCommInitAll if using the internal provider

* fix case where a given tensor has not been computed

* add chunked mode to the kernel for unlimited vector size

* rework a few checks/fallbacks

* various small cleanups

* allow disabling CUDA reductions completely (falling back to the non-CUDA butterfly mode)

* simplify reduction provider selection

* minor simplifications

* more cleanups/fixes

* prototype alternate path for large reductions

* chunked version of large reduction path

* use bf16 for large reductions

* experimental reduction using cudaMemcpyPeerAsync (slightly slower)

* revert experimental change

* add combined conversion/reduction kernel

* add bf16 wire format for single kernel mode

* experimental on-stream small reduction kernel

* double buffer arrival slots, use token (incrementing) method

* double buffer host_buf for small reductions

* put in waits for use of host_mem in large reduction case (prevents stomping on in-use memory)

* remove watchdog code

* various cleanups / dead code removal

* fix fp16 mode

* fix some comments/logging statements

* use increasing token scheme for arrival signals

* add top-level comment to allreduce.cu

* improve top-level comment in allreduce.cu

* fix comments in ggml_cuda_ar_kernel

* improve event handling for hostmem buffer usage tracking

* change ev_pool to fixed 2D array

* add chunked memcpy fallback for extra-large reductions (>32 MB)

* change thresholds for copy-engine path and bf16 demotion

* multi-block kernel test

* more fine-tuning for chunk size, etc.

* various fixes for PR review

* more PR fixes

* fix semantics of all host mappings

* require ampere+

* small cleanups

* properly use host pointer for src/dst in cudaMemcpy calls

* allreduce: lazy-init the internal pipeline on first use

A config that lives entirely on NCCL never needs the chunked-kernel
pipeline (host_buf, host_large, dev_tmp, streams, events, arrival ring).
Defer pipeline creation to the first try_allreduce_internal call using the
same std::call_once pattern as ensure_nccl, so those resources stay
unallocated when only NCCL is in use.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: assert n_backends == 2 instead of soft-fallback

ar_pipeline_init already requires n_devices == 2 and bails before any AR can
get here, so by the time we reach try_allreduce_internal we know we have
exactly two backends.  Replace the runtime-debug-log fallback with a hard
assert.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* rework reduction provider selection. internal/nccl is OS dependent; most fallbacks are removed

* remove unneeded Turing arch check (llama.cpp doesn't even compile pre-Turing anyway)

* allreduce: ASCII-only comments and ggml_cuda_cast for value conversions

Replace non-ASCII characters in comments (em dashes, right arrows) with
ASCII equivalents (--, ->) so the source stays in the ggml/upstream norm.

In the kernel-side code, replace static_cast<Twire>/static_cast<Tdst>
with ggml_cuda_cast<...> so the BF16 conversions go through the fast
__float2bfloat16 / __bfloat162float intrinsics from convert.cuh.  Pure
pointer and integer casts stay as static_cast.

Also drops two stray garbage tokens that snuck in from earlier merges
(a duplicated 'return ok; }' tail in allreduce.cu and a leftover '_reg)'
fragment in ggml-cuda.cu).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: use ggml_cuda_memcpy_1 for the chunked-kernel vector copies

The chunked kernel's two 16-byte register<->host transfers (Phase 1 store
and Phase 3 load) used reinterpret_cast<float4 *> on both sides.  Replace
with ggml_cuda_memcpy_1<sizeof(wire)>, which is the canonical helper for
this pattern and emits the same int4 LD/ST under the hood.

Conformance passes; 5x reruns of 70b internal pp512 show 1832-1836 t/s,
matching the prior matrix value of 1831 t/s -- no perf change as expected.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: assert cuda_ctx->device matches the pipeline's device

Both ggml_cuda_ar_pipeline and ggml_backend_cuda_context carry the device
they were created for; if they ever disagree, every cuda call that follows
runs on the wrong device.  Add GGML_ASSERT at each cuda_ctx retrieval site
in the AR path so the misuse fails fast rather than silently corrupting.

Also: rename __nv_bfloat16 -> nv_bfloat16 (typedef alias) for consistency
with the rest of the file, and tighten one cudaGetLastError check to fire
only after the to_bf16 call that can actually fail.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: expand one-liner for loops to braced bodies

Code-style preference -- match the rest of the file by writing every for
loop with the body on its own braced line.  Three sites in the copy-engine
typed dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: rename template parameters Tdst/Twire/Tsrc -> T_dst/T_wire/T_src

Code-style preference per PR review -- T_dst/T_wire/T_src is more
consistent with surrounding code.  Whole-word rename across all 58 sites
in allreduce.cu (kernel definitions, internal uses, and comment text).

Realigned the parameter columns in three function signatures whose
T_src/T_dst lines shifted by 1 char relative to their non-templated
neighbors.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: drop hyphen in 'chunked-kernel' across comments

Per PR review feedback -- 'chunked kernel' (no hyphen) reads more naturally
in running prose, especially for ESL readers.  Pure comment-only change;
all 10 occurrences in allreduce.cu updated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: use ggml_cuda_get_max_cpy_bytes() instead of hardcoded 16

The chunked kernel hardcoded a 16-byte vector unit; replace with the
ggml_cuda_get_max_cpy_bytes() helper that fattn-common.cuh uses for the
same purpose, so ELEMS_PER_VEC self-adjusts to the arch's widest
single-instruction copy.

Perf-neutral on supported targets (Volta+ returns 16).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* ggml-cuda: PR review fixes -- annotate #endif, fix stale comment, assert nbytes alignment

Three separate but minor changes from PR #22299 review feedback:

1. Annotate the five GGML_USE_NCCL #endif lines with the matching condition
   so the pairing is visible without scrolling back.

2. The comment block on ggml_backend_cuda_comm_context claimed NCCL is
   lazy-initialised; that was true at one point but the dispatch refactor
   (727b141c0) made both NCCL and the internal pipeline eager.  Rewrite
   the comment to match current behaviour.

3. Assert in ggml_backend_cuda_comm_allreduce_internal that the tensor's
   byte size is a 16-byte multiple.  The chunked-kernel issues full-width
   vector loads/stores, so this is a precondition; tensor-parallel splits
   of hidden-dim-multiples satisfy it trivially, but a hard assert turns
   any caller-side bug into a clear failure rather than UB.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
 device's new AR
records its ev.ker -- otherwise the second device's wait sees the first
device's just-recorded event (the in-flight new AR) and creates a circular
dependency with the in-kernel peer signal.  Two-pass dispatch (all waits,
then all launches) avoids this.

Bump POOL_SIZE 2 -> 8 (small memory cost, more breathing room for the
GPU's view of the event chain) and add a runtime env override for the
hybrid kernel chunk size (GGML_CUDA_AR_HYBRID_CHUNK_BYTES) for tuning.
One-shot stderr diagnostic at first AR prints the chosen path + sizing.

Result on 2x RTX 5090 Linux, 70b ub_sweep:

    ub=64   (1 MB AR): 913 -> 1036 t/s  (+13.5% vs old, +1.8% vs NCCL)
    ub=128  (2 MB AR): 1056 -> 1181     (+11.9%, +3.7% vs NCCL)
    ub=256  (4 MB AR): 1212 -> 1424     (+17.5%, +3.5% vs NCCL)

Internal now beats NCCL at every size (+1.8% to +15.6%), recovering all
ground in the 1-4 MB regime that was previously a 10-12% loss.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* simplify the init logic

* address some other PR requests

* ggml-cuda: stub internal AllReduce on HIP/MUSA, drop pre-Ampere mention, gate NCCL fallback warning on !HIP

The internal AllReduce relies on cudaHostAllocPortable/Mapped,
cudaHostGetDevicePointer, and __nanosleep -- none of which the HIP or
MUSA shims expose -- so wrap the implementation in
!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) and provide
nullptr/no-op/false stubs in the #else branch.  The dispatcher already
treats a null pipeline as init failure and silently falls back to the
meta backend's generic AllReduce, so HIP/MUSA builds compile clean and
behave correctly without further call-site changes.

PR review follow-ups:
 - drop "or pre-Ampere?" from the internal-init failure warning -- the
   kernel doesn't require Ampere or newer.
 - guard the "NCCL not compiled in" fallback warning behind
   !defined(GGML_USE_HIP); the suggestion to install NCCL only makes
   sense on NVIDIA builds.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: guard __nanosleep on Volta+ and reject pre-Volta devices at init

__nanosleep is the only Volta-specific intrinsic in the kernel; wrap it
in #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA / NO_DEVICE_CODE so the file
still compiles cleanly when targeting older arches (the dispatcher's
init check below ensures the kernel is never actually launched on
pre-Volta).

Add a per-device compute-capability check in pipeline_init that returns
nullptr if any device is below sm70.  The dispatcher already treats
nullptr as init failure and silently falls back to the meta backend's
generic AllReduce.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* allreduce: fix CI -Werror warnings (sign-compare, format, restrict alias, maybe-uninitialized)

The CUDA CI builds with -Werror -Wsign-compare -Wformat -Wrestrict
-Wmaybe-uninitialized.  Address each:

 - n_devices is size_t; change `int i; i < n_devices` to size_t in the
   three init loops, and the matching GGML_LOG_INFO format from %d to %zu.
 - ggml_cuda_ar_kernel was launched with sendbuf == recvbuf (in-place
   reduction), so the __restrict__ qualifiers on those parameters were
   technically UB.  Drop __restrict__ from sendbuf and recvbuf; an A/B
   sweep showed <0.6% perf delta (within noise) on Linux.
 - The buf/src/dst pointer arrays in ggml_cuda_ar_allreduce and the
   per-iteration arrays in ggml_cuda_ar_allreduce_copy_outer were
   declared with size GGML_CUDA_MAX_DEVICES but the loop only writes
   indices [0, n_devices); zero-initialise so the compiler sees the
   tail elements as defined.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* ggml-cuda: drop unused-function warning by guarding try_allreduce_nccl behind GGML_USE_NCCL

The only call site (in init_nccl) is already inside #ifdef GGML_USE_NCCL,
so the function is unreferenced in non-NCCL builds and trips
nvcc's -Werror=unused-function check.  Move the guard from inside the
function body to around the entire definition.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-10 11:05:22 +02:00
Sigbjørn Skjæret
5755a100cd model : fix model type check for granite/llama3 and deepseek2/glm4.7 lite (#22870) 2026-05-10 08:44:29 +02:00
Sumit Chatterjee
1e5ad35d56 model : add sarvam_moe architecture support (#20275) 2026-05-09 16:31:50 +02:00
Yuannan
65d7a8bbf0 devops : updated Nix systems (#22869) 2026-05-09 17:15:03 +03:00
Davi Henrique Linhares
00d56b11c3 docker : upgraded the default intel compute-runtime version (#22567) 2026-05-09 10:22:23 +02:00
20 changed files with 3122 additions and 130 deletions

View File

@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \
FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGC_VERSION=v2.32.7
ARG IGC_VERSION_FULL=2_2.32.7+21184
ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
ARG IGDGMM_VERSION=22.9.0
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \

View File

@@ -103,6 +103,7 @@ let
vulkan-headers
vulkan-loader
shaderc
spirv-headers
];
in
@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
ninja
pkg-config
git
spirv-headers
]
++ optionals useCuda [
cudaPackages.cuda_nvcc

1
.gitignore vendored
View File

@@ -110,6 +110,7 @@ uv.lock
# Nix
flake.lock
/result
# Test binaries

View File

@@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
res = "f2llmv2"
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
# ref: https://huggingface.co/sarvamai/sarvam-30b
res = "sarvam-moe"
if res is None:
logger.warning("\n")
@@ -11591,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
class SarvamMoEModel(BailingMoeV2Model):
model_arch = gguf.MODEL_ARCH.BAILINGMOE2
# Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
# - full rotary (no partial_rotary_factor)
# - expert bias is zero-mean normalized at load time
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
# Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
self.gguf_writer.add_rope_dimension_count(rope_dim)
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
if name.endswith(".expert_bias"):
# Sarvam normalizes expert bias to zero mean
inner = gen
def gen():
t = inner()
return t - t.mean()
return super().filter_tensors((name, gen))
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GROVEMOE

View File

@@ -155,6 +155,7 @@ models = [
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View File

@@ -0,0 +1,26 @@
# llama-eval
Simple evaluation tool for llama.cpp with support for multiple datasets.
For a full description, usage examples, and sample results, see:
- [PR 21152](https://github.com/ggml-org/llama.cpp/pull/21152)
## Quick start
```bash
# Single server
python3 llama-eval.py \
--server http://localhost:8033 \
--model my-model \
--dataset gsm8k --n_cases 100 \
--grader-type regex --threads 32
# Multiple servers (comma-separated URLs and thread counts)
python3 llama-eval.py \
--server http://gpu1:8033,http://gpu2:8033 \
--server-name gpu1,gpu2 \
--threads 16,16 \
--dataset aime2025 --n_cases 240 \
--grader-type regex
```

1428
examples/llama-eval/llama-eval.py Executable file

File diff suppressed because it is too large

View File

@@ -0,0 +1,317 @@
#!/usr/bin/env python3
import argparse
import json
import random
import re
import time
import sys
import os
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from typing import Dict, List, Optional
from dataclasses import dataclass
from pathlib import Path
import datasets
# Set cache directory for HuggingFace datasets
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
cache_dir.mkdir(parents=True, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
def dice(s1: str, s2: str) -> float:
"""Calculate Dice coefficient between two strings based on bigram overlap."""
if not s1 and not s2:
return 1.0
def _bigrams(s: str):
return [s[i : i + 2] for i in range(len(s) - 1)]
bigrams1 = _bigrams(s1)
bigrams2 = _bigrams(s2)
if not bigrams1 and not bigrams2:
return 1.0
from collections import Counter
freq1 = Counter(bigrams1)
freq2 = Counter(bigrams2)
intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
return dice_coeff
def debug_log(message: str):
"""Log debug messages to both stdout and a file"""
print(message, file=sys.stderr)
with open("/tmp/simulator-debug.log", "a") as f:
f.write(message + "\n")
simulator: Optional["Simulator"] = None
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict]
sampling_config: Dict
def normalize_number(s: str) -> Optional[int]:
match = re.match(r"\d+", s) # match digits from the start
if not match:
return None
return int(match.group(0))
class AimeDataset:
def __init__(self, split: str = "train"):
self.split = split
self.questions: List[Dict] = []
self._load_dataset()
def _load_dataset(self):
print(f"Loading AIME dataset (split: {self.split})...")
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
else:
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
self.questions = list(ds)
print(f"AIME dataset loaded: {len(self.questions)} questions")
def find_question(self, request_text: str) -> Optional[Dict]:
best_match = None
best_distance = -1
best_index = -1
for i, question in enumerate(self.questions):
question_text = question["problem"]
request_lower = request_text.lower()
question_lower = question_text.lower()
# Exact match
if question_lower == request_lower:
debug_log(f"DEBUG: Found exact match at index {i}")
return question
# Remove LaTeX formatting for more flexible matching
question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
if question_no_latex.lower() == request_lower:
debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
return question
# Calculate Dice coefficient for partial matches
# Only consider if request is at least 50% of question length
if len(request_lower) >= len(question_lower) * 0.5:
distance = dice(question_lower, request_lower)
if distance > best_distance:
best_distance = distance
best_match = question
best_index = i
if best_match and best_distance > 0.3: # Threshold for partial match
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
return best_match
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
return None
def get_answer(self, question: Dict) -> str:
answer = question["answer"]
if isinstance(answer, str):
normalized = normalize_number(answer)
return str(normalized) if normalized is not None else answer
return str(answer)
class Simulator:
def __init__(
self,
port: int = 8033,
host: str = "localhost",
success_rate: float = 0.8,
dataset_split: str = "train"
):
self.port = port
self.host = host
self.success_rate = success_rate
self.dataset = AimeDataset(dataset_split)
self.eval_state = EvalState(
id="aime-2025",
tasks=["aime"],
task_states={},
sampling_config={"temperature": 0, "max_tokens": 2048}
)
def _generate_response(
self,
question: Dict,
should_be_correct: bool
) -> Dict:
expected_answer = self.dataset.get_answer(question)
if should_be_correct:
response_text = expected_answer
else:
response_text = self._generate_wrong_answer(question)
return {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": "llama",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
}
}
def _generate_wrong_answer(self, question: Dict) -> str:
expected_answer = self.dataset.get_answer(question)
if expected_answer.isdigit():
wrong_answer = str(int(expected_answer) + 1)
else:
wrong_answer = expected_answer + " (wrong)"
return wrong_answer
def _process_request(self, request_data: Dict) -> Dict:
messages = request_data.get("messages", [])
if not messages:
return {"error": "No messages in request"}
request_text = messages[0].get("content", "")
debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
question = self.dataset.find_question(request_text)
if not question:
debug_log(f"DEBUG: find_question returned None")
return {"error": "No matching question found"}
should_be_correct = random.random() < self.success_rate
response = self._generate_response(question, should_be_correct)
task_id = "aime"
self.eval_state.task_states[task_id] = {
"correct": should_be_correct,
"expected": self.dataset.get_answer(question),
"predicted": response["choices"][0]["message"]["content"]
}
return response
class RequestHandler(BaseHTTPRequestHandler):
def do_POST(self):
if self.path != "/v1/chat/completions":
self._send_json({"error": "Not found"}, 404)
return
try:
content_length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(content_length)
request_data = json.loads(body) if body else None
if not request_data:
self._send_json({"error": "Invalid JSON"}, 400)
return
if simulator is None:
self._send_json({"error": "Simulator not initialized"}, 500)
return
response = simulator._process_request(request_data)
self._send_json(response, 200)
except json.JSONDecodeError:
self._send_json({"error": "Invalid JSON"}, 400)
except Exception as e:
print(f"Error processing request: {e}")
self._send_json({"error": str(e)}, 500)
def _send_json(self, data: dict, status: int = 200):
body = json.dumps(data).encode("utf-8")
self.send_response(status)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", str(len(body)))
self.end_headers()
self.wfile.write(body)
def log_message(self, format, *args):
# Suppress default request logging
pass
def main():
parser = argparse.ArgumentParser(
description="llama-server simulator for testing eval scripts"
)
parser.add_argument(
"--port",
type=int,
default=8033,
help="Server port (default: 8033)"
)
parser.add_argument(
"--host",
type=str,
default="localhost",
help="Server host (default: localhost)"
)
parser.add_argument(
"--success-rate",
type=float,
default=0.8,
help="Success rate 0-1 (default: 0.8)"
)
parser.add_argument(
"--dataset-split",
type=str,
default="train",
help="AIME dataset split to use (default: train)"
)
args = parser.parse_args()
global simulator
simulator = Simulator(
port=args.port,
host=args.host,
success_rate=args.success_rate,
dataset_split=args.dataset_split
)
server = HTTPServer((args.host, args.port), RequestHandler)
server_thread = threading.Thread(target=server.serve_forever, daemon=True)
server_thread.start()
print("\n=== llama-server-simulator ===")
print(f"Server running on http://{args.host}:{args.port}")
print(f"Success rate: {args.success_rate}")
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
print("\nPress Ctrl+C to stop\n")
try:
server_thread.join()
except KeyboardInterrupt:
print("\nShutting down...")
server.shutdown()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/bin/bash
set -e
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "=== llama-server-simulator Test Script ==="
echo ""
PORT=8033
SUCCESS_RATE=0.8
TEST_PORT=8034
echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
source "$SCRIPT_DIR/venv/bin/activate"
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
SIMULATOR_PID=$!
echo "Waiting for simulator to start..."
sleep 5
# Helper function to make a request and extract the answer
make_request() {
local question="$1"
curl -s -X POST http://localhost:$PORT/v1/chat/completions \
-H "Content-Type: application/json" \
-d "{
\"model\": \"llama\",
\"messages\": [
{\"role\": \"user\", \"content\": \"$question\"}
],
\"temperature\": 0,
\"max_tokens\": 2048
}" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
}
# Test question (repeated in multiple tests)
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
echo ""
echo "=== Test 1: Correct Answer ==="
echo "Sending request with known question..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 2: Wrong Answer ==="
echo "Sending request with known question (success rate 0.0)..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 3: No Matching Question ==="
echo "Sending request with non-matching text..."
response=$(make_request "What is the capital of France?")
echo "Response: $response"
echo "Expected: No matching question found"
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 4: Success Rate Verification ==="
echo "Sending 10 requests to test success rate..."
correct_count=0
for i in {1..10}; do
answer=$(make_request "$TEST_QUESTION")
if [ "$answer" == "116" ]; then
correct_count=$((correct_count + 1))
fi
echo " Request $i: Answer = $answer"
done
echo "Correct answers: $correct_count/10"
echo "Expected: ~8/10 (80% success rate)"
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"
echo ""
echo "=== Test Complete ==="
echo "Stopping simulator..."
kill $SIMULATOR_PID 2>/dev/null
wait $SIMULATOR_PID 2>/dev/null || true
echo "Simulator stopped."

58
flake.lock generated
View File

@@ -1,58 +0,0 @@
{
"nodes": {
"flake-parts": {
"inputs": {
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1730504689,
"narHash": "sha256-hgmguH29K2fvs9szpq2r3pz2/8cJd2LPS+b4tfNFCwE=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "506278e768c2a08bec68eb62932193e341f55c90",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "flake-parts",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1732014248,
"narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1730504152,
"narHash": "sha256-lXvH/vOfb4aGYyvFmZK/HlsNsr/0CVWlwYvo2rxJk3s=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
},
"original": {
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/cc2f28000298e1269cea6612cd06ec9979dd5d7f.tar.gz"
}
},
"root": {
"inputs": {
"flake-parts": "flake-parts",
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

View File

@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 11)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_PATCH 1)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

View File

@@ -0,0 +1,968 @@
#include "allreduce.cuh"
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#include "convert.cuh"
#include "ggml-impl.h"
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <limits>
// ---------------------------------------------------------------------------
// CUDA AllReduce for tensor-parallel inference across two GPUs.
//
// Provides an in-place sum reduction over matching tensors on two CUDA
// devices in the same process. Used by the tensor-split path alongside
// NCCL; targets setups without NVLink, where data is exchanged between the
// GPUs by staging it through pinned host memory over PCIe.
//
// Two reduction strategies are selected per call by tensor size:
//
// * Chunked kernel path (small reductions): a single CUDA kernel both
// stages data through pinned host memory and performs the local sum.
// Cross-GPU synchronization happens *inside the kernel* (busy-wait on
// a host-memory flag), which keeps launch overhead low for the
// latency-sensitive token-generation case.
//
// * Copy-engine path (large reductions): the transfer is split into
// D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
// followed by a small device-side add kernel. Cross-GPU
// synchronization happens *outside the kernel*, via CUDA events
// between streams. This keeps the compute engine free while large
// transfers are in flight, which matters for prefill-sized tensors.
// Reductions larger than the per-call inner cap are processed by an
// outer chunker that issues sequential inner calls.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Cross-GPU signal mechanism
//
// One int per (slot, rank) pair in pinned host memory. Each AR call writes a
// strictly increasing token (= the AR call number) into its own arrival int.
// The peer spins until its read of the other's arrival int equals the token
// it expects for this call -- a mismatch means the peer hasn't arrived yet.
// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
// days at thousands of ARs/sec), so arrival ints don't need to be reset
// between calls; we initialize once at pipeline init and let the values
// accumulate.
//
// There is exactly one writer (the owning GPU) and one reader (the peer), so
// we don't need atomics. A volatile store paired with __threadfence_system()
// provides the release ordering that makes the D2H writes visible system-wide
// before the arrival token is observed.
//
// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
// portable choice.
// ---------------------------------------------------------------------------
static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
*(volatile int *)p = token;
}
static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
return *(const volatile int *)p;
}
// Byte spacing between adjacent arrival ints. 64 bytes (one cache line)
// ensures each GPU/block's arrival slot lives on its own line, preventing
// false-sharing stalls on the polling GPU.
static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;
// Number of blocks the chunked kernel launches with. Each block stripes a
// disjoint slice of the data and synchronizes through its own arrival-token
// slot so multiple SMs can pump PCIe stores in parallel.
static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;
// ---------------------------------------------------------------------------
// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
//
// Both GPUs run this kernel simultaneously on independent streams. sendbuf
// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When
// T_dst == T_wire the casts below are no-ops.
//
// Each GPU runs three phases:
//
// Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
// single-instruction-width vectors into host_mine.
// __threadfence_system() commits these writes to host
// memory.
// Phase 2 (thread 0): write token to arrival_mine; spin until
// arrival_other == token.
// Phase 3 (all threads): read T_wire vectors from host_other, cast
// each element to T_dst, and sum with the local
// sendbuf value (also rounded through T_wire so that
// both GPUs truncate identically -- this guarantees
// bit-equivalent results across the two devices).
//
// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
// threads to keep multiple SMs issuing PCIe stores in parallel. Each block
// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
// thread 0 of each block signals/spins on that slot independently of other
// blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are
// handled only by block 0 so that no two blocks write the same tail elements.
// ---------------------------------------------------------------------------
template <typename T_dst, typename T_wire>
static __global__ void ggml_cuda_ar_kernel(
const T_dst * sendbuf,
T_dst * recvbuf,
T_wire * __restrict__ host_mine,
const T_wire * __restrict__ host_other,
int count,
int * arrival_mine,
int * arrival_other,
int token) {
// Vector unit for the wire type, sized to the arch's widest single-instruction
// copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory;
// each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
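    // e.g. with the 16-byte single-instruction copies available on Volta+,
    // ELEMS_PER_VEC is 4 for a float wire and 8 for a half/bfloat16 wire;
    // ARRIVAL_INTS is 64/4 = 16.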
const int tid = threadIdx.x;
const int nt = blockDim.x;
const int bid = blockIdx.x;
const int gtid = bid * nt + tid;
const int gnt = gridDim.x * nt;
const int count_vec = count / ELEMS_PER_VEC;
const int tail = count_vec * ELEMS_PER_VEC;
// Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
{
for (int i = gtid; i < count_vec; i += gnt) {
const int off = i * ELEMS_PER_VEC;
T_wire wire[ELEMS_PER_VEC];
#pragma unroll
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
}
ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
}
if (bid == 0 && tid < count - tail) {
host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
}
}
// Commit this block's host writes before signalling.
__threadfence_system();
__syncthreads();
// Phase 2: thread 0 of each block signals on its own arrival slot, then
// spins for the matching slot from peer. Per-block tokens mean blocks
// proceed independently -- no inter-block barrier needed.
if (tid == 0) {
int * my_slot = arrival_mine + bid * ARRIVAL_INTS;
const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
ggml_cuda_ar_signal_set(my_slot, token);
__threadfence_system(); // make our signal visible system-wide
while (ggml_cuda_ar_signal_get(other_slot) != token) {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__nanosleep(100);
#else
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
}
}
__syncthreads();
// Acquire peer's host_other writes (this block's stripe of them).
__threadfence_system();
// Phase 3: read peer's T_wire vector, cast both sides through T_wire for
// bit-equivalence, sum in T_dst precision, and write back to recvbuf.
{
for (int i = gtid; i < count_vec; i += gnt) {
const int off = i * ELEMS_PER_VEC;
T_wire wire[ELEMS_PER_VEC];
ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
#pragma unroll
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
}
}
if (bid == 0 && tid < count - tail) {
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
recvbuf[tail + tid] =
ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
}
}
}
// Combined load-convert-add kernel. The peer's contribution arrives as T_src
// (which may be a lower-precision type than T_dst when the BF16 round-trip is
// active). For bit-equivalence between the two GPUs, dst is first rounded
// through T_src's precision via ggml_cuda_cast -- peer already truncated its
// own value the same way before sending -- so both sides perform identical
// arithmetic. When T_dst == T_src the round-trip cast is a no-op.
template <typename T_dst, typename T_src>
static __global__ void ggml_cuda_ar_add_kernel(
T_dst * __restrict__ dst,
const T_src * __restrict__ src,
int count) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int nt = gridDim.x * blockDim.x;
for (int i = tid; i < count; i += nt) {
const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
}
}
// ---------------------------------------------------------------------------
// Pipeline structure
// ---------------------------------------------------------------------------
// Number of slots in the event / arrival ring. Two slots is sufficient:
// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
// from AR N-2 by the time we get to AR N. acquire_slot's
// cudaEventSynchronize on ev.ker for both devices makes that consumption
// explicit before we overwrite host_buf[slot] for the new AR.
static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;
// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
// Larger tensors are reduced by issuing multiple chunked launches.
static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB
// Copy-engine path: largest tensor accepted on this path; sets host_large /
// dev_tmp allocation size.
static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB
// AR wire size at which the copy-engine path takes over from the chunked-
// kernel path. Override via GGML_CUDA_AR_COPY_THRESHOLD.
static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
// the peer); the clamps cover the cases where nbytes/4 is too small (per-
// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024; // 512 KB
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024; // 2 MB
// Absolute floor that an env-var override is allowed to set; this caps the
// per-slot copy-event array. 256 KB -> up to 128 chunks per 32 MB tensor.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
struct ggml_cuda_ar_event_slot {
cudaEvent_t app = nullptr; // upstream computation complete
cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete
cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream)
cudaEvent_t ker = nullptr; // AllReduce kernel complete
};
// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
// in one place, with the host handle preserved for cudaFreeHost. Used where
// the CPU never touches the buffer -- only the device reads/writes via the
// mapped device pointer. Required on systems where cudaDevAttrCanUseHost-
// PointerForRegisteredMem is 0 and the host pointer can't be used as a
// device pointer.
struct ggml_cuda_ar_host_mapping {
uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset
cudaError_t alloc(size_t bytes) {
cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
cudaHostAllocPortable | cudaHostAllocMapped);
if (rc != cudaSuccess) {
host = nullptr;
return rc;
}
rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
if (rc != cudaSuccess) {
cudaFreeHost(host);
host = nullptr;
dev = nullptr;
}
return rc;
}
void free() {
if (host) {
cudaFreeHost(host);
host = nullptr;
dev = nullptr;
}
}
};
struct ggml_cuda_ar_pipeline {
int n_devices;
int devices[GGML_CUDA_MAX_DEVICES];
size_t buf_bytes; // bytes per device in host_buf[]
size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[]
size_t copy_threshold;
size_t copy_chunk_bytes;
size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
uint64_t call_count;
// Per-device resources.
ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel)
ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path
cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking
ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
// Copy-engine: per-device "I finished reading my peer's host_large"
// event. Indexed by RECORDER device. Recorded same-device on streams[i]
// after stage 2's last H2D from host_large[peer]. Waited cross-device
// by peer's stage-1 stream before the next AR overwrites host_large[peer].
cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES];
bool host_large_read_done_valid;
// Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
// Recorded on the compute stream after each add_kernel; the AR stream
// waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us
// single-buffer dev_tmp despite add_kernel running on a separate stream.
cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
bool dev_tmp_kernel_done_valid;
// Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned
// memory; CPU never reads/writes -- only the kernel and cudaMemset.
// Use ggml_cuda_ar_arrival_ptr() to index.
ggml_cuda_ar_host_mapping arrival;
};
// Base pointer for the (slot, rank) per-block token block. The kernel adds
// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
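// Layout example: with 2 slots, 2 ranks, 8 blocks and a 64-byte stride the
// ring occupies 2*2*8*64 = 2048 bytes; (slot=1, rank=0) starts at byte offset
// (1*2 + 0)*8*64 = 1024.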
static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
const size_t offset = ((size_t)slot * p->n_devices + rank) *
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
return reinterpret_cast<int *>(p->arrival.dev + offset);
}
static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
const char * value = getenv(name);
if (value == nullptr || value[0] == '\0') {
return default_value;
}
char * end = nullptr;
const unsigned long long parsed = strtoull(value, &end, 10);
return end != value ? (uint64_t) parsed : default_value;
}
struct ggml_cuda_ar_slot_info {
int slot;
int token;
};
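// Example sequence: AR call 0 -> slot 0, token 1; call 1 -> slot 1, token 2;
// call 2 wraps back to slot 0 and first cudaEventSynchronize()s both devices'
// ev.ker from call 0 before that slot's buffers are reused.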
static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
p->call_count++;
if (pool_lapped) {
for (int i = 0; i < p->n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
}
}
return { slot, (int) p->call_count };
}
// Per-AR copy-engine chunk size: env-var override if set, else heuristic
// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
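// e.g. a 4 MB reduction gets 1 MB chunks, a 512 KB reduction is clamped up to
// a single 512 KB chunk, and a 32 MB reduction is clamped down to 2 MB chunks
// (16 chunks total).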
static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
if (p->copy_chunk_bytes > 0) {
return p->copy_chunk_bytes;
}
return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
}
static void ggml_cuda_ar_wait_for_compute(
ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
}
// ---------------------------------------------------------------------------
// Init / free
// ---------------------------------------------------------------------------
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
if (n_devices != 2) {
GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
"falling back\n", __func__, n_devices);
return nullptr;
}
// The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
for (size_t i = 0; i < n_devices; ++i) {
const int cc = ggml_cuda_info().devices[devices[i]].cc;
if (cc < GGML_CUDA_CC_VOLTA) {
GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
"(device %d has cc=%d); falling back\n",
__func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
return nullptr;
}
}
auto * p = new ggml_cuda_ar_pipeline{};
p->n_devices = n_devices;
p->copy_bytes = GGML_CUDA_AR_COPY_MAX_BYTES;
p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
// 0 = use the per-call heuristic (default). Non-zero env value forces a
// fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
__func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
}
// Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
// ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
// byte threshold to opt out for small tensors.
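    // e.g. GGML_CUDA_AR_BF16_THRESHOLD=1048576 keeps F32 tensors smaller than
    // 1 MB in full F32 on the wire while still rounding larger ones to BF16.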
p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
for (size_t i = 0; i < n_devices; ++i) {
p->devices[i] = devices[i];
}
// Per-device streams and event pools.
for (size_t i = 0; i < n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
cudaStream_t stream = nullptr;
if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
p->streams[i] = stream;
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
bool ok =
cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
}
if (!ok) {
GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
__func__, p->devices[i], s);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
// Arrival ring: cache-line padded so each GPU's int is on its own line.
const size_t arrival_bytes =
(size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
__func__, arrival_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
ggml_cuda_set_device(p->devices[0]);
if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
__func__, arrival_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
// Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
// kernel can write the next slot's data while the peer is still reading
// the previous slot's. Indexed by (slot * buf_bytes) at the call site.
p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
for (size_t i = 0; i < n_devices; ++i) {
if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
__func__, host_buf_total);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
// Copy-engine path: pinned host staging + device scratch, sized for the
// largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
// dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
// cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
for (size_t i = 0; i < n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
__func__, p->copy_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
__func__, p->copy_bytes, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
"%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
__func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
return p;
}
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
if (!p) {
return;
}
// Drain all in-flight kernels before tearing down resources.
for (int i = 0; i < p->n_devices; ++i) {
if (p->streams[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaStreamSynchronize(p->streams[i]);
}
}
for (int i = 0; i < p->n_devices; ++i) {
p->host_buf[i].free();
p->host_large[i].free();
if (p->dev_tmp[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaFree(p->dev_tmp[i]);
}
ggml_cuda_set_device(p->devices[i]);
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); }
for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); }
}
if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); }
if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); }
}
if (p->host_large_read_done[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaEventDestroy(p->host_large_read_done[i]);
}
if (p->dev_tmp_kernel_done[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaEventDestroy(p->dev_tmp_kernel_done[i]);
}
if (p->streams[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaStreamDestroy(p->streams[i]);
}
}
p->arrival.free();
delete p;
}
// ---------------------------------------------------------------------------
// Dispatch
// ---------------------------------------------------------------------------
// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
// nbytes per ne element); accumulated locally into a T_dst buffer. When
// T_src == T_dst this is the original homogeneous reduction. When they differ
// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
// for bit-equivalence between GPUs and we skip the otherwise-needed
// post-conversion entirely.
template <typename T_src, typename T_dst>
static bool ggml_cuda_ar_allreduce_copy_impl(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
const bool compute[GGML_CUDA_MAX_DEVICES],
int64_t ne,
size_t nbytes) {
GGML_ASSERT(p->n_devices == 2);
GGML_ASSERT(nbytes <= p->copy_bytes);
GGML_ASSERT(ne <= std::numeric_limits<int>::max());
const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
GGML_ASSERT(chunk_bytes > 0);
const int slot = ggml_cuda_ar_acquire_slot(p).slot;
const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
ggml_backend_cuda_context * cuda_ctx[2] = {};
// Stage 1: both GPUs copy their local contribution to pinned host memory.
for (int i = 0; i < 2; ++i) {
ggml_cuda_set_device(p->devices[i]);
cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
// Wait for peer's H2D from our host_large[i] (recorded in the
// previous AR's stage 2) to complete before we overwrite host_large[i].
// host_large_read_done[peer] = peer finished reading host_large[i].
// No-op on the first AR -- no prior record exists.
if (p->host_large_read_done_valid) {
const int peer = 1 - i;
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
}
if (!compute[i]) {
CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
}
for (size_t c = 0; c < copy_chunks; ++c) {
const size_t offset = c * chunk_bytes;
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
(nbytes - offset) : chunk_bytes;
CUDA_CHECK(cudaMemcpyAsync(
p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
cudaMemcpyDeviceToHost, p->streams[i]));
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
}
}
// Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
// local device scratch (dev_tmp), then performs one device-local add over
// the assembled peer tensor. The H2Ds run on the AR stream (copy engine)
// and the add_kernel runs on the caller's compute stream, so the AR stream
// stays pure-copy and avoids an in-stream copy->compute engine switch every
// AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the
// prior AR's add_kernel-done event before overwriting it.
for (int i = 0; i < 2; ++i) {
const int peer = 1 - i;
ggml_cuda_set_device(p->devices[i]);
// Wait for the previous AR's add_kernel (on the compute stream) to
// finish reading dev_tmp before our H2D overwrites it. No-op on the
// first copy_impl call.
if (p->dev_tmp_kernel_done_valid) {
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
}
for (size_t c = 0; c < copy_chunks; ++c) {
const size_t offset = c * chunk_bytes;
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
(nbytes - offset) : chunk_bytes;
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
CUDA_CHECK(cudaMemcpyAsync(
p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
cudaMemcpyHostToDevice, p->streams[i]));
}
// Mark our reads of host_large[peer] complete so peer's next AR can
// safely overwrite it.
CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
// Hand off from AR stream (copy engine) to compute stream: compute
// stream waits for all H2Ds to finish, then runs the add_kernel.
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
const int block_size = 256;
int n_blocks = (int) ((ne + block_size - 1) / block_size);
if (n_blocks > 1024) {
n_blocks = 1024;
}
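        // e.g. ne = 2^20 elements -> 4096 blocks, capped to 1024; the
        // grid-stride loop inside the add kernel covers the remainder.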
ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
dst_buf[i],
reinterpret_cast<const T_src *>(p->dev_tmp[i]),
(int) ne);
CUDA_CHECK(cudaGetLastError());
// Record dev_tmp-released on the compute stream so the next copy_impl
// can wait for the kernel to finish before overwriting dev_tmp. Also
// record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
}
p->host_large_read_done_valid = true;
p->dev_tmp_kernel_done_valid = true;
return true;
}
// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
// the host_large / dev_tmp allocation size). When the full AR exceeds that,
// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
// slot, so cross-AR fences and pool wraparound work the same way as for any
// other sequence of small ARs.
template <typename T_src, typename T_dst>
static bool ggml_cuda_ar_allreduce_copy_outer(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
const bool compute[GGML_CUDA_MAX_DEVICES],
int64_t ne) {
const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
GGML_ASSERT(outer_max_elems > 0);
bool ok = true;
for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start);
const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src);
T_src * src[GGML_CUDA_MAX_DEVICES] = {};
T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < p->n_devices; ++i) {
src[i] = src_buf[i] + outer_start;
dst[i] = dst_buf[i] + outer_start;
}
ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
p, backends, src, dst, compute, outer_ne, outer_nbytes);
}
return ok;
}
bool ggml_cuda_ar_allreduce(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
ggml_tensor ** tensors) {
GGML_ASSERT(p != nullptr);
const int n = p->n_devices;
GGML_ASSERT(n == 2);
const ggml_type input_type = tensors[0]->type;
GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
const int64_t ne = ggml_nelements(tensors[0]);
GGML_ASSERT(ne > 0);
const size_t input_nbytes = ggml_nbytes(tensors[0]);
// BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
// the reduction (chunked or copy-engine), halving on-wire bytes. Matches
// NCCL's behaviour. The pre-conversion zeroes inactive shards so the
// inner paths see them as already-prepared compute tensors.
const bool use_bf16 =
input_type == GGML_TYPE_F32 &&
p->bf16_threshold > 0 &&
input_nbytes >= p->bf16_threshold;
const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
const size_t type_size = ggml_type_size(kernel_type);
GGML_ASSERT(p->buf_bytes >= type_size);
const size_t nbytes = (size_t) ne * type_size;
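    // e.g. a 4 MB F32 tensor (2^20 elements) reduced over a BF16 wire stages
    // 2 MB per GPU through pinned host memory instead of 4 MB.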
bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
}
// Decide between copy-engine and chunked kernel paths based on the working
// type's actual byte count. No upper bound: copy_outer slices reductions
// larger than copy_bytes into copy_bytes-sized pieces.
const bool use_copy_engine =
p->copy_threshold > 0 &&
nbytes >= p->copy_threshold;
// BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
// (chunked kernel path) and the combined add kernel (copy_engine path)
// both accumulate into the F32 tensor data directly, so an inactive
// shard's accumulator must start at zero.
if (use_bf16) {
for (int i = 0; i < n; ++i) {
if (!compute_flag[i]) {
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
ggml_cuda_set_device(p->devices[i]);
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
}
}
}
// Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
// path; the chunked kernel path's combined kernel does the conversion
// inline as it writes to host_buf.
ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
if (use_copy_engine && use_bf16) {
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
for (int i = 0; i < n; ++i) {
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
bf16_tmp[i].pool = &cuda_ctx->pool();
bf16_tmp[i].alloc(ne);
ggml_cuda_set_device(p->devices[i]);
if (compute_flag[i]) {
to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
} else {
CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
}
copy_src_ptr[i] = bf16_tmp[i].get();
}
}
bool ok = true;
if (use_copy_engine) {
// After up-front BF16 conversion, the tmp buffers already hold the
// (possibly zeroed-for-inactive) data, so the inner path can treat
// every shard as compute.
bool inner_compute[GGML_CUDA_MAX_DEVICES];
for (int i = 0; i < n; ++i) {
inner_compute[i] = use_bf16 ? true : compute_flag[i];
}
// Dispatch into copy_impl with explicit src/dst types. When use_bf16
// is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
// is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
// through BF16 for bit-equivalence and writes F32 directly, so no
// post-conversion is needed. Otherwise src == dst (same native type).
if (use_bf16) {
GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
float * dst[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
dst[i] = static_cast<float *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
p, backends, src, dst, inner_compute, ne);
} else {
switch (kernel_type) {
case GGML_TYPE_F32: {
float * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<float *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
p, backends, buf, buf, inner_compute, ne);
break;
}
case GGML_TYPE_BF16: {
nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
p, backends, buf, buf, inner_compute, ne);
break;
}
case GGML_TYPE_F16: {
half * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<half *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
p, backends, buf, buf, inner_compute, ne);
break;
}
default:
GGML_ASSERT(false);
}
}
} else {
// host_buf carries T_wire-typed data; max_chunk_elems is the count that
// fits in one host_buf at the wire size.
const size_t max_chunk_elems = p->buf_bytes / type_size;
const size_t input_type_size = ggml_type_size(input_type);
// Chunked kernel path runs entirely on the caller's compute stream:
// since AR is a barrier here, same-stream ordering subsumes any
// cross-stream event handshake that the copy-engine path needs, and
// skips the cross-stream scheduling overhead that was hurting the
// small-tensor (tg) latency on the AR-stream variant. Only ev.ker is
// still recorded at end-of-AR for acquire_slot's pool-wraparound check.
for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
const size_t remaining_elems = (size_t) (ne - chunk_start);
const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
const size_t chunk_dst_bytes = chunk_elems * input_type_size;
const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
for (int i = 0; i < n; ++i) {
const int peer = 1 - i; // valid for n == 2 only
ggml_cuda_set_device(p->devices[i]);
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
cudaStream_t stream = cuda_ctx->stream();
char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;
// Match NCCL/meta-backend semantics: inactive shards contribute
// zeros. On the BF16 path the F32 tensor data was already
// zeroed up-front (above), so per-chunk zeroing isn't needed.
if (!compute_flag[i] && !use_bf16) {
CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
}
#define LAUNCH_AR_KERNEL(T_dst, T_wire) \
ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
reinterpret_cast<const T_dst *>(data), \
reinterpret_cast<T_dst *>(data), \
reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
static_cast<int>(chunk_elems), \
ggml_cuda_ar_arrival_ptr(p, slot, i), \
ggml_cuda_ar_arrival_ptr(p, slot, peer), \
token)
if (use_bf16) {
GGML_ASSERT(input_type == GGML_TYPE_F32);
LAUNCH_AR_KERNEL(float, nv_bfloat16);
} else {
switch (input_type) {
case GGML_TYPE_F32: LAUNCH_AR_KERNEL(float, float); break;
case GGML_TYPE_F16: LAUNCH_AR_KERNEL(half, half); break;
case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
default: GGML_ASSERT(false);
}
}
#undef LAUNCH_AR_KERNEL
CUDA_CHECK(cudaGetLastError());
if (last_chunk) {
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
}
}
}
}
return ok;
}
#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
// implementation relies on, so the internal AllReduce is a CUDA-only feature.
// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
// and silently falls back to the meta backend's generic AllReduce.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
return nullptr;
}
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {
}
bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
return false;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

View File

@@ -0,0 +1,29 @@
#pragma once
#include "common.cuh"
#include "ggml-backend-impl.h"
#include <cstddef>
// Opaque pipeline context -- owns all pinned buffers, streams, and events.
struct ggml_cuda_ar_pipeline;
// Allocate a pipeline for n_devices GPUs.
// devices[] holds the CUDA device IDs in rank order.
// Returns nullptr on allocation failure.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
const int * devices, size_t n_devices);
// Release all resources owned by the pipeline.
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
// tensors[i] must live on the device managed by backends[i] and be
// contiguous F32, F16, or BF16.
// Preconditions are checked by the CUDA comm dispatcher before calling this.
// Returns true once the reduction work has been enqueued successfully.
bool ggml_cuda_ar_allreduce(
ggml_cuda_ar_pipeline * pipeline,
ggml_backend_t * backends,
ggml_tensor ** tensors);
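
For orientation, a minimal usage sketch of this API (illustrative only: backends and tensors stand for two already-initialized CUDA backends and their matching tensor shards, which the sketch assumes exist; in-tree these calls are driven by the CUDA comm dispatcher in ggml-cuda.cu):

    // hypothetical caller -- not part of this change
    const int devices[2] = { 0, 1 };
    ggml_cuda_ar_pipeline * pipeline = ggml_cuda_ar_pipeline_init(devices, 2);
    if (pipeline != nullptr) {
        // tensors[0]/tensors[1]: same shape, contiguous F32/F16/BF16, one per backend
        if (!ggml_cuda_ar_allreduce(pipeline, backends, tensors)) {
            // false -> the caller falls back to the generic AllReduce
        }
    }
    ggml_cuda_ar_pipeline_free(pipeline); // safe to call with nullptr as well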

View File

@@ -2,6 +2,7 @@
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml-cuda/allreduce.cuh"
#include "ggml-cuda/common.cuh"
#include "ggml-cuda/acc.cuh"
#include "ggml-cuda/add-id.cuh"
@@ -86,6 +87,9 @@
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define GGML_LOG_WARN_ONCE(str) \
{ static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails
@@ -1139,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
#ifdef GGML_USE_NCCL
// Communication context for multi-GPU AllReduce during tensor parallelism.
//
// Created once per meta backend instance. Resources for the selected mode
// (NCCL communicators or the internal AllReduce pipeline) are initialised
// eagerly during comm_init so any init failure surfaces at startup rather
// than mid-run.
struct ggml_backend_cuda_comm_context {
using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
std::vector<ggml_backend_t> backends;
std::vector<ncclComm_t> comms;
std::vector<int> dev_ids;
// Set by the init chain (comm_init_{nccl, internal, none}) to one of
// try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
// internal needs `ar_pipeline`, butterfly needs nothing. Per-call
// failures return false; the meta backend's generic implementation then
// handles that call.
try_allreduce_fn try_allreduce = nullptr;
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
#ifdef GGML_USE_NCCL
std::vector<ncclComm_t> comms;
#endif // GGML_USE_NCCL
~ggml_backend_cuda_comm_context() {
#ifdef GGML_USE_NCCL
for (ncclComm_t comm : comms) {
NCCL_CHECK(ncclCommDestroy(comm));
}
#endif // GGML_USE_NCCL
ggml_cuda_ar_pipeline_free(ar_pipeline);
}
};
#endif // GGML_USE_NCCL
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
#ifdef GGML_USE_NCCL
if (comm_ctx_v == nullptr) {
return;
}
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
delete comm_ctx;
#else
GGML_UNUSED(comm_ctx_v);
#endif // GGML_USE_NCCL
}
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
#ifdef GGML_USE_NCCL
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
return nullptr;
}
}
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
std::vector<int> dev_ids;
ret->backends.reserve(n_backends);
dev_ids.reserve(n_backends);
for (size_t i = 0; i < n_backends; i++) {
ret->backends.push_back(backends[i]);
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
dev_ids.push_back(cuda_ctx->device);
}
ret->comms.resize(n_backends);
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
return ret;
#else
// If NCCL is installed it is used by default for optimal performance.
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
// RCCL is disabled by default, users are explicitly opting in.
// Therefore print no warning for RCCL.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool warning_printed = false;
if (!warning_printed) {
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
warning_printed = true;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
GGML_UNUSED_VARS(backends, n_backends);
return nullptr;
#endif // GGML_USE_NCCL
}
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
#ifdef GGML_USE_NCCL
// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
// tensors (bandwidth-bound), then converts back to FP32.
static bool ggml_backend_cuda_comm_allreduce_nccl(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
@@ -1210,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
return true;
}
GGML_ASSERT(comm_ctx_v != nullptr);
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
const size_t n_backends = comm_ctx->backends.size();
for (size_t i = 0; i < n_backends; ++i) {
@@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
}
return true;
#else
GGML_UNUSED_VARS(comm_ctx_v, tensors);
return false;
}
#endif // GGML_USE_NCCL
// Run the internal AR pipeline. Returns false on unsupported / failed input
// -- the caller decides whether to abort (env-forced) or fall back silently.
static bool ggml_backend_cuda_comm_allreduce_internal(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
const size_t n_backends = comm_ctx->backends.size();
GGML_ASSERT(n_backends == 2);
GGML_ASSERT(tensors[0] != nullptr);
const int64_t ne = ggml_nelements(tensors[0]);
const ggml_type type = tensors[0]->type;
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
return false;
}
if (ne == 0) {
return true;
}
for (size_t i = 0; i < n_backends; ++i) {
if (tensors[i] == nullptr) {
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
return false;
}
if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
__func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
return false;
}
if (!ggml_is_contiguously_allocated(tensors[i])) {
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
__func__, i, ne, ggml_nbytes(tensors[i]),
(size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
return false;
}
if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
__func__, i, tensors[i]->data, (int) type, ne);
return false;
}
GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
}
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
}
// ---------------------------------------------------------------------------
// Per-call dispatch -- three variants, one per backend. Each is set as
// comm_ctx->try_allreduce by the matching init step. Per-call failure
// returns false; the meta backend's generic implementation handles that call.
// ---------------------------------------------------------------------------
#ifdef GGML_USE_NCCL
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
}
#endif // GGML_USE_NCCL
static bool ggml_backend_cuda_comm_try_allreduce_internal(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
}
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
return false;
}
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
if (comm_ctx_v == nullptr) {
return;
}
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
}
// ---------------------------------------------------------------------------
// Init -- chained nccl -> internal -> none. Each step tries to bring up its
// resource; on failure it warns and recurses into the next step.
// ---------------------------------------------------------------------------
static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
}
static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());
if (ret->ar_pipeline) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
return;
}
// Clear sticky CUDA error from the failed init.
(void) cudaGetLastError();
GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
"falling back to meta-backend butterfly\n");
ggml_backend_cuda_comm_init_none(ret);
}
static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
#ifdef GGML_USE_NCCL
const size_t n = ret->dev_ids.size();
ret->comms.resize(n);
ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data());
if (rc == ncclSuccess) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
return;
}
ret->comms.clear();
GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
ncclGetErrorString(rc));
#else // GGML_USE_NCCL
#ifndef GGML_USE_HIP
GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. "
"Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
#endif // !GGML_USE_HIP
#endif // GGML_USE_NCCL
ggml_backend_cuda_comm_init_internal(ret);
}
// Top-level init. Picks one of the three init paths based on
// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
// any fallback. Unrecognised env values warn and fall back to "none"
// (the meta-backend butterfly).
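// e.g. GGML_CUDA_ALLREDUCE=internal forces the internal pipeline even when
// NCCL is compiled in; =nccl and =none select the other two paths explicitly.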
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
return nullptr;
}
}
auto * ret = new ggml_backend_cuda_comm_context;
ret->backends.assign(backends, backends + n_backends);
ret->dev_ids.reserve(n_backends);
for (size_t i = 0; i < n_backends; i++) {
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
}
const char * env = getenv("GGML_CUDA_ALLREDUCE");
if (!env) {
// Platform default: Linux uses NCCL, otherwise (generally Windows) internal
#if defined(__linux__)
ggml_backend_cuda_comm_init_nccl(ret);
#else
ggml_backend_cuda_comm_init_internal(ret);
#endif // defined(__linux__)
} else {
std::string env_str(env);
if (env_str == "nccl") {
ggml_backend_cuda_comm_init_nccl(ret);
} else if (env_str == "internal") {
ggml_backend_cuda_comm_init_internal(ret);
} else if (env_str == "none") {
ggml_backend_cuda_comm_init_none(ret);
} else {
GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
ggml_backend_cuda_comm_init_none(ret);
}
}
return ret;
}
// Top-level dispatch -- calls the function pointer chosen by comm_init.
// Returns false to let the meta-backend's butterfly run.
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
if (comm_ctx_v == nullptr) {
return false;
}
auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
return comm_ctx->try_allreduce(comm_ctx, tensors);
}
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {

View File

@@ -1 +1 @@
-ac6f7b44f60fde0091f0b3d99afde48f8c99b13a
+628249b398293fc8d2fa81a449ae2920a02c6523

View File

@@ -1131,10 +1131,6 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
}
// for differentiating model types
uint32_t n_vocab = 0;
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
// for classifier models
ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
if (!classifier_labels.empty()) {

View File

@@ -503,6 +503,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
};
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
break;
case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE:
// Sarvam uses SPM-style BPE (same shape as Gemma4): spaces replaced with U+2581
// by the normalizer, BPE merges over the whole text on raw UTF-8.
regex_exprs = {
"[^\\n]+|[\\n]+",
};
byte_encode = false;
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -2005,6 +2013,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "gemma4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
escape_whitespaces = true;
} else if (
tokenizer_pre == "sarvam-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE;
escape_whitespaces = true;
clean_spaces = false;
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||

View File

@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
};
struct LLM_KV;

View File

@@ -1,7 +1,8 @@
#include "models.h"
void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
-const auto n_vocab = vocab.n_tokens();
+uint32_t n_vocab = 0;
+ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));

View File

@@ -1,7 +1,8 @@
#include "models.h"
void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
-const auto n_vocab = vocab.n_tokens();
+uint32_t n_vocab = 0;
+ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);