Compare commits


3 Commits
b6067...b6070

Georgi Gerganov
15e92fd337 cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1 (#15038)
* cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1

ggml-ci

* cont : fix cont types

ggml-ci

* cont : adopt variable names and comment from the other branch
2025-08-02 17:13:05 +03:00
Sigbjørn Skjæret
2bf3fbf0b5 ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update

* add --check-missing parameter

* check that pre-tokenizer hashes are up-to-date
2025-08-02 14:39:01 +02:00
Douglas Hanley
711d5e6fe6 convert : fix Qwen3-Embedding pre-tokenizer hash (#15030) 2025-08-02 12:51:02 +02:00
6 changed files with 89 additions and 22 deletions

.github/workflows/pre-tokenizer-hashes.yml (new file)

@@ -0,0 +1,45 @@
+name: Check Pre-Tokenizer Hashes
+
+on:
+  push:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+  pull_request:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+
+jobs:
+  pre-tokenizer-hashes:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m venv .venv
+          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+
+      - name: Update pre-tokenizer hashes
+        run: |
+          cp convert_hf_to_gguf.py /tmp
+          .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+
+      - name: Check if committed pre-tokenizer hashes matches generated version
+        run: |
+          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+            echo "Differences found:"
+            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+            exit 1
+          fi
+
+          echo "Model pre-tokenizer hashes are up to date."

convert_hf_to_gguf.py

@@ -702,6 +702,9 @@ class TextModel(ModelBase):
         if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
             # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
             res = "kimi-k2"
+        if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
+            # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
+            res = "qwen2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -849,9 +852,6 @@ class TextModel(ModelBase):
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
             res = "exaone4"
-        if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
-            # ref: https://huggingface.co/Qwen/Qwen3-Embedding-8B
-            res = "qwen2"
         if res is None:
             logger.warning("\n")
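For context on these hash branches: `chkhsh` is the SHA-256 digest of the stringified token-ID list that a tokenizer produces for the probe string `CHK_TXT` defined in convert_hf_to_gguf_update.py. A simplified sketch of the computation as the convert scripts do it (the full probe string is elided here, so the digest only matches with the real `CHK_TXT`):

from hashlib import sha256
from transformers import AutoTokenizer

CHK_TXT = "..."  # stand-in: the real probe string is defined in convert_hf_to_gguf_update.py

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
chktok = tokenizer.encode(CHK_TXT)                 # token IDs for the probe string
chkhsh = sha256(str(chktok).encode()).hexdigest()  # the value compared above
# with the real CHK_TXT this yields "d4540891..." and get_vocab_base_pre() returns "qwen2"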

convert_hf_to_gguf_update.py

@@ -59,6 +59,10 @@ parser.add_argument(
     "--full", action="store_true",
     help="download full list of models - make sure you have access to all of them",
 )
+parser.add_argument(
+    "--check-missing", action="store_true",
+    help="only check for missing pre-tokenizer hashes",
+)
 parser.add_argument(
     "hf_token",
     help="optional HF token",
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
 if hf_token is None:
     logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 
+if args.check_missing and args.full:
+    logger.warning("Downloading full list of models requested, ignoring --check-missing!")
+    args.check_missing = False
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -147,6 +155,7 @@ pre_computed_hashes = [
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
     {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
+    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
 ]
@@ -221,12 +230,13 @@ if not args.full:
     all_models = models.copy()
     models = [model for model in all_models if model["name"] not in existing_models]
 
-logging.info(f"Downloading {len(models)} models...")
-for model in models:
-    try:
-        download_model(model)
-    except Exception as e:
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
+if not args.check_missing:
+    logging.info(f"Downloading {len(models)} models...")
+    for model in models:
+        try:
+            download_model(model)
+        except Exception as e:
+            logger.error(f"Failed to download model {model['name']}. Error: {e}")
 
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
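Taken together, the new flag gives CI a fast path: `convert_hf_to_gguf_update.py --check-missing` skips the download loop entirely, so in a fresh checkout the `get_vocab_base_pre()` function is regenerated from the `pre_computed_hashes` list alone. `--full` still downloads everything and, per the new warning, takes precedence over `--check-missing`.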

ggml/src/ggml-cuda/ggml-cuda.cu

@@ -1852,6 +1852,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
     ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
 
+    bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
+    bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
+
     // Handle src0
     src0_ptr = (const cuda_t *) src0->data;
@@ -1870,6 +1873,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
         s11 = ne10;
         s12 = ne11*s11;
         s13 = ne12*s12;
+
+        is_src1_cont_2 = true;
     }
 
     // Setup destination buffer
@@ -1918,15 +1923,19 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+    if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
+        // with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
+        const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
+        const int64_t smb = ne12 == 1 ? s13 : s12;
+
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
-                       src1_ptr, cu_data_type_b, s11,       s12,       // strideB
-                beta,     dst_t, cu_data_type,   ne0,       ne1*ne0,   // strideC
+                alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma,     // strideA
+                       src1_ptr, cu_data_type_b, s11,       smb,     // strideB
+                beta,     dst_t, cu_data_type,   ne0,       ne1*ne0, // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
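Why dim 3: take a src0 that is a dim-2 slice of a larger tensor. It is still contiguous across dims 0-1 (so `is_src0_cont_2` holds) and has ne02 == 1, but its ne03 batch matrices sit nb03/nb00 elements apart, while nb02/nb00 points at a neighbouring slice of the parent tensor. A NumPy sketch of the stride selection (shapes made up; the real kernel's dst layout and GEMM conventions are simplified):

import numpy as np

ne10, ne01, ne11 = 8, 4, 5                  # K, M, N of each GEMM
ne02, ne03 = 1, 3                           # the failing shape: ne02 == 1, ne03 > 1
ne12, ne13 = ne02, ne03                     # r2 == r3 == 1, no broadcast

# src0 as a dim-2 slice of a larger tensor: contiguous in dims 0-1, but the
# dim-2 stride nb02/nb00 is not the step between consecutive batch matrices
base = np.random.rand(ne03, 7, ne01, ne10).astype(np.float32)
A = base[:, :1, :, :]                       # view with ne02 == 1
B = np.random.rand(ne13, ne12, ne11, ne10).astype(np.float32)

a, b = base.reshape(-1), B.reshape(-1)      # flat buffers, as the kernel sees them
nb02_e = A.strides[1] // 4                  # = ne01*ne10: the old (wrong) strideA
nb03_e = A.strides[0] // 4                  # = 7*ne01*ne10: step between batches
sma = nb03_e if ne02 == 1 else nb02_e       # the selection added by this commit
smb = ne11 * ne10                           # s12 (equals s13 here since ne12 == 1)

dst = np.empty((ne13 * ne12, ne11, ne01), dtype=np.float32)
for i in range(ne12 * ne13):                # one GEMM per (i12, i13) pair
    Ai = a[i*sma : i*sma + ne01*ne10].reshape(ne01, ne10)
    Bi = b[i*smb : i*smb + ne11*ne10].reshape(ne11, ne10)
    dst[i] = Bi @ Ai.T                      # OP_T on src0, OP_N on src1

ref = (B @ A.transpose(0, 1, 3, 2)).reshape(ne13 * ne12, ne11, ne01)
assert np.allclose(dst, ref)                # fails if sma is taken from nb02/nb00

With the old strideA = nb02/nb00, the loop reads the adjacent dim-2 slice of `base` instead of the next dim-3 batch, which is exactly the ne02 == 1 && ne03 > 1 failure the commit title describes.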

ggml/src/ggml-sycl/ggml-sycl.cpp

@@ -2688,6 +2688,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
     const size_t type_size_src0 = ggml_type_size(src0->type);
     const size_t type_size_src1 = ggml_type_size(src1->type);
 
+    bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
+    bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
+
     // SRC1 strides
     int64_t s11 = nb11 / type_size_src1;
     int64_t s12 = nb12 / type_size_src1;
@@ -2737,6 +2740,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
         s11 = ne10;
         s12 = ne11 * s11;
         s13 = ne12 * s12;
+
+        is_src1_cont_2 = true;
     }
 
     ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
@@ -2852,12 +2857,16 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
     else
 #endif
     {
-        if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+        if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
+            // with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
+            const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
+            const int64_t smb = ne12 == 1 ? s13 : s12;
+
             // there is no broadcast and src0, src1 are contiguous across dims 2, 3
             SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
                 oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
-                src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
-                src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
+                src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
+                src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
                 mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
         } else {
             const int ne23 = ne12 * ne13;
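The SYCL change mirrors the CUDA one: as the hunk shows, `dpct::gemm_batch` receives the per-matrix strides in the analogous argument positions to `cublasGemmStridedBatchedEx`, so the stride-selection sketch after the CUDA hunk describes this back-end as well.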

requirements/requirements-convert_hf_to_gguf_update.txt

@@ -1,7 +1 @@
 -r ./requirements-convert_legacy_llama.txt
---extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1; platform_machine != "s390x"
-
-# torch s390x packages can only be found from nightly builds
---extra-index-url https://download.pytorch.org/whl/nightly
-torch>=0.0.0.dev0; platform_machine == "s390x"