Compare commits

...

2 Commits
b6024 ... b6026

Author SHA1 Message Date
uvos
b77d11179d HIP: add GGML_HIP_MMQ_MFMA option to allow disableing the MFMA path. (#14930)
This is useful for testing for regressions on GCN with CDNA hardware.

With GGML_HIP_MMQ_MFMA=Off and GGML_CUDA_FORCE_MMQ=On we can conveniently test the GCN code path on CDNA. As CDNA is just GCN renamed with MFMA added and limited use ACC registers, this provides a good alternative for regression testing when GCN hardware is not available.
2025-07-29 17:44:30 +02:00
uvos
c7aa1364fd HIP: Ignore unsupported unroll transformation in fattn-vec (#14931)
llvm with the amdgcn target dose not support unrolling loops with conditional break statements, when those statements can not be resolved at compile time. Similar to other places in GGML lets simply ignore this warning.
2025-07-29 17:43:43 +02:00
5 changed files with 29 additions and 2 deletions

View File

@@ -174,6 +174,7 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)

View File

@@ -227,7 +227,7 @@ typedef float2 dfloat2;
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
@@ -295,7 +295,11 @@ static bool fp32_mma_hardware_available(const int cc) {
// AMD CDNA3 matrix cores.. Will add support for other CDNA generations later.
static bool amd_mfma_available(const int cc) {
return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
#if !defined(GGML_HIP_NO_MMQ_MFMA)
return GGML_CUDA_CC_IS_CDNA3(cc);
#else
return false;
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.

View File

@@ -1,6 +1,12 @@
#include "common.cuh"
#include "fattn-common.cuh"
// Currenlty llvm with the amdgcn target dose not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
@@ -341,6 +347,9 @@ static __global__ void flash_attn_vec_ext_f16(
NO_DEVICE_CODE;
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@@ -1,6 +1,12 @@
#include "common.cuh"
#include "fattn-common.cuh"
// Currenlty llvm with the amdgcn target dose not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
@@ -336,6 +342,9 @@ static __global__ void flash_attn_vec_ext_f32(
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@@ -113,6 +113,10 @@ if (GGML_HIP_ROCWMMA_FATTN)
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN)
endif()
if (NOT GGML_HIP_MMQ_MFMA)
add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
endif()
if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
endif()