mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-28 08:31:25 +00:00)
#pragma once

#include "ggml.h"
#include "ggml-impl.h"
#include "ggml-cuda.h"

#include <cstdint>
#include <memory>

#if defined(GGML_USE_HIP)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif
#include "ggml-common.h"

#include <array>
#include <cassert>
#include <cfloat>
#include <cstdio>
#include <string>
#include <vector>

#if defined(GGML_USE_HIP)
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include "vendors/musa.h"
#else
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

#define GGML_CUDA_CC_PASCAL          600
#define GGML_CUDA_CC_DP4A            610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define GGML_CUDA_CC_VOLTA           700
#define GGML_CUDA_CC_TURING          750
#define GGML_CUDA_CC_AMPERE          800
#define GGML_CUDA_CC_ADA_LOVELACE    890
#define GGML_CUDA_CC_OFFSET_AMD      0x1000000
#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
#define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)

// AMD
// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renaming
#define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
#define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
#define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
#define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

#define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
#define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

// Moore Threads
#define MUSART_HMASK 40300 // MUSA rc4.3, min. ver. for half2 -> uint mask comparisons

#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
#    define GGML_CUDA_USE_CUB
#endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070

#ifdef __CUDA_ARCH_LIST__
constexpr bool ggml_cuda_has_arch_impl(int) {
    return false;
}

template<class ... Archs>
constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
    return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
}

constexpr bool ggml_cuda_has_arch(const int arch) {
    return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
}

constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
    if (cur == 0) {
        GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
    }
    return cur;
}

template<class ... Archs>
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
    if (first <= arch && first > cur) {
        return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
    } else {
        return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
    }
}

constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
    return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
}
#else
static int ggml_cuda_highest_compiled_arch(const int arch) {
    return arch;
}
#endif // __CUDA_ARCH_LIST__
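
// Added note (worked example, list values assumed): if the build's __CUDA_ARCH_LIST__ were
// 610,700,750, then ggml_cuda_has_arch(700) is true and ggml_cuda_highest_compiled_arch(890)
// returns 750, the highest compiled arch that does not exceed the requested one; asking for
// an arch below the lowest compiled one hits the GGML_ABORT above.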

// ---------------------------------------------------------------------------------------------------------

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
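
// Illustrative sketch (not part of the original header; assumes the GGML_PAD macro from
// ggml.h): callers round element counts up to a multiple of MATRIX_ROW_PADDING before
// allocating quantized data, e.g.:
//
//   const int64_t ne_padded = GGML_PAD(ne, MATRIX_ROW_PADDING); // e.g. 1000 -> 1024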

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
     do {                                                                           \
        auto err_ = (err);                                                          \
        if (err_ != (success)) {                                                    \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
        }                                                                           \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
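
// Usage sketch (illustrative only): wrapping CUDA runtime calls in CUDA_CHECK aborts via
// ggml_cuda_error() with the failing expression, file and line on any non-cudaSuccess result:
//
//   float * buf_d = nullptr;
//   CUDA_CHECK(cudaMalloc(&buf_d, n*sizeof(float)));
//   CUDA_CHECK(cudaMemsetAsync(buf_d, 0, n*sizeof(float), stream));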

#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        return cublasGetStatusString(err);
    }
#else
    static const char * cublas_get_error_str(const cublasStatus_t err) {
        switch (err) {
            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
            default: return "unknown error";
        }
    }
#endif // CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#    define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes)                                                       \
        do {                                                                                                   \
            static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false };                         \
            const int   id                                                = ggml_cuda_get_device();            \
            if (!shared_memory_limit_raised[id]) {                                                             \
                CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
                shared_memory_limit_raised[id] = true;                                                         \
            }                                                                                                  \
        } while (0)
#else
#    define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
        do {                                             \
            GGML_UNUSED(nbytes);                         \
        } while (0)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
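
// Usage sketch (illustrative; `my_kernel`, `smem_bytes`, `grid`, `block` and `stream` are
// hypothetical): raise the opt-in dynamic shared memory limit once per device before
// launching a kernel that needs more than the default per-block amount:
//
//   CUDA_SET_SHARED_MEMORY_LIMIT(my_kernel, smem_bytes);
//   my_kernel<<<grid, block, smem_bytes, stream>>>(/* args */);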

#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11010

#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16

#if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
#define GGML_USE_VMM
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))

#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
#define FP16_MMA_AVAILABLE
#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)

#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
#define FP16_MMA_AVAILABLE
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

#if defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define TURING_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define AMPERE_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#define CP_ASYNC_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)

static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
}

static bool fast_fp16_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fast_fp16_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}

// Any FP16 tensor core instructions are available for ggml code.
static bool fp16_mma_available(const int cc) {
#if defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
    return false;
#else
    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
        GGML_CUDA_CC_IS_MTHREADS(cc)) {
        return true;
    } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
        return true;
#else
        return false;
#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
    } else {
        return false;
    }
#endif // defined(GGML_USE_HIP) && !defined(GGML_HIP_ROCWMMA_FATTN)
}

// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
}

static bool bf16_mma_hardware_available(const int cc) {
    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
}

static bool fp32_mma_hardware_available(const int cc) {
    return GGML_CUDA_CC_IS_CDNA(cc);
}

static bool amd_mfma_available(const int cc) {
#if !defined(GGML_HIP_NO_MMQ_MFMA)
    return GGML_CUDA_CC_IS_CDNA(cc);
#else
    return false;
#endif // !defined(GGML_HIP_NO_MMQ_MFMA)
}

// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
static bool turing_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}

static bool ampere_mma_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}

static bool cp_async_available(const int cc) {
    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE;
}

static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
#if defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
    return 64;
#else
    return 32;
#endif // defined(GGML_USE_HIP) && (defined(__GFX9__) || defined(__GFX8__))
}

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIP)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIP)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning

#if defined(GGML_USE_MUSA)
    __builtin_unreachable();
#endif // defined(GGML_USE_MUSA)
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

// The compiler is not always able to unroll loops if they contain continue expressions.
// In such cases loop unrolling can still be achieved via recursion:
template <int n>
struct ggml_cuda_unroll {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(n - 1, args...);
        ggml_cuda_unroll<n - 1>{}(f, args...);
    }
};

template <>
struct ggml_cuda_unroll<1> {
    template <typename Func, typename... Args>
    __device__ void operator()(const Func & f, Args... args) const {
        f(0, args...);
    }
};
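
// Usage sketch (illustrative only; `mask`, `vals` and `acc` are hypothetical locals): inside
// device code, the functor receives the iteration index as its first argument, so a loop
// whose body contains continue-style control flow can still be fully unrolled:
//
//   ggml_cuda_unroll<4>{}([&](const int i) {
//       if (mask & (1u << i)) {
//           acc += vals[i];
//       }
//   });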

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
}
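
// Added note: the XOR-butterfly reductions above leave the reduced value in every lane of
// each `width`-sized sub-group of the warp, e.g. for width == 4 and per-lane inputs
// {1, 2, 3, 4} all four lanes end up holding 10, so no extra broadcast is needed.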

template<int width = WARP_SIZE>
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, width);
        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, width);
    }
    return a;
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, width));
    }
    return a;

#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ int warp_reduce_all(int x) {
#ifdef GGML_USE_HIP
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = x && __shfl_xor_sync(0xffffffff, x, offset, width);
    }
    return x;
#else
    static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
    return __all_sync(0xffffffff, x);
#endif // GGML_USE_HIP
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
}

static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

#if !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
#endif // !defined(GGML_USE_HIP) && CUDART_VERSION < CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if defined(GGML_USE_HIP)
    return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
#elif CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif
}

template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
}

#if (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || \
    (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
}
#endif // (defined(CUDART_VERSION) && CUDART_VERSION < CUDART_HMASK) || defined(GGML_USE_HIP) || (defined(MUSART_VERSION) && MUSART_VERSION < MUSART_HMASK)

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP)
#if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3) || defined(RDNA4)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(RDNA1) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;

#else // defined(GGML_USE_HIP)

#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)

#endif // defined(GGML_USE_HIP)
}
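
// Worked example (added for clarity): a and b are treated as four packed signed bytes;
// with a = {1, 2, 3, 4}, b = {5, 6, 7, 8} and c = 0 every branch above computes
// 1*5 + 2*6 + 3*7 + 4*8 + 0 = 70.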

static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#if CUDART_VERSION >= 12080
    const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x);
    return (float) e;
#else
    uint32_t bits;
    if (x == 0) {
        bits = 0x00400000;
    } else {
        bits = (uint32_t) x << 23;
    }

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
#endif // CUDART_VERSION >= 12080
}
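
// Worked example (added for clarity): E8M0 stores only an exponent, so x maps to 2^(x - 127);
// e.g. x = 127 -> 1.0f and x = 130 -> 8.0f. The x == 0 branch encodes 2^-127 as a subnormal
// because a plain shift by 23 would otherwise yield 0.0f.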

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
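
// Added note (assumption about the callers): m0 and m1 are the two ALiBi slope bases that the
// caller is expected to derive from max_bias and n_head_log2; with max_bias > 0, head
// h < n_head_log2 gets a slope of m0^(h+1) and the remaining heads get m1^(2*(h - n_head_log2) + 1).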

template <ggml_type type>
struct ggml_cuda_type_traits;

template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
    static constexpr int qk = QK_MXFP4;
    static constexpr int qr = QR_MXFP4;
    static constexpr int qi = QI_MXFP4;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};

//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int     cc;                 // compute capability
        int     nsm;                // number of streaming multiprocessors
        size_t  smpb;               // max. shared memory per block
        size_t  smpbo;              // max. shared memory per block (with opt-in)
        bool    integrated;         // Device is integrated as opposed to discrete
        bool    vmm;                // virtual memory support
        size_t  vmm_granularity;    // granularity of virtual memory
        size_t  total_vram;
        int     warp_size;          // Number of threads in a dispatch
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int ggml_cuda_get_device();

struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};
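
// Usage sketch (illustrative; `ctx` is an assumed ggml_backend_cuda_context and `n` an
// element count): the RAII wrapper above returns its buffer to the pool on destruction:
//
//   ggml_cuda_pool_alloc<float> tmp(ctx.pool(), n);
//   float * tmp_d = tmp.get(); // device pointer, valid until `tmp` goes out of scope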

// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};


#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
#define USE_CUDA_GRAPH
#endif

struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    bool use_cpy_indirection = false;
    std::vector<char *> cpy_dest_ptrs;
    char ** dest_ptrs_d;
    int dest_ptrs_size = 0;
    // Index to allow each cpy kernel to be aware of its position within the graph
    // relative to other cpy nodes.
    int graph_cpynode_index = -1;
#endif
};

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context();

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};