Mirror of https://github.com/ggml-org/llama.cpp.git
ggml : build backends as libraries (#10256)

* ggml : build backends as libraries

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: R0CKSTAR <xiaodong.ye@mthreads.com>
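Every hunk below makes the same mechanical change: the build guard GGML_USE_HIPBLAS is renamed to GGML_USE_HIP. As a minimal sketch (assuming an out-of-tree build that still passes the old flag; the shim is hypothetical and not part of this commit), the rename could be bridged like so:

// Hypothetical compatibility shim, not part of this commit: map the old
// build flag onto the new one so out-of-tree builds keep compiling.
#if defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_HIP)
#define GGML_USE_HIP
#endif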
@@ -6,7 +6,7 @@
 #include <cstdint>
 #include <memory>
 
-#if defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_HIP)
 #define GGML_COMMON_DECL_HIP
 #define GGML_COMMON_IMPL_HIP
 #else
@@ -26,13 +26,13 @@
 #include <string>
 #include <vector>
 
-#if defined(GGML_USE_HIPBLAS)
+#if defined(GGML_USE_HIP)
 #include "vendors/hip.h"
 #elif defined(GGML_USE_MUSA)
 #include "vendors/musa.h"
 #else
 #include "vendors/cuda.h"
-#endif // defined(GGML_USE_HIPBLAS)
+#endif // defined(GGML_USE_HIP)
 
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 
 #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
 
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIP)
 static const char * cu_get_error_str(CUresult err) {
     const char * err_str;
     cuGetErrorString(err, &err_str);
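cu_get_error_str above is compiled only when the HIP guard is not set, presumably because it uses CUDA driver API names (CUresult, cuGetErrorString). For context, the check-macro pattern it feeds looks roughly like this runtime-API sketch (DEMO_CUDA_CHECK is an illustrative name, not ggml's CUDA_CHECK_GEN):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Illustrative runtime-API error check in the spirit of the CUDA_CHECK_GEN /
// CUBLAS_CHECK macros referenced above; DEMO_CUDA_CHECK is not ggml's name.
#define DEMO_CUDA_CHECK(err)                                        \
    do {                                                            \
        const cudaError_t err_ = (err);                             \
        if (err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",            \
                    cudaGetErrorString(err_), __FILE__, __LINE__);  \
            exit(1);                                                \
        }                                                           \
    } while (0)

// Usage: DEMO_CUDA_CHECK(cudaMalloc(&ptr, size));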
@@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_F16
 
-#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 
 #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 #define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 
 #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
 #define FLASH_ATTN_AVAILABLE
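The hunk above renames the guard inside the block of capability macros: FP16_AVAILABLE (AMD HIP, or CUDA arch >= Pascal), FAST_FP16_AVAILABLE (additionally excluding CC 6.1), FP16_MMA_AVAILABLE (Volta and up), and INT8_MMA_AVAILABLE (Turing and up). A minimal sketch of how such a macro is typically consumed (illustrative code, not from this file):

#include <cuda_fp16.h>

// Illustrative consumer of FP16_AVAILABLE: compile the native half2 path
// only where fp16 math exists, otherwise fall back to float.
static __device__ __forceinline__ float sum_twice(const half2 v) {
#ifdef FP16_AVAILABLE
    const half2 s = __hadd2(v, v);                    // native fp16x2 add
    return __low2float(s) + __high2float(s);
#else
    return 2.0f*(__low2float(v) + __high2float(v));   // float fallback
#endif
}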
@@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) {
 static __device__ void no_device_code(
     const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
            file_name, line, function_name, arch);
     GGML_UNUSED(arch_list);
 #else
     printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
            file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     __trap();
 
     GGML_UNUSED(no_device_code); // suppress unused function warning
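no_device_code, whose guard the hunk above renames, is the fail-loudly path taken when a __device__ function has no implementation for the compiled arch: it prints the source location, then traps. A stripped-down analogue of the pattern (the demo_ names are illustrative, not ggml's):

#include <cstdio>

// Stripped-down analogue of the fail-loudly pattern: report where the
// unsupported call happened, then abort the kernel with __trap().
static __device__ void demo_no_device_code(const char * file, const int line, const char * func) {
    printf("%s:%d: ERROR: %s has no device code for this arch.\n", file, line, func);
    __trap();
}

#define DEMO_NO_DEVICE_CODE demo_no_device_code(__FILE__, __LINE__, __func__)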
@@ -176,7 +176,7 @@ static __device__ void no_device_code(
 #endif // __CUDA_ARCH__
 
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
@@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
         x += __shfl_xor_sync(0xffffffff, x, mask, 32);
     }
     return x;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
 }
 
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
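The two hunks above touch only the guard line, but the surrounding loop is the standard warp-level butterfly reduction: each __shfl_xor_sync step halves the lane mask, so all 32 lanes converge on the full sum in five steps (on Ampere and newer the single __reduce_add_sync instruction replaces the loop). A self-contained sketch of the same pattern (warp_reduce_sum_f32 and demo are illustrative names):

// Butterfly reduction over one 32-lane warp: after five __shfl_xor_sync
// steps every lane holds the sum of all 32 lanes' inputs.
static __device__ __forceinline__ float warp_reduce_sum_f32(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

// Usage: each lane contributes its lane index; the warp sum is 0+1+...+31 = 496.
__global__ void demo(float * out) {
    const float v = warp_reduce_sum_f32((float) threadIdx.x);
    if (threadIdx.x == 0) {
        *out = v;
    }
}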
@@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
@@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
         a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
     }
     return a;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #else
     NO_DEVICE_CODE;
@@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE
 
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
     return __float2half(fmaxf(__half2float(a), __half2float(b)));
 #else
     return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
 
 #else
    NO_DEVICE_CODE;
@@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
 }
 
 static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 
 #if CUDART_VERSION >= CUDART_HMAX
     return __hmax2(a, b);
@@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
     GGML_UNUSED(a);
     GGML_UNUSED(b);
     NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 }
 
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
@@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }
 
 #if CUDART_VERSION < CUDART_HMASK
@@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 #endif // CUDART_VERSION < CUDART_HMASK
 
 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
@@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #endif
     return c;
 
-#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
     return __dp4a(a, b, c);
@@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
 
 // TODO: move to ggml-common.h
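ggml_cuda_dp4a, closed out by the hunk above, is a 4-way signed-byte dot product: __builtin_amdgcn_sdot4 on capable AMD parts, __dp4a on CUDA archs at or above MIN_CC_DP4A, and otherwise the scalar byte-wise fallback shown in the hunk. A host-side sketch of the reference semantics (dp4a_ref is an illustrative name, not ggml's):

#include <cstdint>
#include <cstdio>

// Reference semantics of dp4a: treat a and b as four signed bytes each,
// multiply pairwise, and accumulate into c, matching the scalar fallback
// in the hunk above.
static int dp4a_ref(int a, int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    // 0x01010101 dotted with 0x02020202 is 4*(1*2) = 8; plus c = 10 gives 18.
    printf("%d\n", dp4a_ref(0x01010101, 0x02020202, 10));
    return 0;
}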