Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-10-30 08:42:00 +00:00
	CUDA: MMQ code deduplication + iquant support (#8495)
* CUDA: MMQ code deduplication + iquant support
* 1 less parallel job for CI build

Changed file: .github/workflows/build.yml (2 changes, vendored)
@@ -860,7 +860,7 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
 
       - name: Determine tag name
         id: tag
@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q6_K:
             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
             break;
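The hunk above extends the type dispatch in ggml_cuda_op_mul_mat_q: each newly supported i-quant routes to its own mul_mat_q_case<...> entry point, whose heavy kernel code is instantiated elsewhere. A minimal self-contained sketch of that pattern (all names below are illustrative assumptions, not the real ggml declarations):

// Sketch only: a switch dispatches to per-type template entry points whose
// definitions can live in separate translation units.
#include <cstdio>

enum class quant_kind { Q4_0, IQ2_XXS, IQ4_XS };

template <quant_kind K>
void mul_mat_q_case_sketch();                 // declared once, defined per type

template <>
void mul_mat_q_case_sketch<quant_kind::IQ2_XXS>() { std::puts("IQ2_XXS kernel"); }

void dispatch_sketch(quant_kind k) {
    switch (k) {
        case quant_kind::IQ2_XXS:
            mul_mat_q_case_sketch<quant_kind::IQ2_XXS>();
            break;
        // one case per supported type, mirroring the switch in the hunk above
        default:
            break;
    }
}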
@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             mmq_supported = true;
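In ggml_cuda_should_use_mmq the same types are added to a fallthrough switch that merely flags them as MMQ-capable; the surrounding checks on cc and ne11 are untouched by this hunk. A small sketch of the idiom, with hypothetical type names:

// Consecutive case labels share one body that flags the type as supported.
enum class quant_kind_sk { Q4_K, Q6_K, IQ2_XXS, IQ1_S, F32 };

static bool mmq_supported_sketch(quant_kind_sk t) {
    bool supported = false;
    switch (t) {
        case quant_kind_sk::Q4_K:
        case quant_kind_sk::Q6_K:
        case quant_kind_sk::IQ2_XXS:
        case quant_kind_sk::IQ1_S:
            supported = true;
            break;
        default:
            break;   // types without MMQ kernels fall back to another path
    }
    return supported;
}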
(File diff suppressed because it is too large.)
@@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
 TYPES_MMQ = [
     "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
     "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
 ]
 
 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
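Each of the new template-instances files above is deliberately tiny: generate_cu_files.py writes one file per TYPES_MMQ entry so the expensive MMQ kernels for each quantization type compile in their own translation unit (and therefore in parallel). DECL_MMQ_CASE itself is defined in mmq.cuh, whose diff is suppressed above; as a rough sketch of the assumed mechanism, with a hypothetical kernel signature, such a macro boils down to forcing one explicit template instantiation per type:

// Assumed shape of the mechanism; the real macro and kernel signature live in mmq.cuh.
template <int type>
void mul_mat_q_case_demo(float * dst, const float * src, int n) {
    // placeholder body; the real kernels perform the quantized matrix multiplication
    for (int i = 0; i < n; ++i) {
        dst[i] = src[i];
    }
}

#define DECL_MMQ_CASE_DEMO(type) \
    template void mul_mat_q_case_demo<type>(float * dst, const float * src, int n)

// A generated file then reduces to a single explicit instantiation:
DECL_MMQ_CASE_DEMO(2);   // 2 standing in for one GGML_TYPE_* enumerator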
@@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 }
 
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+
+    return d8_1*sumf;
+}
+
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4
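The new vec_dot_q8_0_16_q8_1_impl helper accumulates dp4a dot products over groups of QI8_0/2 ints (16 packed int8 values at a time), scales each partial sum by its own d8_0 factor, and applies the single d8_1 scale at the end; per-16-value scales are presumably what the i-quant MMQ tiles need. A plain host-side reference of the same arithmetic, assuming QI8_0 == 8 as in ggml's Q8_0 layout, might look like:

// Host-side reference sketch (not the device code): same reduction as
// vec_dot_q8_0_16_q8_1_impl, but on unpacked int8 values.
#include <cstdint>

constexpr int QI8_0_REF = 8;   // ints per Q8_0 block, i.e. 32 packed int8 quants

// vdr ints of v/u = 4*vdr int8 values; one d8_0 scale per 16 values (QI8_0/2 ints).
template <int vdr>
float vec_dot_q8_0_16_q8_1_ref(const int8_t * v, const int8_t * u,
                               const float * d8_0, float d8_1) {
    float sumf = 0.0f;
    for (int i0 = 0; i0 < vdr; i0 += QI8_0_REF/2) {
        int sumi = 0;
        for (int i = 4*i0; i < 4*(i0 + QI8_0_REF/2); ++i) {
            sumi += int(v[i]) * int(u[i]);   // what ggml_cuda_dp4a accumulates 4 at a time
        }
        sumf += d8_0[i0/(QI8_0_REF/2)] * float(sumi);
    }
    return d8_1 * sumf;
}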
Author: Johannes Gäßler