Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-10-30 08:42:00 +00:00
	CUDA: MMQ code deduplication + iquant support (#8495)
* CUDA: MMQ code deduplication + iquant support
* 1 less parallel job for CI build
.github/workflows/build.yml (vendored), 2 changes:
@@ -860,7 +860,7 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))

       - name: Determine tag name
         id: tag
@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q6_K:
             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
             break;
@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             mmq_supported = true;
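The two hunks above are the dispatch side of the iquant support: ggml_cuda_op_mul_mat_q gains one case per added i-quant type, each forwarding to the matching mul_mat_q_case<...> template instantiation, and ggml_cuda_should_use_mmq lists the same types so the MMQ path is actually selected for them. The shape of this pattern, a runtime switch that picks a compile-time instantiation, is sketched below as self-contained C++ with stand-in names (example_type, should_use_mmq, op_mul_mat_q and mul_mat_case are illustrative only, not ggml symbols):

    // Illustrative sketch of the dispatch pattern, with made-up names; the real code
    // switches over ggml_type and calls mul_mat_q_case<GGML_TYPE_*>(ctx, args, stream).
    #include <cstdio>

    enum example_type { TYPE_Q6_K, TYPE_IQ2_XXS, TYPE_IQ1_S };  // stand-ins for GGML_TYPE_*

    // One template, instantiated once per type: adding a quant type means adding a
    // switch case (and a template-instance file), not writing a new kernel wrapper.
    template <example_type type>
    static void mul_mat_case() {                                // stand-in for mul_mat_q_case<type>
        std::printf("launching MMQ kernel instantiation %d\n", (int) type);
    }

    static bool should_use_mmq(example_type type) {             // stand-in for ggml_cuda_should_use_mmq
        switch (type) {
            case TYPE_Q6_K:
            case TYPE_IQ2_XXS:
            case TYPE_IQ1_S:
                return true;
        }
        return false;
    }

    static void op_mul_mat_q(example_type type) {               // stand-in for ggml_cuda_op_mul_mat_q
        switch (type) {
            case TYPE_Q6_K:    mul_mat_case<TYPE_Q6_K>();    break;
            case TYPE_IQ2_XXS: mul_mat_case<TYPE_IQ2_XXS>(); break;
            case TYPE_IQ1_S:   mul_mat_case<TYPE_IQ1_S>();   break;
        }
    }

    int main() {
        if (should_use_mmq(TYPE_IQ2_XXS)) {
            op_mul_mat_q(TYPE_IQ2_XXS);  // resolves to the IQ2_XXS instantiation
        }
        return 0;
    }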
										
											
File diff suppressed because it is too large.
@@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
 TYPES_MMQ = [
     "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
     "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
 ]

 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
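The six new files above are instances of the SOURCE_MMQ template from generate_cu_files.py: an autogenerated-file notice, an include of ../mmq.cuh, and a single DECL_MMQ_CASE for the type, so each MMQ instantiation lives in its own translation unit. A minimal Python sketch of such a generator follows; TYPES_MMQ and the emitted contents are taken from the diffs above, while the filename scheme and the write loop are assumptions rather than the script's actual code:

    # Minimal sketch of a per-type file generator (not the actual generate_cu_files.py
    # logic; only TYPES_MMQ and the emitted file contents come from the diffs above,
    # the filename pattern below is an assumption).
    TYPES_MMQ = [
        "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
        "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
        "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
        "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS",
    ]

    SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.

    #include "../mmq.cuh"

    DECL_MMQ_CASE({type});
    """

    for type_name in TYPES_MMQ:
        # e.g. GGML_TYPE_IQ2_XXS -> mmq-instance-iq2_xxs.cu (assumed naming scheme)
        suffix = type_name.removeprefix("GGML_TYPE_").lower()
        with open(f"mmq-instance-{suffix}.cu", "w") as f:
            f.write(SOURCE_MMQ.format(type=type_name))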
@@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 }

+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+
+    return d8_1*sumf;
+}
+
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4

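The new vec_dot_q8_0_16_q8_1_impl helper computes a dot product where the quantized data carries one float scale per 16 values (presumably the _16 in the name), while the q8_1 operand contributes a single scale d8_1; each ggml_cuda_dp4a call accumulates a 4-way signed int8 dot product from one packed 32-bit word. A scalar reference of the same arithmetic is sketched below in plain host C++; QI8_0 == 8 and dp4a behaving like CUDA's __dp4a are assumptions, not taken from the diff:

    // Scalar reference sketch of the arithmetic in vec_dot_q8_0_16_q8_1_impl.
    // Assumptions (not from the diff): QI8_0 == 8, and ggml_cuda_dp4a(a, b, c) behaves
    // like CUDA's __dp4a, i.e. c plus the dot product of the four signed int8 lanes.
    #include <cstdint>
    #include <cstdio>

    constexpr int QI8_0 = 8;  // assumed: 32 int8 values per block / 4 values per int

    static int dp4a_ref(int32_t a, int32_t b, int32_t c) {
        for (int k = 0; k < 4; ++k) {
            c += int8_t(a >> 8*k) * int8_t(b >> 8*k);  // multiply matching signed bytes
        }
        return c;
    }

    template <int vdr>
    static float vec_dot_q8_0_16_q8_1_ref(const int * v, const int * u, const float * d8_0, float d8_1) {
        float sumf = 0.0f;
        for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {       // one group = QI8_0/2 ints = 16 int8 values
            int sumi = 0;
            for (int i = i0; i < i0 + QI8_0/2; ++i) {
                sumi = dp4a_ref(v[i], u[i], sumi);        // integer dot product within the group
            }
            sumf += d8_0[i0/(QI8_0/2)] * sumi;            // per-group scale from the quantized data
        }
        return d8_1 * sumf;                               // single scale from the q8_1 operand
    }

    int main() {
        const int   v[8]    = {0x01010101, 0x01010101, 0x01010101, 0x01010101,
                               0x02020202, 0x02020202, 0x02020202, 0x02020202};
        const int   u[8]    = {0x01010101, 0x01010101, 0x01010101, 0x01010101,
                               0x01010101, 0x01010101, 0x01010101, 0x01010101};
        const float d8_0[2] = {0.5f, 0.25f};
        // group 0: 16*(1*1)*0.5 = 8; group 1: 16*(2*1)*0.25 = 8; total * d8_1(2.0) = 32
        std::printf("%f\n", vec_dot_q8_0_16_q8_1_ref<8>(v, u, d8_0, 2.0f));
        return 0;
    }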
Author: Johannes Gäßler