Mirror of https://github.com/ggml-org/llama.cpp.git — synced 2025-10-31 08:51:55 +00:00
			
		
		
		
	[SYCL] fallback mmvq (#9088)
* fallback mmvq to mul_mat
* mmvq in cuda path
* Update ggml/src/ggml-sycl.cpp

Co-authored-by: Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
This commit is contained in:
@@ -3477,7 +3477,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor

     bool use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
+        && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda || src1->ne[1] > MMVQ_MIN_BATCH_SIZE);

     bool use_mul_mat_q =  ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|   | |||||||
@@ -130,6 +130,7 @@ typedef sycl::float2 dfloat2;
 #endif // GGML_SYCL_F16

 #define MMVQ_MAX_BATCH_SIZE  8
+#define MMVQ_MIN_BATCH_SIZE  4

 static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|   | |||||||
Reference in New Issue

Block a user

Author: Meng, Hengyu