CUDA: fix MMV kernel being used for FP16 src1 (#10357)
Author: Johannes Gäßler
@@ -1760,11 +1760,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+        // the custom F16 vector kernel can be used over batched cuBLAS GEMM
+        // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
                && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-        // KQ + KQV multi-batch without FlashAttention
+        // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
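
For context, a minimal standalone C++ sketch of the dispatch predicate this hunk changes. The names tensor_sketch, ggml_type_sketch, old_check_sketch, and use_mul_mat_vec_sketch are simplified stand-ins, not the ggml API, and the sketch assumes the precomputed use_mul_mat_vec flag additionally requires an F32 src1, which is what the commit title suggests the old inline check was missing.

    #include <cstdio>

    // Simplified stand-ins for the tensor metadata involved (assumption, not the real ggml types).
    enum ggml_type_sketch { TYPE_F16, TYPE_F32 };

    struct tensor_sketch {
        ggml_type_sketch type;
        long             ne[4]; // dimensions; ne[1] is the number of src1 columns
    };

    // Assumed shape of the precomputed flag: the F16 vector kernel path wants
    // an F16 src0, an F32 src1, and a single src1 column.
    static bool use_mul_mat_vec_sketch(const tensor_sketch & src0, const tensor_sketch & src1) {
        return src0.type == TYPE_F16 && src1.type == TYPE_F32 && src1.ne[1] == 1;
    }

    // The pre-fix condition keyed only on src0 being F16 and src1 having one column,
    // so an FP16 src1 could still be routed to the vector kernel.
    static bool old_check_sketch(const tensor_sketch & src0, const tensor_sketch & src1) {
        return src0.type == TYPE_F16 && src1.ne[1] == 1;
    }

    int main() {
        const tensor_sketch src0 = {TYPE_F16, {4096, 4096, 1, 1}};
        const tensor_sketch src1 = {TYPE_F16, {4096, 1, 1, 1}}; // FP16 src1 with one column

        std::printf("old condition selects MMV: %d\n", old_check_sketch(src0, src1));       // 1: the bug
        std::printf("new condition selects MMV: %d\n", use_mul_mat_vec_sketch(src0, src1)); // 0: fixed
        return 0;
    }

Under that assumption, reusing the single use_mul_mat_vec flag in the first branch keeps the fast-path selection consistent with the later "else if (use_mul_mat_vec)" branch instead of duplicating a weaker inline check.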