mirror of https://github.com/ggml-org/llama.cpp.git
	wip
ggml-cuda.cu (140 changed lines)
@@ -4244,7 +4244,7 @@ template <bool need_check> static __global__ void
 }
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
+static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst0, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
 
     if (row >= nrows) {
@@ -4258,7 +4258,9 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     float tmp = 0.0f;
 
     const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
+    const block_q8_1 * y = (const block_q8_1 *) vy + blockIdx.x*blocks_per_row;
 
+    float * dst = dst0 + blockIdx.x*nrows;
+
     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
         const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
@@ -4282,11 +4284,14 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y0, float * __restrict__ dst0, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
 
+    const dfloat * y = y0 + blockIdx.x*ncols;
+    float * dst = dst0 + blockIdx.x*nrows;
+
     if (row >= nrows) {
         return;
     }
@@ -4813,178 +4818,178 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 #endif
 }
 
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const dim3 block_dims(32, 1, 1);
     dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
 }
 
-static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(32, ny, 1);
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
-static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % QK_K == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5000,10 +5005,10 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
     dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(nb, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -6212,38 +6217,39 @@ inline void ggml_cuda_op_mul_mat_vec_q(
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t ne11 = src1->ne[1];
     const int64_t row_diff = row_high - row_low;
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
             break;
         case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
             break;
         case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
             break;
         case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         default:
            GGML_ASSERT(false);
@@ -6263,6 +6269,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     const int64_t ne00 = src0->ne[0];
+    const int64_t ne11 = src1->ne[1];
     const int64_t row_diff = row_high - row_low;
 
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
@@ -6286,37 +6293,37 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i,  dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i,  dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i,  dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i,  dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i,  dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            convert_mul_mat_vec_f16_cuda    (src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
            break;
         default:
            GGML_ASSERT(false);
@@ -7328,7 +7335,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+        if ((src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) ||
+            (src1->ne[1] <= MMQ_MAX_BATCH_SIZE && src0->ne[0] % MATRIX_ROW_PADDING == 0)) {
 #ifdef GGML_CUDA_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
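In short, this work-in-progress change lets the mat-vec paths handle more than one src1 column per launch: the kernels take base pointers (y0, dst0) and each block offsets them by blockIdx.x*ncols and blockIdx.x*nrows, while the host launchers gain an nb argument (ne11, the src1 column count) that becomes the x dimension of the grid. With nb == 1 this reduces to the previous single-column launch. Below is a minimal standalone CUDA sketch of that grid/offset pattern; it is not the llama.cpp kernels themselves, and the names mul_mat_vec_batched and MMV_Y as well as the plain-f32 arithmetic are illustrative assumptions.

#include <cstdio>
#include <cuda_runtime.h>

#define WARP_SIZE 32
#define MMV_Y     1   // rows handled per block in the y direction (illustrative value)

// Hypothetical kernel: blockIdx.x selects one src1 column (out of "nb" columns),
// (blockIdx.y, threadIdx.y) selects the matrix row, as in the diff above.
static __global__ void mul_mat_vec_batched(const float * __restrict__ x,
                                           const float * __restrict__ y0,
                                           float * __restrict__ dst0,
                                           const int ncols, const int nrows) {
    const int row = blockIdx.y*blockDim.y + threadIdx.y;

    // per-column base pointers, mirroring y0 + blockIdx.x*ncols and dst0 + blockIdx.x*nrows
    const float * y   = y0   + blockIdx.x*ncols;
    float       * dst = dst0 + blockIdx.x*nrows;

    if (row >= nrows) {
        return;
    }

    // each warp accumulates one dot product over the row, then reduces across lanes
    float tmp = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += WARP_SIZE) {
        tmp += x[row*ncols + col] * y[col];
    }
    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE);
    }
    if (threadIdx.x == 0) {
        dst[row] = tmp;
    }
}

int main() {
    const int ncols = 64, nrows = 8, nb = 3;   // nb plays the role of ne11
    float *x, *y, *dst;
    cudaMallocManaged(&x,   nrows*ncols*sizeof(float));
    cudaMallocManaged(&y,   nb*ncols*sizeof(float));
    cudaMallocManaged(&dst, nb*nrows*sizeof(float));
    for (int i = 0; i < nrows*ncols; ++i) x[i] = 1.0f;
    for (int i = 0; i < nb*ncols;    ++i) y[i] = 1.0f;

    // grid x dimension = batch size, exactly like block_nums(nb, block_num_y, 1) above
    const dim3 block_nums(nb, (nrows + MMV_Y - 1) / MMV_Y, 1);
    const dim3 block_dims(WARP_SIZE, MMV_Y, 1);
    mul_mat_vec_batched<<<block_nums, block_dims>>>(x, y, dst, ncols, nrows);
    cudaDeviceSynchronize();

    printf("dst[0] = %.1f (expected %d)\n", dst[0], ncols);   // all-ones input: each dot product equals ncols
    cudaFree(x); cudaFree(y); cudaFree(dst);
    return 0;
}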