mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	sycl: reordered Q4_K MMVQ (#13109)
This commit is contained in:
		
				
					committed by
					
						
						GitHub
					
				
			
			
				
	
			
			
			
						parent
						
							9c404ed54c
						
					
				
				
					commit
					64bb51cf90
				
			@@ -24,6 +24,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
 | 
			
		||||
    const int     blocks_per_row              = ncols / block_traits::qk;
 | 
			
		||||
    constexpr int blocks_per_subgroup         = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
 | 
			
		||||
    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
 | 
			
		||||
    const int     nblocks                     = nrows * (ncols / block_traits::qk);
 | 
			
		||||
 | 
			
		||||
    static_assert(blocks_per_subgroup > 0);
 | 
			
		||||
    static_assert(block_elements_per_subgroup > 0);
 | 
			
		||||
@@ -45,7 +46,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
 | 
			
		||||
            // x block quant index when casting the quants to int
 | 
			
		||||
            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
 | 
			
		||||
 | 
			
		||||
            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
 | 
			
		||||
            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -739,6 +740,27 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
 | 
			
		||||
    const int nrows, dpct::queue_ptr stream) {
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
 | 
			
		||||
    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
 | 
			
		||||
    constexpr size_t num_subgroups = 16;
 | 
			
		||||
    GGML_ASSERT(block_num_y % num_subgroups == 0);
 | 
			
		||||
 | 
			
		||||
    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
 | 
			
		||||
 | 
			
		||||
    stream->submit([&](sycl::handler & cgh) {
 | 
			
		||||
        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
 | 
			
		||||
                            [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                                mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
 | 
			
		||||
                                                                                            nrows, nd_item);
 | 
			
		||||
                            });
 | 
			
		||||
    });
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
                                       float *dst, const int ncols,
 | 
			
		||||
                                       const int nrows,
 | 
			
		||||
@@ -1035,7 +1057,12 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
 | 
			
		||||
                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
 | 
			
		||||
                break;
 | 
			
		||||
            case GGML_TYPE_Q4_K:
 | 
			
		||||
                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
 | 
			
		||||
                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
 | 
			
		||||
                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
 | 
			
		||||
                    reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
 | 
			
		||||
                } else {
 | 
			
		||||
                    mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
 | 
			
		||||
                }
 | 
			
		||||
                break;
 | 
			
		||||
            case GGML_TYPE_Q5_K:
 | 
			
		||||
                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user