mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	sycl : variable sg_size support for mmvq kernels (#12336)
This commit is contained in:
		
				
					committed by
					
						
						GitHub
					
				
			
			
				
	
			
			
			
						parent
						
							34c961b181
						
					
				
				
					commit
					363f8c5d67
				
			@@ -3,44 +3,42 @@
 | 
			
		||||
#include <cassert>
 | 
			
		||||
 | 
			
		||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 | 
			
		||||
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
 | 
			
		||||
                          const sycl::nd_item<3> &item_ct1) {
 | 
			
		||||
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
 | 
			
		||||
                    item_ct1.get_local_id(1);
 | 
			
		||||
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
 | 
			
		||||
                          const int ncols, const int nrows, const sycl::nd_item<3> & item_ct1) {
 | 
			
		||||
    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
 | 
			
		||||
 | 
			
		||||
    if (row >= nrows) {
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
    const int     blocks_per_row  = ncols / qk;
 | 
			
		||||
    constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi;  // Ensuring blocks_per_warp > 0
 | 
			
		||||
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    assert(blocks_per_warp > 0);
 | 
			
		||||
 | 
			
		||||
    // partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
 | 
			
		||||
    const block_q_t  * x = (const block_q_t  *) vx;
 | 
			
		||||
    const block_q_t *  x = (const block_q_t *) vx;
 | 
			
		||||
    const block_q8_1 * y = (const block_q8_1 *) vy;
 | 
			
		||||
 | 
			
		||||
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
 | 
			
		||||
         i += blocks_per_warp) {
 | 
			
		||||
        const int ibx = row*blocks_per_row + i; // x block index
 | 
			
		||||
    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) {
 | 
			
		||||
        const int ibx = row * blocks_per_row + i;  // x block index
 | 
			
		||||
 | 
			
		||||
        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
 | 
			
		||||
        const int iby = i * (qk / QK8_1);          // y block index that aligns with ibx
 | 
			
		||||
 | 
			
		||||
        const int iqs =
 | 
			
		||||
            vdr *
 | 
			
		||||
            (item_ct1.get_local_id(2) %
 | 
			
		||||
             (qi / vdr)); // x block quant index when casting the quants to int
 | 
			
		||||
        for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) {
 | 
			
		||||
            const int iqs = elem + vdr * (item_ct1.get_local_id(2) %
 | 
			
		||||
                                          (qi / vdr));  // x block quant index when casting the quants to int
 | 
			
		||||
 | 
			
		||||
        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
 | 
			
		||||
            tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (item_ct1.get_local_id(2) == 0) {
 | 
			
		||||
@@ -62,7 +60,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
@@ -87,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -111,7 +109,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -135,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -159,7 +157,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -183,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -207,7 +205,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -231,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -255,7 +253,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -279,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -303,7 +301,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -327,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -351,7 +349,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -375,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -399,7 +397,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -423,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -448,7 +446,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int blocks_per_row = ncols / qk;
 | 
			
		||||
    const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
 | 
			
		||||
    const int blocks_per_warp = vdr * WARP_SIZE / qi;
 | 
			
		||||
    assert(blocks_per_warp>0);
 | 
			
		||||
// partial sum for each thread
 | 
			
		||||
    float tmp = 0.0f;
 | 
			
		||||
@@ -472,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
 | 
			
		||||
 | 
			
		||||
    // sum up partial sums and write back result
 | 
			
		||||
#pragma unroll
 | 
			
		||||
    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
 | 
			
		||||
        tmp +=
 | 
			
		||||
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
 | 
			
		||||
    }
 | 
			
		||||
@@ -489,7 +487,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK4_0 == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -497,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
 | 
			
		||||
                                      VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -513,7 +511,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK4_1 == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -521,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
 | 
			
		||||
                                      VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -537,7 +535,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK5_0 == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -545,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
 | 
			
		||||
                                      VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -561,7 +559,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK5_1 == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -569,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
 | 
			
		||||
                                      VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -585,7 +583,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK8_0 == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -593,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
 | 
			
		||||
                                      VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -609,7 +607,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -617,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
 | 
			
		||||
                                      VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -633,7 +631,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -641,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
 | 
			
		||||
                                      VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -657,7 +655,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -665,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
 | 
			
		||||
                                      VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -681,7 +679,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -689,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
 | 
			
		||||
                                      VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -705,7 +703,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
@@ -713,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
 | 
			
		||||
                                      VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
@@ -730,13 +728,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -751,13 +749,13 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
        stream->submit([&](sycl::handler & cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -772,14 +770,14 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -794,14 +792,14 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -816,14 +814,14 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -838,14 +836,14 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -860,13 +858,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -881,14 +879,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK4_NL == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
@@ -903,14 +901,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
 | 
			
		||||
    GGML_ASSERT(ncols % QK_K == 0);
 | 
			
		||||
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
 | 
			
		||||
    const sycl::range<3> block_nums(1, 1, block_num_y);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
 | 
			
		||||
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
        stream->submit([&](sycl::handler &cgh) {
 | 
			
		||||
            cgh.parallel_for(
 | 
			
		||||
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
 | 
			
		||||
                [=](sycl::nd_item<3> item_ct1)
 | 
			
		||||
                    [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
 | 
			
		||||
                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
 | 
			
		||||
                        mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
 | 
			
		||||
                            vx, vy, dst, ncols, nrows, item_ct1);
 | 
			
		||||
                    });
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user