mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	[SYCL] Fix SYCL im2col and convert Overflow with Large Dims (#9052)
				
					
				
			* sycl: fix im2col overflow and sync with cuda Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: fix convert overflow Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: fix convert and dequantize Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: fix ib in dmmv Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl:refine convert Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: move downsample global_range into common Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * test: add im2col and convert test cases Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * test: make new cases only in sycl Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * test: comment new test_cases for only local testing Signed-off-by: zhentaoyu <zhentao.yu@intel.com> --------- Signed-off-by: zhentaoyu <zhentao.yu@intel.com>
This commit is contained in:
		@@ -3,19 +3,19 @@
 | 
			
		||||
#include "presets.hpp"
 | 
			
		||||
 | 
			
		||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 | 
			
		||||
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
 | 
			
		||||
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
 | 
			
		||||
                             const sycl::nd_item<3> &item_ct1) {
 | 
			
		||||
    const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
 | 
			
		||||
    const int64_t i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
 | 
			
		||||
                       item_ct1.get_local_id(2));
 | 
			
		||||
 | 
			
		||||
    if (i >= k) {
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const int ib = i/qk; // block index
 | 
			
		||||
    const int iqs = (i%qk)/qr; // quant index
 | 
			
		||||
    const int iybs = i - i%qk; // y block start index
 | 
			
		||||
    const int y_offset = qr == 1 ? 1 : qk/2;
 | 
			
		||||
    const int64_t ib = i/qk; // block index
 | 
			
		||||
    const int64_t iqs = (i%qk)/qr; // quant index
 | 
			
		||||
    const int64_t iybs = i - i%qk; // y block start index
 | 
			
		||||
    const int64_t y_offset = qr == 1 ? 1 : qk/2;
 | 
			
		||||
 | 
			
		||||
    // dequantize
 | 
			
		||||
    dfloat2 v;
 | 
			
		||||
@@ -27,9 +27,9 @@ static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__
 | 
			
		||||
 | 
			
		||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 | 
			
		||||
static void dequantize_block_sycl(const void *__restrict__ vx,
 | 
			
		||||
                                  dst_t *__restrict__ y, const int k,
 | 
			
		||||
                                  dst_t *__restrict__ y, const int64_t k,
 | 
			
		||||
                                  dpct::queue_ptr stream) {
 | 
			
		||||
    const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
 | 
			
		||||
    const int64_t num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -45,9 +45,9 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                     dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
#if QK_K == 256
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
@@ -77,9 +77,9 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                     dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
#if QK_K == 256
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
@@ -108,10 +108,10 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                     dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb32 = k / 32;
 | 
			
		||||
    const int nb = (k + 255) / 256;
 | 
			
		||||
    const int64_t nb32 = k / 32;
 | 
			
		||||
    const int64_t nb = (k + 255) / 256;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -126,10 +126,10 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                     dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb32 = k / 32;
 | 
			
		||||
    const int nb = (k + 255) / 256;
 | 
			
		||||
    const int64_t nb32 = k / 32;
 | 
			
		||||
    const int64_t nb = (k + 255) / 256;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -145,9 +145,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                     dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -165,9 +165,9 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                     dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
#if QK_K == 256
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
@@ -197,9 +197,9 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                     dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
#if QK_K == 256
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
@@ -229,9 +229,9 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                        dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -250,9 +250,9 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                        dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -271,9 +271,9 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                        dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -292,9 +292,9 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                       dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -313,9 +313,9 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                      dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -333,9 +333,9 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                        dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -354,9 +354,9 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                        dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = k / QK_K;
 | 
			
		||||
    const int64_t nb = k / QK_K;
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
@@ -374,9 +374,9 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                       dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = (k + QK_K - 1) / QK_K;
 | 
			
		||||
    const int64_t nb = (k + QK_K - 1) / QK_K;
 | 
			
		||||
#if QK_K == 64
 | 
			
		||||
    dequantize_row_iq4_nl_sycl(vx, y, k, stream);
 | 
			
		||||
#else
 | 
			
		||||
@@ -398,9 +398,9 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename dst_t>
 | 
			
		||||
static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k,
 | 
			
		||||
                                       dpct::queue_ptr stream) {
 | 
			
		||||
    const int nb = (k + QK_K - 1) / QK_K;
 | 
			
		||||
    const int64_t nb = (k + QK_K - 1) / QK_K;
 | 
			
		||||
      {
 | 
			
		||||
            dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                         {sycl::aspect::fp16});
 | 
			
		||||
@@ -418,34 +418,34 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename src_t, typename dst_t>
 | 
			
		||||
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
 | 
			
		||||
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
 | 
			
		||||
                          const sycl::nd_item<3> &item_ct1) {
 | 
			
		||||
    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
 | 
			
		||||
                  item_ct1.get_local_id(2);
 | 
			
		||||
 | 
			
		||||
    if (i >= k) {
 | 
			
		||||
        return;
 | 
			
		||||
    }
 | 
			
		||||
    const int64_t work_group_size = item_ct1.get_local_range(2);
 | 
			
		||||
    const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
 | 
			
		||||
 | 
			
		||||
    // make each work-item deal with more elements since sycl global range can not exceed max int
 | 
			
		||||
    const src_t * x = (src_t *) vx;
 | 
			
		||||
 | 
			
		||||
    y[i] = x[i];
 | 
			
		||||
    for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
 | 
			
		||||
        y[i] = x[i];
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename src_t, typename dst_t>
 | 
			
		||||
static void convert_unary_sycl(const void *__restrict__ vx,
 | 
			
		||||
                               dst_t *__restrict__ y, const int k,
 | 
			
		||||
                               dst_t *__restrict__ y, const int64_t k,
 | 
			
		||||
                               dpct::queue_ptr stream) {
 | 
			
		||||
    const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
 | 
			
		||||
    const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
 | 
			
		||||
 | 
			
		||||
    // decrease global range when it exceeds the max int
 | 
			
		||||
    int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
 | 
			
		||||
    sycl::range<3> block_nums(1, 1, num_blocks);
 | 
			
		||||
    sycl::range<3> local_range(1, 1, local_size);
 | 
			
		||||
    {
 | 
			
		||||
        dpct::has_capability_or_fail(stream->get_device(),
 | 
			
		||||
                                     {sycl::aspect::fp16});
 | 
			
		||||
 | 
			
		||||
        stream->parallel_for(
 | 
			
		||||
            sycl::nd_range<3>(
 | 
			
		||||
                sycl::range<3>(1, 1, num_blocks) *
 | 
			
		||||
                    sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
 | 
			
		||||
                sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
 | 
			
		||||
            sycl::nd_range<3>(block_nums * local_range, local_range),
 | 
			
		||||
            [=](sycl::nd_item<3> item_ct1) {
 | 
			
		||||
                convert_unary<src_t>(vx, y, k, item_ct1);
 | 
			
		||||
            });
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user