sycl: add usage of enqueue_functions extension (#14244)

* Add header and namespace to use enqueue_functions extension

* Convert submit and parallel_for to use new extension in convert.cpp

* Convert submit and parallel_for to use extension in ggml-sycl.cpp

* Convert submit and parallel_for to use extension in gla.cpp

* Convert submit and parallel_for in mmq.cpp

* Convert submit and parallel_for in mmvq.cpp

* Convert submit and parallel_for in remaining files

* Convert all simple parallel_for to nd_launch from enqueue_functions
extension

* Wrapping extension in general function

Create a general function that uses the enqueue_functions extension if
it is enabled in the compiler, and otherwise calls the general SYCL
function to launch kernels.

---------

Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
This commit is contained in:
Nicolò Scipione
2025-06-20 15:07:21 +02:00
committed by GitHub
parent 6369be0735
commit 8308f98c7f
19 changed files with 750 additions and 986 deletions

View File

@@ -1818,7 +1818,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1829,9 +1829,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -1853,7 +1852,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1864,9 +1863,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -1933,7 +1931,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1944,9 +1942,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -1968,7 +1965,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1979,9 +1976,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2048,7 +2044,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2059,9 +2055,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2083,7 +2078,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2094,9 +2089,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2163,7 +2157,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2174,9 +2168,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2198,7 +2191,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2209,9 +2202,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2278,7 +2270,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2289,9 +2281,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2313,7 +2304,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2324,9 +2315,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2393,7 +2383,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2406,9 +2396,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2431,7 +2420,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2444,9 +2433,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2516,7 +2504,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2531,9 +2519,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2557,7 +2544,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2572,9 +2559,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2644,7 +2630,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2657,9 +2643,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2682,7 +2667,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2695,9 +2680,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2765,7 +2749,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2778,9 +2762,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2803,7 +2786,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2816,9 +2799,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2886,7 +2868,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2899,9 +2881,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,
@@ -2924,7 +2905,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->submit([&](sycl::handler &cgh) {
sycl_launch(stream, [&](sycl::handler & cgh) {
sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2937,9 +2918,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
sycl_parallel_for(
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1,