mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-18 11:46:58 +00:00
sycl: add usage of enqueue_functions extension (#14244)
* Add header and namespace to use enqueue_functions extension * Convert submit and parallel_for to use new extension in convert.cpp * Convert submit and parallel_for to use extension in ggml-sycl.cpp * Convert submit and parallel_for to use extension in gla.cpp * Convert submit and parallel_for in mmq.cpp * Convert submit and parallel_for in mmvq.cpp * Convert submit and parallel_for in remaining files * Convert all simple parallel_for to nd_launch from enqueue_functions extension * Wrapping extension in general function Create a general function that enable the enqueue_functions extension if it is enable in the compiler, otherwise call the general SYCL function to launch kernels. --------- Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
This commit is contained in:
@@ -1818,7 +1818,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
|
||||
@@ -1829,9 +1829,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -1853,7 +1852,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
|
||||
@@ -1864,9 +1863,8 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -1933,7 +1931,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
|
||||
@@ -1944,9 +1942,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -1968,7 +1965,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
|
||||
@@ -1979,9 +1976,8 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2048,7 +2044,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
|
||||
@@ -2059,9 +2055,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2083,7 +2078,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
|
||||
@@ -2094,9 +2089,8 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2163,7 +2157,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
|
||||
@@ -2174,9 +2168,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2198,7 +2191,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
|
||||
@@ -2209,9 +2202,8 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_1<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2278,7 +2270,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
|
||||
@@ -2289,9 +2281,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2313,7 +2304,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
|
||||
@@ -2324,9 +2315,8 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q8_0<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2393,7 +2383,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
|
||||
@@ -2406,9 +2396,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2431,7 +2420,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
|
||||
@@ -2444,9 +2433,8 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q2_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2516,7 +2504,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
|
||||
@@ -2531,9 +2519,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2557,7 +2544,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
|
||||
@@ -2572,9 +2559,8 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q3_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2644,7 +2630,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
|
||||
@@ -2657,9 +2643,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2682,7 +2667,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
|
||||
@@ -2695,9 +2680,8 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q4_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2765,7 +2749,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
|
||||
@@ -2778,9 +2762,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2803,7 +2786,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
|
||||
@@ -2816,9 +2799,8 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q5_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2886,7 +2868,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
|
||||
@@ -2899,9 +2881,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
@@ -2924,7 +2905,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
dpct::has_capability_or_fail(stream->get_device(),
|
||||
{sycl::aspect::fp16});
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
|
||||
sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
|
||||
sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
|
||||
@@ -2937,9 +2918,8 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
|
||||
sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
mul_mat_q6_K<need_check>(
|
||||
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
|
||||
nrows_dst, item_ct1,
|
||||
|
||||
Reference in New Issue
Block a user