mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-10 10:27:03 +00:00
sycl: add usage of enqueue_functions extension (#14244)
* Add header and namespace to use enqueue_functions extension * Convert submit and parallel_for to use new extension in convert.cpp * Convert submit and parallel_for to use extension in ggml-sycl.cpp * Convert submit and parallel_for to use extension in gla.cpp * Convert submit and parallel_for in mmq.cpp * Convert submit and parallel_for in mmvq.cpp * Convert submit and parallel_for in remaining files * Convert all simple parallel_for to nd_launch from enqueue_functions extension * Wrapping extension in general function Create a general function that enable the enqueue_functions extension if it is enable in the compiler, otherwise call the general SYCL function to launch kernels. --------- Signed-off-by: nscipione <nicolo.scipione@codeplay.com>
This commit is contained in:
@@ -1887,13 +1887,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
const size_t shared_mem = ncols_pad * sizeof(int);
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
||||
sycl::range<1>(shared_mem), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
|
||||
x, dst, ncols, ncols_pad, item_ct1,
|
||||
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
||||
@@ -1901,13 +1900,12 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
});
|
||||
});
|
||||
} else if (order == GGML_SORT_ORDER_DESC) {
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
|
||||
sycl::range<1>(shared_mem), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
|
||||
x, dst, ncols, ncols_pad, item_ct1,
|
||||
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
|
||||
@@ -1925,50 +1923,47 @@ static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
|
||||
const sycl::range<3> block_nums(1, nrows, 1);
|
||||
const size_t shared_mem = 256 * sizeof(float);
|
||||
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<float, 1> shared_data(
|
||||
sycl::range<1>(shared_mem/sizeof(float)), cgh);
|
||||
sycl::local_accessor<int, 1> shared_indices(
|
||||
sycl::range<1>(shared_mem/sizeof(float)), cgh);
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int row = item_ct1.get_global_id(1);
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
const int row = item_ct1.get_global_id(1);
|
||||
|
||||
float max_val = -INFINITY;
|
||||
int max_idx = -1;
|
||||
float max_val = -INFINITY;
|
||||
int max_idx = -1;
|
||||
|
||||
for (int col = tid; col < ncols; col += 256) {
|
||||
float val = x[row * ncols + col];
|
||||
if (val > max_val) {
|
||||
max_val = val;
|
||||
max_idx = col;
|
||||
for (int col = tid; col < ncols; col += 256) {
|
||||
float val = x[row * ncols + col];
|
||||
if (val > max_val) {
|
||||
max_val = val;
|
||||
max_idx = col;
|
||||
}
|
||||
}
|
||||
|
||||
shared_data[tid] = max_val;
|
||||
shared_indices[tid] = max_idx;
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
|
||||
for (int stride = 256 / 2; stride > 0; stride >>= 1) {
|
||||
if (tid < stride) {
|
||||
float val1 = shared_data[tid];
|
||||
float val2 = shared_data[tid + stride];
|
||||
if (val2 > val1) {
|
||||
shared_data[tid] = val2;
|
||||
shared_indices[tid] = shared_indices[tid + stride];
|
||||
}
|
||||
}
|
||||
|
||||
shared_data[tid] = max_val;
|
||||
shared_indices[tid] = max_idx;
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
}
|
||||
|
||||
for (int stride = 256/2; stride > 0; stride >>= 1) {
|
||||
if (tid < stride) {
|
||||
float val1 = shared_data[tid];
|
||||
float val2 = shared_data[tid + stride];
|
||||
if (val2 > val1) {
|
||||
shared_data[tid] = val2;
|
||||
shared_indices[tid] = shared_indices[tid + stride];
|
||||
}
|
||||
}
|
||||
item_ct1.barrier(sycl::access::fence_space::local_space);
|
||||
}
|
||||
|
||||
|
||||
if (tid == 0) {
|
||||
dst[row] = shared_indices[0];
|
||||
}
|
||||
});
|
||||
if (tid == 0) {
|
||||
dst[row] = shared_indices[0];
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
static void diag_mask_inf_f32_sycl(const float *x, float *dst,
|
||||
@@ -2952,7 +2947,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
|
||||
void ** ptrs_dst_get = ptrs_dst.get();
|
||||
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
|
||||
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(cgh, sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
|
||||
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
|
||||
});
|
||||
@@ -3456,7 +3451,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, 768u));
|
||||
sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
sycl::local_accessor<int, 0> src1_row_acc(cgh);
|
||||
|
||||
char *__restrict src1_contiguous_get =
|
||||
@@ -3468,9 +3463,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
size_t ids_nb_ct6 = ids->nb[1];
|
||||
size_t ids_nb_ct7 = ids->nb[0];
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_src1_to_contiguous(
|
||||
src1_original, src1_contiguous_get,
|
||||
dev_cur_src1_row_get,
|
||||
@@ -3501,15 +3495,14 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
|
||||
{
|
||||
sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, 768u));
|
||||
sycl::range<3> grid_dims(1, 1, num_src1_rows);
|
||||
stream->submit([&](sycl::handler &cgh) {
|
||||
sycl_launch(stream, [&](sycl::handler & cgh) {
|
||||
const char *__restrict dst_contiguous_get =
|
||||
dst_contiguous.get();
|
||||
const mmid_row_mapping *__restrict dev_row_mapping_get =
|
||||
dev_row_mapping.get();
|
||||
|
||||
cgh.parallel_for(
|
||||
sycl::nd_range<3>(grid_dims * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
sycl_parallel_for(
|
||||
cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
|
||||
k_copy_dst_from_contiguous(dst_original,
|
||||
dst_contiguous_get,
|
||||
dev_row_mapping_get,
|
||||
|
||||
Reference in New Issue
Block a user