[SYCL] Optimize mul_mat for Q4_0 on Intel GPU (#12035)
* optimize performance by reordering Q4_0 data for Intel GPUs
* detect the hardware type, save the opt feature, and print it
* correct naming
* optimize the graph once when computing the graph, recording the opt status in tensor->extra; makes CI pass
* add the GGML_SYCL_DISABLE_OPT environment variable for debugging
* use syclex::architecture instead of the custom hardware define; update the guide for GGML_SYCL_DISABLE_OPT
* add performance data
* move the getrows functions to separate files
* fix global variables

Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
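For orientation, here is a sketch of the data layout the reorder implies, inferred from the pointer arithmetic in the kernel below (d_ptr = vx + ncols*nrows/2). The reordered layout shown is an assumption for illustration, not the repository's authoritative definition:

#include <cstdint>

constexpr int QK4_0 = 32;

// standard ggml Q4_0: scale and quants interleaved, one small block at a time
struct block_q4_0 {
    uint16_t d;             // fp16 block scale (raw half bits here for illustration; ggml uses ggml_half)
    uint8_t  qs[QK4_0 / 2]; // 32 4-bit quants, two packed per byte
};

// reordered Q4_0 (assumed from the kernel's pointer arithmetic): for an
// nrows x ncols weight tensor, all packed quants come first, then all scales:
//
//   [ qs region: ncols*nrows/2 bytes ][ d region: ncols*nrows/QK4_0 halves ]
//
// so the quants can be read with contiguous loads, and the scale for block ib
// sits at d_region[ib], which is what d_ptr in the kernel below points at.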
@@ -3,7 +3,6 @@
 #include "dequantize.hpp"
 #include "presets.hpp"
 
 static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const sycl::half *x = (const sycl::half *)vx;
@@ -91,6 +90,112 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
     }
 }
 
+template <int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_reorder>
+static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
+                                           const sycl::nd_item<3> &item_ct1) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int tid = item_ct1.get_local_id(2);
+
+    const int ncols_left    = ncols % (QK4_0 * WARP_SIZE);
+    const int ncols_align   = ncols - ncols_left;
+    const int iter_stride   = 8 * 2 * GGML_SYCL_DMMV_X;
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread per i iter
+    const int y_offset      = qr == 1 ? 1 : qk / 2;
+
+    // partial sum for each thread
+#ifdef GGML_SYCL_F16
+    sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_SYCL_F16
+
+    // in the reordered layout the fp16 scales are stored after the packed quants
+    const char *d_ptr = (const char *) vx + ncols * nrows / 2;
+
+    int i = 0;
+    for (i = 0; i < ncols_align; i += iter_stride) {
+        const int col  = i + vals_per_iter * tid;
+        const int ib   = (row * ncols + col) / qk; // x block index
+        const int iqs  = (col % qk) / qr;          // x quant index
+        const int iybs = col - col % qk;           // y block start index
+
+        // processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel_reorder((const void *) d_ptr, ib, (const void *) vx, ib * QK4_0 / 2 + iqs + j / qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                       y[iybs + iqs + j / qr + y_offset]};
+
+            tmp += v * t1;
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_SYCL_F16
+        }
+    }
+
+    // handle the tail columns that do not fill a full QK4_0*WARP_SIZE stride
+    for (; i < ncols; i += iter_stride) {
+        if (tid >= ncols_left / QK4_0) continue;
+        const int col  = i + vals_per_iter * tid;
+        const int ib   = (row * ncols + col) / qk; // x block index
+        const int iqs  = (col % qk) / qr;          // x quant index
+        const int iybs = col - col % qk;           // y block start index
+
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel_reorder((const void *) d_ptr, ib, (const void *) vx, ib * QK4_0 / 2 + iqs + j / qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                       y[iybs + iqs + j / qr + y_offset]};
+
+            tmp += v * t1;
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_SYCL_F16
+        }
+    }
+
+    // sum up partial sums and write back result
+    const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2;
+    for (int mask = mask_start; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+#ifdef GGML_SYCL_F16
+        dst[row] = tmp.x() + tmp.y();
+#else
+        dst[row] = tmp;
+#endif // GGML_SYCL_F16
+    }
+}
 
 static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
                                          float *dst, const int ncols,
                                          const int nrows,
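To make the indexing in dequantize_mul_mat_vec_reorder concrete, here is a minimal sketch of what a reorder-aware dequantizer such as dequantize_q4_0_reorder could look like, matching the call signature used above. It is an illustration under the assumed layout, not the repository's implementation; the function name and body are hypothetical:

static void dequantize_q4_0_reorder_sketch(const void * d_ptr, const int64_t ib,
                                           const void * qs_ptr, const int iqs,
                                           dfloat2 & v) {
    const uint8_t    * qs = (const uint8_t *) qs_ptr;    // packed quant region (start of vx)
    const sycl::half * d  = (const sycl::half *) d_ptr;  // scale region (after the quants)

    const float   scale = (float) d[ib]; // one fp16 scale per 32-weight block
    const uint8_t q     = qs[iqs];       // two 4-bit quants packed in one byte

    // unpack both nibbles and apply the Q4_0 offset of 8
    v.x() = ((q & 0x0F) - 8.0f) * scale;
    v.y() = ((q >> 4)   - 8.0f) * scale;
}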
@@ -759,6 +864,28 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
     }
 }
 
+static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y,
+                                                     float *dst, const int ncols,
+                                                     const int nrows,
+                                                     dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    // the number of rows may exceed the maximum grid size in the y or z dimensions, so use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
 
 
 static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
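As a quick sanity check on the launch configuration above, assuming the preset values GGML_SYCL_MMV_Y = 1 and WARP_SIZE = 32 (assumptions here, taken from typical ggml-sycl presets, not from this diff):

#include <cstdio>

int main() {
    const int mmv_y     = 1;    // GGML_SYCL_MMV_Y (assumed preset value)
    const int warp_size = 32;   // WARP_SIZE (assumed preset value)
    const int nrows     = 4096; // example weight-row count

    // same rounding-up division as dequantize_mul_mat_vec_q4_0_sycl_reorder
    const int block_num_y = (nrows + mmv_y - 1) / mmv_y;

    // prints: 4096 work-groups of 32 work-items, i.e. one sub-group per row
    std::printf("%d work-groups of %d work-items\n", block_num_y, mmv_y * warp_size);
    return 0;
}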
@@ -953,7 +1080,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
 
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_SYCL_F16
@@ -967,7 +1093,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
 
     if (src1_convert_f16) {
         src1_dfloat = src1_dfloat_a.alloc(ne00);
-        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+        const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
         GGML_ASSERT(to_fp16_sycl != nullptr);
         to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream);
     }
@@ -977,7 +1103,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            } else {
+                dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            }
             break;
         case GGML_TYPE_Q4_1:
             dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
@@ -1020,4 +1151,5 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_ddq_i);
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
+    GGML_UNUSED(ctx);
 }
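The commit message also mentions a GGML_SYCL_DISABLE_OPT environment variable for debugging. Below is a hedged sketch of how such a switch could gate the reorder path; the variable name comes from the commit message, while the helper itself is hypothetical:

#include <cstdlib>

// hypothetical helper: returns true when the user has opted out of the
// reorder optimization via GGML_SYCL_DISABLE_OPT
static bool ggml_sycl_opt_disabled() {
    static const bool disabled = [] {
        const char * v = std::getenv("GGML_SYCL_DISABLE_OPT");
        return v != nullptr && v[0] != '\0' && v[0] != '0';
    }();
    return disabled;
}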