mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	SYCL: Add set_rows support for quantized types (#14883)
* SYCL: Add set_rows support for quantized types This commit adds support for GGML_OP_SET_ROWS operation for various quantized tensor types (Q8_0, Q5_1, Q5_0, Q4_1, Q4_0, IQ4_NL) and BF16 type in the SYCL backend. The quantization/dequantization copy kernels were moved from cpy.cpp to cpy.hpp to make them available for set_rows.cpp. This addresses part of the TODOs mentioned in the code. * Use get_global_linear_id() instead ggml-ci * Fix formatting ggml-ci * Use const for ne11 and size_t variables in set_rows_sycl_q ggml-ci * Increase block size for q kernel to 256 ggml-ci * Cleanup imports * Add float.h to cpy.hpp
This commit is contained in:
		@@ -1,31 +1,12 @@
 | 
			
		||||
#include "cpy.hpp"
 | 
			
		||||
 | 
			
		||||
#include <float.h>
 | 
			
		||||
#include <string>
 | 
			
		||||
 | 
			
		||||
#include "dequantize.hpp"
 | 
			
		||||
#include "ggml-sycl/common.hpp"
 | 
			
		||||
#include "ggml-sycl/presets.hpp"
 | 
			
		||||
#include "ggml.h"
 | 
			
		||||
 | 
			
		||||
static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
 | 
			
		||||
    if (x <= val[0]) {
 | 
			
		||||
        return 0;
 | 
			
		||||
    }
 | 
			
		||||
    if (x >= val[n - 1]) {
 | 
			
		||||
        return n - 1;
 | 
			
		||||
    }
 | 
			
		||||
    int ml = 0, mu = n - 1;
 | 
			
		||||
    while (mu - ml > 1) {
 | 
			
		||||
        int mav = (ml + mu) / 2;
 | 
			
		||||
        if (x < val[mav]) {
 | 
			
		||||
            mu = mav;
 | 
			
		||||
        } else {
 | 
			
		||||
            ml = mav;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
@@ -97,28 +78,6 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne
 | 
			
		||||
    cpy_1(cx + x_offset, cdst + dst_offset);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q8_0 *  dsti = (block_q8_0 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;  // absolute max
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK8_0; j++) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        amax          = sycl::fmax(amax, sycl::fabs((float) v));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = amax / ((1 << 7) - 1);
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->d = d;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK8_0; ++j) {
 | 
			
		||||
        const float x0 = xi[j] * id;
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = sycl::round((float) x0);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* quantized type same copy */
 | 
			
		||||
template<typename T>
 | 
			
		||||
@@ -140,178 +99,7 @@ static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q4_0 *  dsti = (block_q4_0 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;
 | 
			
		||||
    float vmax = 0.0f;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_0; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        if (amax < sycl::fabs((float) v)) {
 | 
			
		||||
            amax = sycl::fabs((float) v);
 | 
			
		||||
            vmax = v;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = vmax / -8;
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->d = d;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_0 / 2; ++j) {
 | 
			
		||||
        const float x0 = xi[0 + j] * id;
 | 
			
		||||
        const float x1 = xi[QK4_0 / 2 + j] * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
 | 
			
		||||
        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = xi0;
 | 
			
		||||
        dsti->qs[j] |= xi1 << 4;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q4_1 *  dsti = (block_q4_1 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float vmin = FLT_MAX;
 | 
			
		||||
    float vmax = -FLT_MAX;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_1; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
 | 
			
		||||
        if (v < vmin) {
 | 
			
		||||
            vmin = v;
 | 
			
		||||
        }
 | 
			
		||||
        if (v > vmax) {
 | 
			
		||||
            vmax = v;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = (vmax - vmin) / ((1 << 4) - 1);
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->dm.x() = d;
 | 
			
		||||
    dsti->dm.y() = vmin;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_1 / 2; ++j) {
 | 
			
		||||
        const float x0 = (xi[0 + j] - vmin) * id;
 | 
			
		||||
        const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
 | 
			
		||||
        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = xi0;
 | 
			
		||||
        dsti->qs[j] |= xi1 << 4;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q5_0 *  dsti = (block_q5_0 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;
 | 
			
		||||
    float vmax = 0.0f;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK5_0; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        if (amax < sycl::fabs((float) v)) {
 | 
			
		||||
            amax = sycl::fabs((float) v);
 | 
			
		||||
            vmax = v;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = vmax / -16;
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->d = d;
 | 
			
		||||
 | 
			
		||||
    uint32_t qh = 0;
 | 
			
		||||
    for (int j = 0; j < QK5_0 / 2; ++j) {
 | 
			
		||||
        const float x0 = xi[0 + j] * id;
 | 
			
		||||
        const float x1 = xi[QK5_0 / 2 + j] * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
 | 
			
		||||
        const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
 | 
			
		||||
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
 | 
			
		||||
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
 | 
			
		||||
    }
 | 
			
		||||
    memcpy(dsti->qh, &qh, sizeof(qh));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q5_1 *  dsti = (block_q5_1 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float min = xi[0];
 | 
			
		||||
    float max = xi[0];
 | 
			
		||||
 | 
			
		||||
    for (int j = 1; j < QK5_1; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        min           = v < min ? v : min;
 | 
			
		||||
        max           = v > max ? v : max;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = (max - min) / 31;
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->dm.x() = d;
 | 
			
		||||
    dsti->dm.y() = min;
 | 
			
		||||
 | 
			
		||||
    uint32_t qh = 0;
 | 
			
		||||
    for (int j = 0; j < QK5_1 / 2; ++j) {
 | 
			
		||||
        const float x0 = (xi[0 + j] - min) * id;
 | 
			
		||||
        const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
 | 
			
		||||
        const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
 | 
			
		||||
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
 | 
			
		||||
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
 | 
			
		||||
    }
 | 
			
		||||
    memcpy(dsti->qh, &qh, sizeof(qh));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float *  xi   = (const float *) cxi;
 | 
			
		||||
    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;
 | 
			
		||||
    float vmax = 0.0f;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_NL; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        if (amax < sycl::fabs((float) v)) {
 | 
			
		||||
            amax = sycl::fabs((float) v);
 | 
			
		||||
            vmax = v;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    float       d  = vmax / kvalues_iq4nl[0];
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    float sumqx = 0, sumq2 = 0;
 | 
			
		||||
    for (int j = 0; j < QK4_NL / 2; ++j) {
 | 
			
		||||
        const float   x0  = xi[0 + j] * id;
 | 
			
		||||
        const float   x1  = xi[QK4_NL / 2 + j] * id;
 | 
			
		||||
        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
 | 
			
		||||
        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
 | 
			
		||||
        dsti->qs[j]       = xi0 | (xi1 << 4);
 | 
			
		||||
        const float v0    = kvalues_iq4nl[xi0];
 | 
			
		||||
        const float v1    = kvalues_iq4nl[xi1];
 | 
			
		||||
        const float w0    = xi[0 + j] * xi[0 + j];
 | 
			
		||||
        const float w1    = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
 | 
			
		||||
        sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
 | 
			
		||||
        sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const char * cxi, char * cdsti) {
 | 
			
		||||
    float * cdstf = (float *) (cdsti);
 | 
			
		||||
 
 | 
			
		||||
@@ -2,10 +2,222 @@
 | 
			
		||||
#define GGML_SYCL_CPY_HPP
 | 
			
		||||
 | 
			
		||||
#include "common.hpp"
 | 
			
		||||
#include <float.h>
 | 
			
		||||
 | 
			
		||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 | 
			
		||||
 | 
			
		||||
__dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
 | 
			
		||||
    if (x <= val[0]) {
 | 
			
		||||
        return 0;
 | 
			
		||||
    }
 | 
			
		||||
    if (x >= val[n - 1]) {
 | 
			
		||||
        return n - 1;
 | 
			
		||||
    }
 | 
			
		||||
    int ml = 0, mu = n - 1;
 | 
			
		||||
    while (mu - ml > 1) {
 | 
			
		||||
        int mav = (ml + mu) / 2;
 | 
			
		||||
        if (x < val[mav]) {
 | 
			
		||||
            mu = mav;
 | 
			
		||||
        } else {
 | 
			
		||||
            ml = mav;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q8_0 *  dsti = (block_q8_0 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;  // absolute max
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK8_0; j++) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        amax          = sycl::fmax(amax, sycl::fabs((float) v));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = amax / ((1 << 7) - 1);
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->d = d;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK8_0; ++j) {
 | 
			
		||||
        const float x0 = xi[j] * id;
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = sycl::round((float) x0);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q4_0 *  dsti = (block_q4_0 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;
 | 
			
		||||
    float vmax = 0.0f;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_0; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        if (amax < sycl::fabs((float) v)) {
 | 
			
		||||
            amax = sycl::fabs((float) v);
 | 
			
		||||
            vmax = v;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = vmax / -8;
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->d = d;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_0 / 2; ++j) {
 | 
			
		||||
        const float x0 = xi[0 + j] * id;
 | 
			
		||||
        const float x1 = xi[QK4_0 / 2 + j] * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
 | 
			
		||||
        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = xi0;
 | 
			
		||||
        dsti->qs[j] |= xi1 << 4;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q4_1 *  dsti = (block_q4_1 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float vmin = FLT_MAX;
 | 
			
		||||
    float vmax = -FLT_MAX;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_1; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
 | 
			
		||||
        vmin = sycl::min(v, vmin);
 | 
			
		||||
        vmax = sycl::max(v, vmax);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = (vmax - vmin) / ((1 << 4) - 1);
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->dm.x() = d;
 | 
			
		||||
    dsti->dm.y() = vmin;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_1 / 2; ++j) {
 | 
			
		||||
        const float x0 = (xi[0 + j] - vmin) * id;
 | 
			
		||||
        const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
 | 
			
		||||
        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = xi0;
 | 
			
		||||
        dsti->qs[j] |= xi1 << 4;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q5_0 *  dsti = (block_q5_0 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;
 | 
			
		||||
    float vmax = 0.0f;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK5_0; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        if (amax < sycl::fabs((float) v)) {
 | 
			
		||||
            amax = sycl::fabs((float) v);
 | 
			
		||||
            vmax = v;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = vmax / -16;
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->d = d;
 | 
			
		||||
 | 
			
		||||
    uint32_t qh = 0;
 | 
			
		||||
    for (int j = 0; j < QK5_0 / 2; ++j) {
 | 
			
		||||
        const float x0 = xi[0 + j] * id;
 | 
			
		||||
        const float x1 = xi[QK5_0 / 2 + j] * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f));
 | 
			
		||||
        const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f));
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
 | 
			
		||||
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
 | 
			
		||||
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2);
 | 
			
		||||
    }
 | 
			
		||||
    memcpy(dsti->qh, &qh, sizeof(qh));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float * xi   = (const float *) cxi;
 | 
			
		||||
    block_q5_1 *  dsti = (block_q5_1 *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float min = xi[0];
 | 
			
		||||
    float max = xi[0];
 | 
			
		||||
 | 
			
		||||
    for (int j = 1; j < QK5_1; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        min           = v < min ? v : min;
 | 
			
		||||
        max           = v > max ? v : max;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const float d  = (max - min) / 31;
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    dsti->dm.x() = d;
 | 
			
		||||
    dsti->dm.y() = min;
 | 
			
		||||
 | 
			
		||||
    uint32_t qh = 0;
 | 
			
		||||
    for (int j = 0; j < QK5_1 / 2; ++j) {
 | 
			
		||||
        const float x0 = (xi[0 + j] - min) * id;
 | 
			
		||||
        const float x1 = (xi[QK5_1 / 2 + j] - min) * id;
 | 
			
		||||
 | 
			
		||||
        const uint8_t xi0 = (uint8_t) (x0 + 0.5f);
 | 
			
		||||
        const uint8_t xi1 = (uint8_t) (x1 + 0.5f);
 | 
			
		||||
 | 
			
		||||
        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
 | 
			
		||||
        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
 | 
			
		||||
        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2);
 | 
			
		||||
    }
 | 
			
		||||
    memcpy(dsti->qh, &qh, sizeof(qh));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
 | 
			
		||||
    const float *  xi   = (const float *) cxi;
 | 
			
		||||
    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
 | 
			
		||||
 | 
			
		||||
    float amax = 0.0f;
 | 
			
		||||
    float vmax = 0.0f;
 | 
			
		||||
 | 
			
		||||
    for (int j = 0; j < QK4_NL; ++j) {
 | 
			
		||||
        const float v = xi[j];
 | 
			
		||||
        if (amax < sycl::fabs((float) v)) {
 | 
			
		||||
            amax = sycl::fabs((float) v);
 | 
			
		||||
            vmax = v;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    float       d  = vmax / kvalues_iq4nl[0];
 | 
			
		||||
    const float id = d ? 1.0f / d : 0.0f;
 | 
			
		||||
 | 
			
		||||
    float sumqx = 0, sumq2 = 0;
 | 
			
		||||
    for (int j = 0; j < QK4_NL / 2; ++j) {
 | 
			
		||||
        const float   x0  = xi[0 + j] * id;
 | 
			
		||||
        const float   x1  = xi[QK4_NL / 2 + j] * id;
 | 
			
		||||
        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
 | 
			
		||||
        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
 | 
			
		||||
        dsti->qs[j]       = xi0 | (xi1 << 4);
 | 
			
		||||
        const float v0    = kvalues_iq4nl[xi0];
 | 
			
		||||
        const float v1    = kvalues_iq4nl[xi1];
 | 
			
		||||
        const float w0    = xi[0 + j] * xi[0 + j];
 | 
			
		||||
        const float w1    = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j];
 | 
			
		||||
        sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j];
 | 
			
		||||
        sumq2 += w0 * v0 * v0 + w1 * v1 * v1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    dsti->d = sumq2 > 0 ? sumqx / sumq2 : d;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1);
 | 
			
		||||
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 | 
			
		||||
 | 
			
		||||
#endif // GGML_SYCL_CPY_HPP
 | 
			
		||||
#endif  // GGML_SYCL_CPY_HPP
 | 
			
		||||
 
 | 
			
		||||
@@ -4229,11 +4229,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
 | 
			
		||||
            }
 | 
			
		||||
        case GGML_OP_SET_ROWS:
 | 
			
		||||
            {
 | 
			
		||||
                // TODO: add support
 | 
			
		||||
                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
 | 
			
		||||
#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
 | 
			
		||||
                return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
 | 
			
		||||
            } break;
 | 
			
		||||
                return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
 | 
			
		||||
                         op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 ||
 | 
			
		||||
                         op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) &&
 | 
			
		||||
                        (op->src[1]->type == GGML_TYPE_I64));
 | 
			
		||||
            }
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_OP_CPY:
 | 
			
		||||
            {
 | 
			
		||||
                ggml_type src0_type = op->src[0]->type;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,5 @@
 | 
			
		||||
#include "set_rows.hpp"
 | 
			
		||||
#include "cpy.hpp"
 | 
			
		||||
 | 
			
		||||
namespace utils {
 | 
			
		||||
template<typename T>
 | 
			
		||||
@@ -15,6 +16,68 @@ convert (const char* src, char* dst) {
 | 
			
		||||
   *reinterpret_cast<TOut*>(dst) = dst_val;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename blockType, int qk, cpy_kernel_t cpyblck>
 | 
			
		||||
static void set_rows_sycl_q(const char * __restrict__ src0_d,
 | 
			
		||||
                            const int64_t * __restrict__ src1_d,
 | 
			
		||||
                            blockType * __restrict__ dst_d,
 | 
			
		||||
                            // tensor dimensions src0 and src1
 | 
			
		||||
                            const int64_t ne00,
 | 
			
		||||
                            const int64_t ne01,
 | 
			
		||||
                            const int64_t ne02,
 | 
			
		||||
                            const int64_t ne03,
 | 
			
		||||
                            const int64_t ne10,
 | 
			
		||||
                            const int64_t ne11,
 | 
			
		||||
                            const int64_t ne12,
 | 
			
		||||
                            const int64_t ne13,
 | 
			
		||||
                            // strides for src0
 | 
			
		||||
                            const size_t  nb00,
 | 
			
		||||
                            const size_t  nb01,
 | 
			
		||||
                            const size_t  nb02,
 | 
			
		||||
                            const size_t  nb03,
 | 
			
		||||
                            // strides for src1
 | 
			
		||||
                            const size_t  nb10,
 | 
			
		||||
                            const size_t  nb11,
 | 
			
		||||
                            const size_t  nb12,
 | 
			
		||||
                            const size_t  nb13,
 | 
			
		||||
                            // strides for dst
 | 
			
		||||
                            const size_t  nb1,
 | 
			
		||||
                            const size_t  nb2,
 | 
			
		||||
                            const size_t  nb3,
 | 
			
		||||
                            queue_ptr     stream) {
 | 
			
		||||
    const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk;
 | 
			
		||||
    constexpr int block_size   = 256;
 | 
			
		||||
    const int64_t grid_size    = ceil_div(total_blocks, block_size);
 | 
			
		||||
 | 
			
		||||
    sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
 | 
			
		||||
        const int64_t i = item_ct1.get_global_linear_id();
 | 
			
		||||
        if (i >= total_blocks) {
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
        const int64_t i_base      = i * qk;
 | 
			
		||||
        const int64_t i03         = i_base / (ne00 * ne01 * ne02);
 | 
			
		||||
        const int64_t rem1        = i_base - i03 * (ne00 * ne01 * ne02);
 | 
			
		||||
        const int64_t i02         = rem1 / (ne00 * ne01);
 | 
			
		||||
        const int64_t rem2        = rem1 - i02 * ne00 * ne01;
 | 
			
		||||
        const int64_t i01         = rem2 / ne00;
 | 
			
		||||
        const int64_t i00         = rem2 - i01 * ne00;
 | 
			
		||||
        const int64_t i12         = i03 % ne12;
 | 
			
		||||
        const int64_t i11         = i02 % ne11;
 | 
			
		||||
        const int64_t i10         = i01;
 | 
			
		||||
        const size_t  src_offset  = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 });
 | 
			
		||||
        const char *  src_block   = src0_d + src_offset + i00 * sizeof(float);
 | 
			
		||||
        const size_t  src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 });
 | 
			
		||||
        const int64_t dst_row     = src1_d[src1_offset / sizeof(int64_t)];
 | 
			
		||||
        const size_t  dst_offset =
 | 
			
		||||
            calculate_offset<3>({ nb1, nb2, nb3 }, { dst_row, i02, i03 }) + (i00 / qk) * sizeof(blockType);
 | 
			
		||||
        char * dst_block = reinterpret_cast<char *>(reinterpret_cast<char *>(dst_d) + dst_offset);
 | 
			
		||||
        cpyblck(src_block, dst_block);
 | 
			
		||||
    });
 | 
			
		||||
    GGML_UNUSED(ne10);
 | 
			
		||||
    GGML_UNUSED(ne13);
 | 
			
		||||
    GGML_UNUSED(nb00);
 | 
			
		||||
    GGML_UNUSED(nb13);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<typename TIn, typename TOut>
 | 
			
		||||
static void k_set_rows(
 | 
			
		||||
        const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
 | 
			
		||||
@@ -124,6 +187,37 @@ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 | 
			
		||||
                stream
 | 
			
		||||
            );
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_TYPE_BF16:
 | 
			
		||||
            set_rows_sycl<float, sycl::ext::oneapi::bfloat16>(
 | 
			
		||||
                (const char *)src0->data, src1_dd, (char *)dst->data,
 | 
			
		||||
                ne00, ne01, ne02, ne03,
 | 
			
		||||
                ne11, ne12,
 | 
			
		||||
                nb01, nb02, nb03,
 | 
			
		||||
                nb10, nb11, nb12,
 | 
			
		||||
                nb1, nb2, nb3,
 | 
			
		||||
                sizeof(float), sizeof(sycl::ext::oneapi::bfloat16),
 | 
			
		||||
                stream
 | 
			
		||||
            );
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_TYPE_Q8_0:
 | 
			
		||||
            set_rows_sycl_q<block_q8_0, QK8_0, cpy_blck_f32_q8_0>((const char *)src0->data, src1_dd, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_TYPE_Q5_1:
 | 
			
		||||
            set_rows_sycl_q<block_q5_1, QK5_1, cpy_blck_f32_q5_1>((const char *)src0->data, src1_dd, (block_q5_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_TYPE_Q5_0:
 | 
			
		||||
            set_rows_sycl_q<block_q5_0, QK5_0, cpy_blck_f32_q5_0>((const char *)src0->data, src1_dd, (block_q5_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_TYPE_Q4_1:
 | 
			
		||||
            set_rows_sycl_q<block_q4_1, QK4_1, cpy_blck_f32_q4_1>((const char *)src0->data, src1_dd, (block_q4_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_TYPE_Q4_0:
 | 
			
		||||
            set_rows_sycl_q<block_q4_0, QK4_0, cpy_blck_f32_q4_0>((const char *)src0->data, src1_dd, (block_q4_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
 | 
			
		||||
            break;
 | 
			
		||||
        case GGML_TYPE_IQ4_NL:
 | 
			
		||||
            set_rows_sycl_q<block_iq4_nl, QK4_NL, cpy_blck_f32_iq4_nl>((const char *)src0->data, src1_dd, (block_iq4_nl *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
 | 
			
		||||
            break;
 | 
			
		||||
 | 
			
		||||
        default:
 | 
			
		||||
            GGML_ABORT("Unsupported tensor type!");
 | 
			
		||||
            break;
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user