	SYCL: Migrate away from deprecated ggml_tensor->backend (#10840)
* Migrate to tensor->buffer for checking backend buffer type: 1
* SYCL: common.cpp try to migrate away from tensor->backend
* SYCL: fix assertions and add proper comments
* SYCL: remove extra space
* SYCL: Add back static to ggml_backend_buffer_is_sycl_split function
* SYCL: Add pragma directive to suppress warning spam
* SYCL: Integrate debug logs with GGML_LOG and other fixes
* Revert "SYCL: Integrate debug logs with GGML_LOG and other fixes"
This reverts commit 2607b7de0f.
Let's keep the current SYCL specific logging mechanism for now
* SYCL: Use GGML_SYCL_DEBUG after reverting
* SYCL: reg_get_proc_address func, update to the current func signature
* SYCL: Refactor SYCL buffer checks in ggml_sycl_cpy_tensor_2d
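Taken together, these changes remove the remaining reads of the deprecated ggml_tensor::backend field from the SYCL backend: a tensor's placement is now derived from the buffer it is allocated in. A minimal sketch of that pattern, written against the public ggml-backend API (the helper names here are illustrative, not part of this commit):

#include <cstring>
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-sycl.h"   // for GGML_SYCL_NAME

// Illustrative only: classify a tensor by its buffer instead of tensor->backend.
// The SYCL split buffer type reports its name as GGML_SYCL_NAME "_Split",
// which is what the asserts added in common.cpp below rely on.
static bool tensor_uses_sycl_split_buffer(const struct ggml_tensor * t) {
    if (t->buffer == NULL) {
        return false;   // not allocated in a backend buffer yet
    }
    const char * name = ggml_backend_buft_name(ggml_backend_buffer_get_type(t->buffer));
    return std::strcmp(name, GGML_SYCL_NAME "_Split") == 0;
}

static bool tensor_data_is_on_host(const struct ggml_tensor * t) {
    // replaces the old `t->backend == GGML_BACKEND_TYPE_CPU` style of check
    return t->buffer != NULL && ggml_backend_buffer_is_host(t->buffer);
}

The same idea appears in the diff below in three forms: a string comparison against the buffer-type name in common.cpp, the backend's own ggml_backend_buffer_is_sycl / ggml_backend_buffer_is_sycl_split helpers in ggml-sycl.cpp, and ggml_backend_buffer_is_host for the host side.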
			
			
ggml-sycl/common.cpp
@@ -11,6 +11,8 @@
 //
 #include "common.hpp"
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
 
 int get_current_device_id() {
@@ -65,9 +67,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                                  const ggml_sycl_op_flatten_t op) try {
 
     const bool use_src1 = src1 != nullptr;
 
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    if(use_src1)
+      GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
 
     // dd = data device
     float * src0_ddf = (float *) src0->data;
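The expression src1->buffer->buft->iface.get_name(src1->buffer->buft) is simply the buffer type's name; it reaches through the internal iface (hence the ggml-backend-impl.h include added above) because, per the commit messages, the ggml_backend_buffer_is_sycl_split() helper stays static inside ggml-sycl.cpp and is not visible from common.cpp. For reference, the public accessor is essentially the same call (a sketch, not code from this commit):

#include "ggml-backend-impl.h"

// Roughly what ggml_backend_buft_name() does in ggml-backend; the asserts in
// ggml_sycl_op_flatten inline this call and compare the result with
// GGML_SYCL_NAME "_Split".
static const char * buft_name(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name(buft);
}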
ggml-sycl/common.hpp
@@ -26,7 +26,11 @@
 
 #define GGML_COMMON_DECL_SYCL
 #define GGML_COMMON_IMPL_SYCL
+/* suppress warning spam */
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
+#pragma clang diagnostic pop
 
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
ggml-sycl/ggml-sycl.cpp
@@ -288,10 +288,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                      ggml_tensor *tensor) try {
     ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
 
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+    if (tensor->view_src != NULL) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
         return;
     }
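For context on the init_tensor change: by the time a buffer's init_tensor callback runs, ggml-backend has already attached a buffer to the tensor, and a view shares the buffer of the tensor it views, so only the buffer-type assert is still needed. A small illustration of that flow using the CPU backend (the SYCL path is analogous; this is illustrative, not code from this commit):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensor data will live in a backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * v = ggml_view_1d(ctx, a, 8, 0);   // v->view_src == a

    // Allocating the context's tensors assigns both a and v to the same buffer,
    // and calls the buffer's init_tensor callback where the backend defines one
    // (the SYCL buffer does).
    ggml_backend_buffer_t buf =
        ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());

    GGML_ASSERT(v->buffer == a->buffer);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}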
@@ -539,7 +537,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     auto dev_count = ggml_backend_sycl_get_device_count();
 
     if (device>=dev_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
             device, dev_count-1);
         GGML_ASSERT(device<dev_count);
     }
@@ -567,7 +565,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
 
     int device = ctx->device;
     if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
            device, ggml_sycl_info().device_count-1);
        GGML_ASSERT(device<ggml_sycl_info().device_count);
    }
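Unlike printf, which writes straight to stdout, GGML_LOG_ERROR routes through ggml's logging facility, so an application that installs a callback with ggml_log_set() can capture, redirect, or silence messages such as the out-of-range device error above. A minimal sketch of such a hook (the callback body is illustrative):

#include <cstdio>
#include "ggml.h"

// Forward ggml error messages to stderr; everything else is dropped.
static void sycl_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_ERROR) {
        std::fputs(text, stderr);
    }
}

int main() {
    ggml_log_set(sycl_log_cb, /*user_data=*/NULL);
    // ... initialize and use the SYCL backend as usual ...
    return 0;
}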
@@ -746,7 +744,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
 
-        // FIXME: do not crash if cudaMalloc fails
+        // FIXME: do not crash if SYCL Buffer alloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_sycl_set_device(i);
         const queue_ptr stream = ctx->streams[i];
@@ -788,7 +786,6 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                 CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
         }
     }
-    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
     tensor->extra = extra;
 }
 catch (sycl::exception const &exc) {
@@ -2349,12 +2346,22 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
 
     dpct::memcpy_direction kind;
     char * src_ptr;
-    if (src->backend == GGML_BACKEND_TYPE_CPU) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
         kind = dpct::host_to_device;
+        //GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__);
         src_ptr = (char *) src->data;
-        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
-    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+    } else if (ggml_backend_buffer_is_sycl(src->buffer)) {
+        // If buffer is a SYCL buffer
+        //GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__);
+        kind    = dpct::device_to_device;
+        src_ptr = (char *) src->data;
+    } else if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
+        /*
+        If buffer is a SYCL split buffer
+        */
+        //GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__);
+        GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]);
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
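Boiled down, the new branch structure chooses the copy direction from the source buffer alone: host buffers copy host-to-device, while both plain SYCL buffers and SYCL split buffers copy device-to-device, with only the split case additionally resolving a per-device pointer through tensor->extra. A condensed restatement (illustrative; it omits the split-buffer pointer lookup and assumes the backend's common.hpp for the dpct types):

#include "common.hpp"        // ggml-sycl helper header: dpct types and ggml includes
#include "ggml-backend.h"

// Illustrative condensation of the dispatch above.
static dpct::memcpy_direction copy_kind_for_src(const ggml_tensor * src) {
    if (ggml_backend_buffer_is_host(src->buffer)) {
        return dpct::host_to_device;      // plain host memory -> device
    }
    // regular SYCL buffers and SYCL split buffers are both device memory
    return dpct::device_to_device;
}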
@@ -2857,8 +2864,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -2878,7 +2885,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
     int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
@@ -3198,7 +3205,7 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst) try {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3231,7 +3238,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
@@ -3293,7 +3300,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
                                              ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
     GGML_TENSOR_BINARY_OP_LOCALS
@@ -4638,10 +4645,9 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
 static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
     GGML_UNUSED(reg);
 
-    // TODO: update to the current function signature
-    //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
-    //    return (void *)ggml_backend_sycl_split_buffer_type;
-    //}
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_sycl_split_buffer_type;
+    }
 
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
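With the handler enabled, frontends no longer need to link against the SYCL-specific symbol to build split buffers for multi-GPU weight distribution; they can look the function up through the backend registry. Roughly how a caller would use it (the ggml_backend_split_buffer_type_t typedef and its (main_device, tensor_split) signature are taken from ggml-backend.h of this period; treat the exact shape as an assumption):

#include "ggml-backend.h"

// Sketch: resolve the SYCL split buffer type through the backend registry.
static ggml_backend_buffer_type_t sycl_split_buft(int main_device, const float * tensor_split) {
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("SYCL");
    if (reg == NULL) {
        return NULL;   // SYCL backend not compiled in or not registered
    }
    auto fn = (ggml_backend_split_buffer_type_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
    return fn ? fn(main_device, tensor_split) : NULL;
}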