Mirror of https://github.com/ggml-org/llama.cpp.git

	SYCL: Migrate away from deprecated ggml_tensor->backend (#10840)
* Migrate to tensor->buffer for checking backend buffer type: 1
* SYCL: common.cpp try to migrate away from tensor->backend
* SYCL: fix assertions and add proper comments
* SYCL: remove extra space
* SYCL: Add back static to ggml_backend_buffer_is_sycl_split function
* SYCL: Add pragma directive to suppress warning spam
* SYCL: Integrate debug logs with GGML_LOG and other fixes
* Revert "SYCL: Integrate debug logs with GGML_LOG and other fixes"
This reverts commit 2607b7de0f.
Let's keep the current SYCL specific logging mechanism for now
* SYCL: Use GGML_SYCL_DEBUG after reverting
* SYCL: reg_get_proc_address func, update to the current func signature
* SYCL: Refactor SYCL buffer checks in ggml_sycl_cpy_tensor_2d
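
All of the changes below follow one pattern: instead of reading the deprecated ggml_tensor->backend enum, the code asks the buffer that the tensor is allocated in what kind of buffer it is. A minimal sketch of that idea, assuming the public ggml-backend accessors (ggml_backend_buffer_get_type, ggml_backend_buft_name, ggml_backend_buffer_is_host) and the GGML_SYCL_NAME "_Split" buffer-type name that the hunks below compare against:

#include <cstring>
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-sycl.h"   // GGML_SYCL_NAME ("SYCL")

// Old style (deprecated): switch on tensor->backend, e.g.
//     if (t->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { ... }
// New style: query the buffer the tensor lives in.
static bool tensor_is_in_sycl_split_buffer(const ggml_tensor * t) {
    if (t->buffer == nullptr) {
        return false;                       // tensor not allocated yet
    }
    ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(t->buffer);
    // split buffers of this backend are named GGML_SYCL_NAME "_Split"
    return strcmp(ggml_backend_buft_name(buft), GGML_SYCL_NAME "_Split") == 0;
}

static bool tensor_is_in_host_buffer(const ggml_tensor * t) {
    return t->buffer != nullptr && ggml_backend_buffer_is_host(t->buffer);
}

Tensors that have not been allocated yet have buffer == NULL, which is why the helpers above check for that before dereferencing.
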
			
			
@@ -11,6 +11,8 @@
 //

 #include "common.hpp"
+
+#include "ggml-backend-impl.h"
 #include "ggml-impl.h"

 int get_current_device_id() {
@@ -65,9 +67,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                                  const ggml_sycl_op_flatten_t op) try {

     const bool use_src1 = src1 != nullptr;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    if(use_src1)
+      GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
+    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);

     // dd = data device
     float * src0_ddf = (float *) src0->data;
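
The hunk above open-codes the check through buft->iface.get_name, which is why ggml-backend-impl.h is now included; later hunks call ggml_backend_buffer_is_sycl_split instead, and the commit notes that function was kept static. A hedged sketch of what such a file-local wrapper amounts to, assuming it reduces to the same name comparison (the helper name here is illustrative, not the symbol in the tree):

#include <cstring>
#include "ggml-backend-impl.h"   // for buffer->buft and buft->iface
#include "ggml-sycl.h"           // for GGML_SYCL_NAME

// Hypothetical stand-in for the static ggml_backend_buffer_is_sycl_split()
// used by the later hunks; the real implementation lives in ggml-sycl.cpp.
static bool buffer_is_sycl_split_by_name(ggml_backend_buffer_t buffer) {
    ggml_backend_buffer_type_t buft = buffer->buft;
    return strcmp(buft->iface.get_name(buft), GGML_SYCL_NAME "_Split") == 0;
}
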
@@ -26,7 +26,11 @@

 #define GGML_COMMON_DECL_SYCL
 #define GGML_COMMON_IMPL_SYCL
+/* suppress warning spam */
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
+#pragma clang diagnostic pop

 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
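
Note that #pragma clang diagnostic is clang-specific (icpx/DPC++ is clang-based, so it applies to the SYCL build). Purely as a hedged variant, not what this commit does: a header that may also be compiled by non-clang compilers would usually guard the pragmas so plain GCC does not emit -Wunknown-pragmas noise:

#ifdef __clang__
#    pragma clang diagnostic push
#    pragma clang diagnostic ignored "-Wnested-anon-types"
#endif
#include "ggml-common.h"
#ifdef __clang__
#    pragma clang diagnostic pop
#endif
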
@@ -288,10 +288,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                      ggml_tensor *tensor) try {
     ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;

-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+    if (tensor->view_src != NULL) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
         return;
     }

@@ -539,7 +537,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     auto dev_count = ggml_backend_sycl_get_device_count();

     if (device>=dev_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
             device, dev_count-1);
         GGML_ASSERT(device<dev_count);
     }
@@ -567,7 +565,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte

     int device = ctx->device;
     if (device>=ggml_sycl_info().device_count or device<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+        GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
             device, ggml_sycl_info().device_count-1);
         GGML_ASSERT(device<ggml_sycl_info().device_count);
     }
@@ -746,7 +744,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
             size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }

-        // FIXME: do not crash if cudaMalloc fails
+        // FIXME: do not crash if SYCL Buffer alloc fails
         // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
         ggml_sycl_set_device(i);
         const queue_ptr stream = ctx->streams[i];
@@ -788,7 +786,6 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                 CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event()));
         }
     }
-    tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT;
     tensor->extra = extra;
 }
 catch (sycl::exception const &exc) {
@@ -2349,12 +2346,22 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,

     dpct::memcpy_direction kind;
     char * src_ptr;
-    if (src->backend == GGML_BACKEND_TYPE_CPU) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
         kind = dpct::host_to_device;
+        //GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__);
         src_ptr = (char *) src->data;
         // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr);
-    } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+    } else if (ggml_backend_buffer_is_sycl(src->buffer)) {
+        // If buffer is a SYCL buffer
+        //GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__);
+        kind    = dpct::device_to_device;
+        src_ptr = (char *) src->data;
+    } else if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
+        /*
+        If buffer is a SYCL split buffer
+        */
+        //GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__);
+        GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]);
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
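
The rewritten branch above is a three-way classification of the source tensor's buffer: host memory, a plain SYCL device buffer, or a SYCL split buffer whose rows are spread across devices (with per-device pointers in tensor->extra). A hedged sketch that factors the same decision into a standalone helper, using only the predicates that appear in the hunk; it assumes ggml_backend_buffer_is_sycl and ggml_backend_buffer_is_sycl_split are visible, as they are inside ggml-sycl.cpp where this code lives:

#include "ggml.h"
#include "ggml-backend.h"

enum class src_buffer_kind { host, sycl_device, sycl_split, unknown };

// Classify where a source tensor's data lives so copy code can pick
// host_to_device vs device_to_device and handle split-buffer extras.
static src_buffer_kind classify_src_buffer(const ggml_tensor * src) {
    if (ggml_backend_buffer_is_host(src->buffer)) {
        return src_buffer_kind::host;        // system memory, host_to_device copy
    }
    if (ggml_backend_buffer_is_sycl(src->buffer)) {
        return src_buffer_kind::sycl_device; // single-device allocation, device_to_device
    }
    if (ggml_backend_buffer_is_sycl_split(src->buffer)) {
        return src_buffer_kind::sycl_split;  // rows spread over devices, pointers in src->extra
    }
    return src_buffer_kind::unknown;
}
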
@@ -2857,8 +2864,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

-    GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
     GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
@@ -2878,7 +2885,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten

     int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);

-    const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
     GGML_ASSERT(!(split && ne02 > 1));
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
@@ -3198,7 +3205,7 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst) try {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3231,7 +3238,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -3293,7 +3300,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
                                              ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);

     GGML_TENSOR_BINARY_OP_LOCALS
@@ -4638,10 +4645,9 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re
 static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
     GGML_UNUSED(reg);

-    // TODO: update to the current function signature
-    //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
-    //    return (void *)ggml_backend_sycl_split_buffer_type;
-    //}
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_sycl_split_buffer_type;
+    }

     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
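
With the TODO resolved, the split buffer type is reachable through the backend registry again. A hedged usage sketch, assuming the backend registers under the name "SYCL" (GGML_SYCL_NAME) and that ggml_backend_split_buffer_type_t is the matching typedef in ggml-backend.h; check that header for the exact parameter list before calling through the pointer:

#include <cstdio>
#include "ggml-backend.h"

int main() {
    // If backends are built as dynamic modules, they may need to be loaded
    // first, e.g. with ggml_backend_load_all().
    ggml_backend_reg_t reg = ggml_backend_reg_by_name("SYCL");
    if (reg == nullptr) {
        fprintf(stderr, "SYCL backend not registered in this build\n");
        return 1;
    }
    void * proc = ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
    if (proc == nullptr) {
        fprintf(stderr, "backend does not expose a split buffer type\n");
        return 1;
    }
    // Cast target: the ggml_backend_split_buffer_type_t typedef from
    // ggml-backend.h; call it with the arguments that header declares.
    auto split_buft_fn = (ggml_backend_split_buffer_type_t) proc;
    (void) split_buft_fn;
    return 0;
}
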
Akarshan Biswas