	vulkan : do not use tensor->extra (#9407)
* vulkan : do not use tensor->extra

  This patch allows using the Vulkan backend with the RPC backend as
  tensor->extra is no longer used.

  Ref: #8536

* Adapt GGML_VULKAN_CHECK_RESULTS to extra removal (#2)

---------

Co-authored-by: 0cc4m <picard12@live.de>
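For readers unfamiliar with the scheme this commit switches to: a tensor allocated in a Vulkan backend buffer no longer carries a backend-private tensor->extra record; its device-buffer offset is instead recovered on demand from tensor->data, which the buffer reports relative to a fake, non-null base pointer (vk_ptr_base). Because no backend-private pointers need to travel with the tensor, the buffer can also be driven remotely through the RPC backend. The standalone C++ sketch below illustrates the pointer arithmetic only; mock_tensor, fake_base, and main are hypothetical scaffolding for this example, while the real helpers (vk_tensor_offset, ggml_backend_vk_buffer_context) appear in the diff further down.

// Standalone illustration (not part of the patch): the data pointer encodes
// the offset into the device buffer relative to a fake, non-null base address.
#include <cstdint>
#include <cstdio>

static void * const fake_base = (void *)(uintptr_t) 0x1000;  // same trick as vk_ptr_base

struct mock_tensor {
    void        * data;      // fake_base + offset into the device buffer
    mock_tensor * view_src;  // non-null if this tensor is a view of another tensor
};

// Recover the byte offset into the device buffer, resolving views to their source.
static uint64_t tensor_offset(const mock_tensor * t) {
    const void * p = t->view_src ? t->view_src->data : t->data;
    return (uint64_t)((const uint8_t *) p - (const uint8_t *) fake_base);
}

int main() {
    mock_tensor weights = { (uint8_t *) fake_base + 4096, nullptr };
    mock_tensor view    = { nullptr, &weights };
    std::printf("weights offset: %llu\n", (unsigned long long) tensor_offset(&weights)); // 4096
    std::printf("view offset:    %llu\n", (unsigned long long) tensor_offset(&view));    // 4096
    return 0;
}

The sketch compiles with any C++11 compiler, e.g. g++ -std=c++11 demo.cpp.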
Author:       Radoslav Gerganov
Committed by: GitHub
Parent:       76b37d1541
Commit:       00b7317e63
@@ -433,16 +433,6 @@ struct vk_context_struct {
 typedef std::shared_ptr<vk_context_struct> vk_context;
 typedef std::weak_ptr<vk_context_struct> vk_context_ref;
 
-struct ggml_tensor_extra_gpu {
-    vk_buffer_ref buffer_gpu;
-    uint64_t offset;
-
-    void reset() {
-        buffer_gpu.reset();
-        offset = 0;
-    }
-};
-
 struct ggml_vk_garbage_collector {
     std::vector<vk_semaphore> tl_semaphores;
     std::vector<vk_semaphore> semaphores;
@@ -553,6 +543,31 @@ struct ggml_backend_vk_context {
     std::vector<vk_context_ref> tensor_ctxs;
 };
 
+static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
+
+static uint64_t vk_tensor_offset(const ggml_tensor * tensor) {
+    if (tensor->view_src) {
+        return (uint8_t *) tensor->view_src->data - (uint8_t *) vk_ptr_base;
+    }
+    return (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
+}
+
+struct ggml_backend_vk_buffer_context {
+    vk_device_ref device;
+    vk_buffer dev_buffer;
+    std::string name;
+
+    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
+        device(device),
+        dev_buffer(dev_buffer),
+        name(name) {
+    }
+
+    ~ggml_backend_vk_buffer_context() {
+        ggml_vk_destroy_buffer(dev_buffer);
+    }
+};
+
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
     std::lock_guard<std::mutex> guard(log_mutex);
@@ -3076,9 +3091,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     const uint64_t r2 = ne12 / ne02;
     const uint64_t r3 = ne13 / ne03;
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3180,8 +3195,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
     vk_buffer d_X;
@@ -3189,13 +3204,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3276,9 +3291,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     const uint64_t r2 = ne12 / ne02;
     const uint64_t r3 = ne13 / ne03;
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3357,21 +3372,21 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3454,9 +3469,9 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 
     GGML_ASSERT(ne11 == 1);
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qy;
     size_t qy_buf_offset = 0;
@@ -3482,15 +3497,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3532,9 +3547,9 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 
     GGML_ASSERT(ne11 == 1);
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
 
     vk_buffer d_Qy = nullptr;
     size_t qy_buf_offset = 0;
@@ -3561,15 +3576,15 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
-    vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
-    const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs;
+    vk_buffer d_Qx = src0_buf_ctx->dev_buffer;
+    const uint64_t qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
     GGML_ASSERT(d_Qx != nullptr);
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
 
@@ -3631,10 +3646,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
 
     const uint64_t n_as = ne02;
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3731,26 +3746,26 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if (!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if (!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if (!ids_uma) {
-        d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset + ids->view_offs;
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -3836,10 +3851,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     const uint64_t ne22 = dst->ne[2];
     const uint64_t ne23 = dst->ne[3];
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+    ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context;
 
     vk_buffer d_Qx;
     size_t qx_buf_offset = 0;
@@ -3924,26 +3939,26 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         return;
     }
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
-    const uint64_t d_buf_offset = extra->offset + dst->view_offs;
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
+    const uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     GGML_ASSERT(d_D != nullptr);
     vk_buffer d_X;
     uint64_t x_buf_offset = 0;
     vk_buffer d_Y;
     uint64_t y_buf_offset = 0;
     if(!src0_uma) {
-        d_Qx = extra_src0->buffer_gpu.lock();
-        qx_buf_offset = extra_src0->offset + src0->view_offs;
+        d_Qx = src0_buf_ctx->dev_buffer;
+        qx_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_Qx != nullptr);
     }
     if(!src1_uma) {
-        d_Qy = extra_src1->buffer_gpu.lock();
-        qy_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Qy = src1_buf_ctx->dev_buffer;
+        qy_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Qy != nullptr);
     }
     if(!ids_uma) {
-        d_ids = extra_ids->buffer_gpu.lock();
-        ids_buf_offset = extra_ids->offset + ids->view_offs;
+        d_ids = ids_buf_ctx->dev_buffer;
+        ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs;
         GGML_ASSERT(d_ids != nullptr);
     }
     if (qx_needs_dequant) {
@@ -4250,7 +4265,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))));  // NOLINT
     GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0));  // NOLINT
-    GGML_ASSERT(dst->extra != nullptr);
+    GGML_ASSERT(dst->buffer != nullptr);
     const uint64_t ne00 = src0->ne[0];
     const uint64_t ne01 = src0->ne[1];
     const uint64_t ne02 = src0->ne[2];
@@ -4296,10 +4311,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
     const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op);
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
-    ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
+    ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
+    ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+    ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr;
+    ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr;
 
     vk_buffer d_X = nullptr;
     size_t x_buf_offset = 0;
@@ -4330,7 +4345,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ned;
 
-    vk_buffer d_D = extra->buffer_gpu.lock();
+    vk_buffer d_D = dst_buf_ctx->dev_buffer;
 
     // Workaround for tiny tensor inputs on ROPE
     if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
@@ -4338,21 +4353,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
-    GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY);  // NOLINT
+    uint64_t d_buf_offset = ((vk_tensor_offset(dst) + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
+    GGML_ASSERT(d_buf_offset == vk_tensor_offset(dst) || op == GGML_OP_CPY);  // NOLINT
     if(!src0_uma) {
-        d_X = extra_src0->buffer_gpu.lock();
-        x_buf_offset = extra_src0->offset + src0->view_offs;
+        d_X = src0_buf_ctx->dev_buffer;
+        x_buf_offset = vk_tensor_offset(src0) + src0->view_offs;
         GGML_ASSERT(d_X != nullptr);
     }
     if (use_src1 && !src1_uma) {
-        d_Y = extra_src1->buffer_gpu.lock();
-        y_buf_offset = extra_src1->offset + src1->view_offs;
+        d_Y = src1_buf_ctx->dev_buffer;
+        y_buf_offset = vk_tensor_offset(src1) + src1->view_offs;
         GGML_ASSERT(d_Y != nullptr);
     }
     if (use_src2 && !src2_uma) {
-        d_Z = extra_src2->buffer_gpu.lock();
-        z_buf_offset = extra_src2->offset + src2->view_offs;
+        d_Z = src2_buf_ctx->dev_buffer;
+        z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
         GGML_ASSERT(d_Z != nullptr);
     }
 
@@ -4531,11 +4546,10 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
 }
 
 static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
     int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
@@ -4724,10 +4738,9 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co
 }
 
 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
    const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = ((vk_tensor_offset(dst) + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
 
     ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
@@ -5535,14 +5548,6 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
 }
 #endif
 
-static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) {
-    VK_LOG_DEBUG("ggml_vk_create_extra(" << tensor << " (" << tensor->name << ", " << ggml_op_name(tensor->op) << "))");
-    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    extra->reset();
-    tensor->extra = extra;
-    return extra;
-}
-
 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 #if defined(GGML_VULKAN_RUN_TESTS)
     ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32);
@@ -5711,9 +5716,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
 static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
-    if (ggml_is_empty(node) || extra == nullptr) {
+    if (ggml_is_empty(node) || !node->buffer) {
         return false;
     }
 
@@ -5965,7 +5968,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
-    ggml_tensor_extra_gpu * extra = nullptr;
+    ggml_backend_buffer * buf = nullptr;
 
     switch (tensor->op) {
     case GGML_OP_ADD:
@@ -6001,7 +6004,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_TIMESTEP_EMBEDDING:
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_REPEAT:
-        extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        buf = tensor->buffer;
 
         break;
     case GGML_OP_UNARY:
@@ -6011,7 +6014,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_RELU:
         case GGML_UNARY_OP_TANH:
-            extra = (ggml_tensor_extra_gpu *) tensor->extra;
+            buf = tensor->buffer;
             break;
         default:
             return false;
@@ -6019,14 +6022,14 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
         break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
-        extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        buf = tensor->buffer;
 
         break;
     default:
         return false;
     }
 
-    if (extra == nullptr) {
+    if (buf == nullptr) {
         return false;
     }
 
@@ -6167,42 +6170,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
 
 // device backend
 
-static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000;  // NOLINT
-
-struct ggml_backend_vk_buffer_context {
-    vk_device_ref device;
-    vk_buffer dev_buffer;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-    std::string name;
-
-    ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name) :
-        device(device),
-        dev_buffer(dev_buffer),
-        name(name) {
-    }
-
-    ~ggml_backend_vk_buffer_context() {
-        ggml_vk_destroy_buffer(dev_buffer);
-        if (temp_tensor_extras != nullptr) {
-            delete[] temp_tensor_extras;
-        }
-    }
-
-    ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() {
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_VK_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_VK_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        extra->reset();
-
-        return extra;
-    }
-};
-
 GGML_CALL static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
     return ctx->name.c_str();
@@ -6227,51 +6194,37 @@ GGML_CALL static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t bu
 
 GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
-    ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
-
     if (tensor->view_src != nullptr) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        GGML_ASSERT(tensor->view_src->extra != nullptr);
-        tensor->extra = tensor->view_src->extra;
-    } else {
-        ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra();
-        extra->buffer_gpu = ctx->dev_buffer;
-        extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
-        tensor->extra = extra;
     }
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    vk_buffer buf = extra->buffer_gpu.lock();
-
-    ggml_vk_buffer_write(buf, extra->offset + tensor->view_offs + offset, data, size);
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    GGML_UNUSED(buffer);
+    ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
 
-    vk_buffer buf = extra->buffer_gpu.lock();
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_read(buf, extra->offset + tensor->view_offs + offset, data, size);
-
-    GGML_UNUSED(buffer);
+    ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_vk(src->buffer)) {
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
-        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+        ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
 
-        vk_buffer src_buf = src_extra->buffer_gpu.lock();
-        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
+        vk_buffer src_buf = src_buf_ctx->dev_buffer;
+        vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
 
-        ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
+        ggml_vk_buffer_copy(dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
 
         return true;
     }
@@ -6449,7 +6402,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
     vk_context transfer_ctx;
 
@@ -6462,9 +6415,9 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
         transfer_ctx = ctx->transfer_ctx.lock();
     }
 
-    vk_buffer buf = extra->buffer_gpu.lock();
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_write_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
+    ggml_vk_buffer_write_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@@ -6472,7 +6425,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
     vk_context transfer_ctx;
 
@@ -6485,17 +6438,17 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
         transfer_ctx = ctx->transfer_ctx.lock();
     }
 
-    vk_buffer buf = extra->buffer_gpu.lock();
+    vk_buffer buf = buf_ctx->dev_buffer;
 
-    ggml_vk_buffer_read_async(transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size);
+    ggml_vk_buffer_read_async(transfer_ctx, buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
 GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
     VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
-        ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+        ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context;
+        ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context;
 
         vk_context transfer_ctx;
 
@@ -6508,10 +6461,10 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
             transfer_ctx = ctx->transfer_ctx.lock();
         }
 
-        vk_buffer src_buf = src_extra->buffer_gpu.lock();
-        vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
+        vk_buffer src_buf = src_buf_ctx->dev_buffer;
+        vk_buffer dst_buf = dst_buf_ctx->dev_buffer;
 
-        ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src));
+        ggml_vk_buffer_copy_async(transfer_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src));
         return true;
     }
 
@@ -6949,10 +6902,10 @@ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name)
         const size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
-        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
+        vk_buffer buffer_gpu = buf_ctx->dev_buffer;
+        ggml_vk_buffer_read(buffer_gpu, vk_tensor_offset(tensor) + tensor->view_offs, tensor_data, tensor_size);
     }
 
     std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
@@ -7026,9 +6979,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
             memcpy(src0_clone->data, src0->data, src0_size);
             memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
         } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
-            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset + src0->view_offs;
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src0) + src0->view_offs;
             if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) {
                 for (int i3 = 0; i3 < src0->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src0->ne[2]; i2++) {
@@ -7068,9 +7021,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
             memcpy(src1_clone->data, src1->data, src1_size);
             memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
         } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
-            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset + src1->view_offs;
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src1) + src1->view_offs;
             if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) {
                 for (int i3 = 0; i3 < src1->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src1->ne[2]; i2++) {
@@ -7110,9 +7063,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
             memcpy(src2_clone->data, src2->data, src2_size);
             memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
         } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
-            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
-            vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-            uint64_t offset = extra->offset + src2->view_offs;
+            ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src2->buffer->context;
+            vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+            uint64_t offset = vk_tensor_offset(src2) + src2->view_offs;
             if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
                 for (int i3 = 0; i3 < src2->ne[3]; i3++) {
                     for (int i2 = 0; i2 < src2->ne[2]; i2++) {
@@ -7167,7 +7120,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
     } else if (tensor->op == GGML_OP_PAD) {
         tensor_clone = ggml_pad(ggml_ctx, src0_clone, tensor->ne[0] - src0_clone->ne[0], tensor->ne[1] - src0_clone->ne[1], tensor->ne[2] - src0_clone->ne[2], tensor->ne[3] - src0_clone->ne[3]);
     } else if (tensor->op == GGML_OP_REPEAT) {
-        tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone);
+        tensor_clone = ggml_repeat(ggml_ctx, src0_clone, tensor);
     } else if (tensor->op == GGML_OP_ADD) {
         tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
     } else if (tensor->op == GGML_OP_ACC) {
@@ -7312,14 +7265,15 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
         size_t tensor_size = ggml_nbytes(tensor);
         tensor_data = malloc(tensor_size);
 
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+        ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context;
 
-        vk_buffer buffer_gpu = extra->buffer_gpu.lock();
-        if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) {
-            tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs);
+        vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
+        uint64_t offset = vk_tensor_offset(tensor) + tensor->view_offs;
+        if (offset + tensor_size >= buffer_gpu->size) {
+            tensor_size = buffer_gpu->size - offset;
         }
 
-        ggml_vk_buffer_read(buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size);
+        ggml_vk_buffer_read(buffer_gpu, offset, tensor_data, tensor_size);
     }
 
     float first_error_result = -1.0f;