	llava: Add ACC OP for GPU acceleration to the Vulkan backend in the LLAVA CLIP model. (#8984)
* llava: Add ACC OP for GPU acceleration to the Vulkan backend in the LLAVA CLIP model.
  - The CLIP model now prioritizes the Vulkan backend over the CPU when Vulkan is available.
  - A GGML_OP_ACC shader has been added.
  - The encoding performance of the CLIP model improved from 4.2 s on the CPU to 0.9 s on the GPU.
* Fix up coding style.
* Fix up the missing initial parameter to resolve the compilation warning.
* [fix] Add missing parameters.
* [fix] Use nb1 and nb2 for dst.
* Fix the check-results ggml_acc call.

Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
Co-authored-by: 0cc4m <picard12@live.de>
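For context on the op being accelerated: GGML_OP_ACC adds a (usually smaller) tensor into a strided view of another tensor, which is how the CLIP graph composes intermediate results. A minimal host-side sketch using the public ggml C API; the tensor shapes, the row offset, and the arena size are illustrative, and only ggml_acc's signature is taken from ggml.h:

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16 * 1024 * 1024,  // illustrative arena size
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4); // destination
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2); // rows to add

    // dst = a, with b accumulated into the view described by byte strides
    // nb1/nb2/nb3 and a byte offset (here: starting at row 1 of a).
    struct ggml_tensor * dst = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], 1 * a->nb[1]);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, dst);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    ggml_free(ctx);
    return 0;
}

The clip.cpp hunks below put the model onto the Vulkan backend so that ops like this one run on the GPU.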
examples/llava/clip.cpp
@@ -20,6 +20,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
@@ -1142,6 +1146,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     LOG_TEE("%s: CLIP using CANN backend\n", __func__);
 #endif
 
+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+#endif
+
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
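One behavior worth noting in this chain: the CPU branch is a fallback rather than a peer, so it runs only when no GGML_USE_* block above it produced a backend (either the accelerator was not compiled in, or its init call returned NULL). A condensed sketch of the shape of the code; the CPU log line is an assumption mirroring the accelerator ones:

#ifdef GGML_USE_VULKAN
    new_clip->backend = ggml_backend_vk_init(0); // Vulkan device 0
    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
#endif

    // Reached only when no accelerator block above claimed the backend.
    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
        LOG_TEE("%s: CLIP using CPU backend\n", __func__); // assumed log line
    }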
ggml/src/ggml-vulkan.cpp
@@ -180,6 +180,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
     vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
     vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
+    vk_pipeline pipeline_acc_f32;
     vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
     vk_pipeline pipeline_mul_f32;
     vk_pipeline pipeline_div_f32;
@@ -1687,6 +1688,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
@@ -3971,6 +3974,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_get_rows_f32[src0->type];
         }
         return nullptr;
+    case GGML_OP_ACC:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_acc_f32;
+        }
+        return nullptr;
     case GGML_OP_ADD:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_add_f32;
@@ -4463,6 +4471,28 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
     }, dryrun);
 }
 
+static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
+    const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t src1_type_size = ggml_type_size(src1->type);
+    const uint32_t dst_type_size = ggml_type_size(dst->type);
+    const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+
+    int nb1 = dst->op_params[0] / 4; // byte stride -> f32 elements
+    int nb2 = dst->op_params[1] / 4; // byte stride -> f32 elements
+    // int nb3 = dst->op_params[2] / 4; // unused
+    int offset = dst->op_params[3] / 4; // byte offset -> f32 elements
+
+    ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
+        (uint32_t)ggml_nelements(src0),
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
+        (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2], (uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] /  dst_type_size,
+        d_offset,
+        0.0f, 0.0f, offset,
+    }, dryrun);
+}
+
 static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t src1_type_size = ggml_type_size(src1->type);
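Two conventions in ggml_vk_acc are worth spelling out. First, every stride crossing the host/shader boundary is converted from ggml's byte strides to element counts (the divisions by ggml_type_size and, for the f32-only op_params, by 4); the dst strides come from op_params because GGML_OP_ACC stores the view's nb1/nb2/nb3 and offset there rather than in dst->nb. Second, d_offset carries only the remainder of the buffer offset that cannot be folded into the aligned descriptor binding, while the element offset of the accumulated window travels in the third extra scalar, which the shader reads back as p.param3. A worked example of the unit conversion (values illustrative):

// op_params as stored by ggml_acc for an f32 tensor: {nb1, nb2, nb3, offset}, all in bytes.
int32_t op_params[4] = { 16, 64, 256, 16 };
int nb1    = op_params[0] / 4; // 16 bytes -> 4 f32 elements per row step
int nb2    = op_params[1] / 4; // 64 bytes -> 16 f32 elements per plane step
int offset = op_params[3] / 4; // 16 bytes -> window starts 4 elements in (p.param3)

The remaining ggml-vulkan.cpp hunks below register the op in the graph-build switch, dispatch it, and include it in the supported-op and result-check paths.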
@@ -5621,6 +5651,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_REPEAT:
     case GGML_OP_GET_ROWS:
     case GGML_OP_ADD:
+    case GGML_OP_ACC:
     case GGML_OP_MUL:
     case GGML_OP_DIV:
     case GGML_OP_CONCAT:
@@ -5668,6 +5699,10 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_REPEAT:
         ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
 
+        break;
+    case GGML_OP_ACC:
+        ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
+
         break;
     case GGML_OP_GET_ROWS:
         ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
@@ -5808,6 +5843,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     switch (tensor->op) {
     case GGML_OP_ADD:
+    case GGML_OP_ACC:
     case GGML_OP_GET_ROWS:
     case GGML_OP_MUL:
     case GGML_OP_DIV:
@@ -6539,6 +6575,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_OP_GROUP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_ACC:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_CONCAT:
@@ -6995,6 +7032,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone);
     } else if (tensor->op == GGML_OP_ADD) {
         tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
+    } else if (tensor->op == GGML_OP_ACC) {
+        tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
     } else if (tensor->op == GGML_OP_NORM) {
         tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
     } else if (tensor->op == GGML_OP_GROUP_NORM) {
							
								
								
									
ggml/src/vulkan-shaders/acc.comp (new file, 24 lines)
@@ -0,0 +1,24 @@
+#version 450
+
+#include "types.comp"
+#include "generic_binary_head.comp"
+
+void main() {
+    const uint idx = gl_GlobalInvocationID.x;
+    if (idx >= p.ne) {
+        return;
+    }
+
+    const uint offset = p.param3;
+    const uint src1_i = idx - offset;
+    const uint oz = src1_i / p.nb02;
+    const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
+    const uint ox = src1_i % p.nb01;
+
+    if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
+        data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
+    } else {
+        data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]));
+    }
+}
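To make the shader's index math concrete, here is a scalar reference of the same per-element rule in C (a hypothetical helper, not part of the patch). One detail differs on purpose: the GLSL version relies on unsigned wrap-around, so when idx < offset the subtraction overflows, the bounds check fails, and the copy-only branch runs; the C version tests i >= 0 explicitly.

// Hypothetical scalar reference for acc_f32. a = src0 (read in full),
// b = src1 (the window being accumulated), d = dst. nb01/nb02 are dst
// element strides, offset is the window's element offset, ne1* are src1 extents.
void acc_f32_ref(const float * a, const float * b, float * d,
                 int ne, int nb01, int nb02, int offset,
                 int ne10, int ne11, int ne12) {
    for (int idx = 0; idx < ne; ++idx) {
        const int i = idx - offset;                // position relative to the window
        float v = a[idx];                          // every element starts as a copy of src0
        if (i >= 0) {
            const int oz = i / nb02;               // window coordinate along dim 2
            const int oy = (i - oz * nb02) / nb01; // along dim 1
            const int ox = i % nb01;               // along dim 0
            if (ox < ne10 && oy < ne11 && oz < ne12) {
                v += b[ox + oy * ne10 + oz * ne10 * ne11];
            }
        }
        d[idx] = v;
    }
}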
ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -368,6 +368,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
         string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
     }));
 
+    tasks.push_back(std::async(std::launch::async, [] {
+        string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+    }));
+
     tasks.push_back(std::async(std::launch::async, [] {
         string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
     }));
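For orientation: string_to_spv compiles acc.comp with the listed A_TYPE/B_TYPE/D_TYPE/FLOAT_TYPE macro bindings and embeds the resulting SPIR-V as a length/data pair, which is where the acc_f32_len and acc_f32_data symbols passed to ggml_vk_create_pipeline earlier come from. Roughly (an illustrative declaration, not the generator's literal output):

extern const uint64_t acc_f32_len;         // size of the embedded SPIR-V blob
extern const unsigned char acc_f32_data[]; // the compiled acc.comp bytecode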