mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend in the MobileVLM model. (#9763)
* ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend.
  - The MobileVLM model now supports inference acceleration through the GPU by utilizing the Vulkan backend.
  - A GGML_OP_POOL_2D shader has been added (pooling).
  - The encoding performance of the CLIP model improved from 2.8 s on the CPU to 0.7 s on the GPU.
  Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
* [fix] Correct the incorrect order of the parameters; fix casting to int.
  Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
---------
Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
This commit is contained in:
		@@ -213,6 +213,7 @@ struct vk_device_struct {
 | 
			
		||||
    vk_pipeline pipeline_sum_rows_f32;
 | 
			
		||||
    vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
 | 
			
		||||
    vk_pipeline pipeline_timestep_embedding_f32;
 | 
			
		||||
    vk_pipeline pipeline_pool2d_f32;
 | 
			
		||||
 | 
			
		||||
    std::unordered_map<std::string, vk_pipeline_ref> pipelines;
 | 
			
		||||
    std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
 | 
			
		||||
@@ -403,6 +404,17 @@ struct vk_op_timestep_embedding_push_constants {
 | 
			
		||||
    uint32_t max_period;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// Push-constant block for the GGML_OP_POOL_2D compute shader (pool2d_f32).
// Field order and tight 4-byte packing must stay in sync with the push
// constant declaration in the shader — do not reorder or insert members.
struct vk_op_pool2d_push_constants {
    uint32_t IW;         // input width
    uint32_t IH;         // input height
    uint32_t OW;         // output width
    uint32_t OH;         // output height
    uint32_t OC;         // number of output channels
    uint32_t pelements;  // total output elements (N * OC * OH * OW), one invocation each
    uint32_t op;         // pooling operator (ggml_op_pool value: max / avg)
    int32_t k0;          // kernel size
    int32_t k1;
    int32_t s0;          // stride
    int32_t s1;
    int32_t p0;          // padding
    int32_t p1;
};
 | 
			
		||||
 | 
			
		||||
// Allow pre-recording command buffers
 | 
			
		||||
struct vk_staging_memcpy {
 | 
			
		||||
    vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
 | 
			
		||||
@@ -1803,6 +1815,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 | 
			
		||||
 | 
			
		||||
    ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
 | 
			
		||||
 | 
			
		||||
    ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
 | 
			
		||||
 | 
			
		||||
    for (auto &c : compiles) {
 | 
			
		||||
        c.wait();
 | 
			
		||||
    }
 | 
			
		||||
@@ -4234,6 +4248,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
 | 
			
		||||
            return ctx->device->pipeline_timestep_embedding_f32;
 | 
			
		||||
        }
 | 
			
		||||
        return nullptr;
 | 
			
		||||
    case GGML_OP_POOL_2D:
 | 
			
		||||
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
 | 
			
		||||
            return ctx->device->pipeline_pool2d_f32;
 | 
			
		||||
        }
 | 
			
		||||
        return nullptr;
 | 
			
		||||
    case GGML_OP_LEAKY_RELU:
 | 
			
		||||
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
 | 
			
		||||
            return ctx->device->pipeline_leaky_relu_f32;
 | 
			
		||||
@@ -4464,6 +4483,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 | 
			
		||||
            uint32_t half_ceil = (dim + 1) / 2;
 | 
			
		||||
            elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
 | 
			
		||||
        } break;
 | 
			
		||||
    case GGML_OP_POOL_2D:
 | 
			
		||||
        {
 | 
			
		||||
            const uint32_t N = dst->ne[3];
 | 
			
		||||
            const uint32_t OC = dst->ne[2];
 | 
			
		||||
            const uint32_t OH = dst->ne[1];
 | 
			
		||||
            const uint32_t OW = dst->ne[0];
 | 
			
		||||
            elements = { N * OC * OH * OW, 1, 1};
 | 
			
		||||
        } break;
 | 
			
		||||
    case GGML_OP_ADD:
 | 
			
		||||
    case GGML_OP_DIV:
 | 
			
		||||
    case GGML_OP_MUL:
 | 
			
		||||
@@ -4914,6 +4941,34 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
 | 
			
		||||
    }, dryrun);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Record a GGML_OP_POOL_2D dispatch (max / avg 2D pooling) on the Vulkan
// backend. Unpacks the pooling parameters from dst->op_params, derives the
// input/output extents from the tensor shapes, and forwards everything to
// the pool2d shader via vk_op_pool2d_push_constants. With dryrun set, only
// pipeline/descriptor bookkeeping happens inside ggml_vk_op_f32.
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
    const int32_t * params = dst->op_params;

    const uint32_t op = static_cast<uint32_t>(params[0]);

    // NOTE(review): params[1]/params[2] (and the stride/padding pairs below)
    // are consumed in "1 before 0" order, whereas ggml_pool_2d packs
    // op_params as {op, k0, k1, s0, s1, p0, p1}. Presumably this matches the
    // pool2d shader's expectations — confirm against the shader before
    // changing the order here.
    const int32_t k1 = params[1];
    const int32_t k0 = params[2];
    const int32_t s1 = params[3];
    const int32_t s0 = params[4];
    const int32_t p1 = params[5];
    const int32_t p0 = params[6];

    // Input spatial extent.
    const uint32_t IW = src0->ne[0];
    const uint32_t IH = src0->ne[1];

    // Output extent, channel count, and batch size.
    const uint32_t OW = dst->ne[0];
    const uint32_t OH = dst->ne[1];
    const uint32_t OC = dst->ne[2];
    const uint32_t N  = dst->ne[3];

    // One shader invocation per output element.
    const uint32_t parallel_elements = N * OC * OH * OW;

    ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
        IW, IH, OW, OH, OC,
        parallel_elements,
        op,
        k0, k1, s0, s1, p0, p1,
    }, dryrun);
}
 | 
			
		||||
 | 
			
		||||
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
 | 
			
		||||
    const float * op_params = (const float *)dst->op_params;
 | 
			
		||||
    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
 | 
			
		||||
@@ -5792,6 +5847,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 | 
			
		||||
    case GGML_OP_SUM_ROWS:
 | 
			
		||||
    case GGML_OP_IM2COL:
 | 
			
		||||
    case GGML_OP_TIMESTEP_EMBEDDING:
 | 
			
		||||
    case GGML_OP_POOL_2D:
 | 
			
		||||
    case GGML_OP_LEAKY_RELU:
 | 
			
		||||
        break;
 | 
			
		||||
    default:
 | 
			
		||||
@@ -5927,6 +5983,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 | 
			
		||||
    case GGML_OP_TIMESTEP_EMBEDDING:
 | 
			
		||||
        ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
 | 
			
		||||
 | 
			
		||||
        break;
 | 
			
		||||
    case GGML_OP_POOL_2D:
 | 
			
		||||
        ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
 | 
			
		||||
 | 
			
		||||
        break;
 | 
			
		||||
    case GGML_OP_LEAKY_RELU:
 | 
			
		||||
        ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
 | 
			
		||||
@@ -6018,6 +6078,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 | 
			
		||||
    case GGML_OP_SUM_ROWS:
 | 
			
		||||
    case GGML_OP_IM2COL:
 | 
			
		||||
    case GGML_OP_TIMESTEP_EMBEDDING:
 | 
			
		||||
    case GGML_OP_POOL_2D:
 | 
			
		||||
    case GGML_OP_LEAKY_RELU:
 | 
			
		||||
    case GGML_OP_REPEAT:
 | 
			
		||||
        buf = tensor->buffer;
 | 
			
		||||
@@ -6821,6 +6882,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
 | 
			
		||||
        case GGML_OP_SUM_ROWS:
 | 
			
		||||
        case GGML_OP_IM2COL:
 | 
			
		||||
        case GGML_OP_TIMESTEP_EMBEDDING:
 | 
			
		||||
        case GGML_OP_POOL_2D:
 | 
			
		||||
        case GGML_OP_LEAKY_RELU:
 | 
			
		||||
            return true;
 | 
			
		||||
        default:
 | 
			
		||||
@@ -7334,6 +7396,16 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
 | 
			
		||||
        const int32_t dim = tensor->op_params[0];
 | 
			
		||||
        const int32_t max_period = tensor->op_params[1];
 | 
			
		||||
        tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
 | 
			
		||||
    } else if (tensor->op == GGML_OP_POOL_2D) {
 | 
			
		||||
        enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
 | 
			
		||||
        const int32_t k0 = tensor->op_params[1];
 | 
			
		||||
        const int32_t k1 = tensor->op_params[2];
 | 
			
		||||
        const int32_t s0 = tensor->op_params[3];
 | 
			
		||||
        const int32_t s1 = tensor->op_params[4];
 | 
			
		||||
        const int32_t p0 = tensor->op_params[5];
 | 
			
		||||
        const int32_t p1 = tensor->op_params[6];
 | 
			
		||||
 | 
			
		||||
        tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
 | 
			
		||||
    } else if (tensor->op == GGML_OP_LEAKY_RELU) {
 | 
			
		||||
        const float * op_params = (const float *)tensor->op_params;
 | 
			
		||||
        tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user