mirror of https://github.com/ggml-org/llama.cpp.git
	opencl: add f16 for add, sub, mul, div (#14984)
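This commit adds F16 paths to the OpenCL backend's element-wise binary ops: each of add, sub, mul and div gains kernel_*_f16 and kernel_*_row_f16 variants, the new kernels are created from the same programs as the existing F32 ones, ggml_opencl_supports_op now accepts F16 when src0, src1 and dst share one type, and the host code picks the F32 or F16 kernel from src0->type. The row variants operate on half4 vectors, which is why the host passes ne = ne00 / 4 on that fast path. A minimal sketch of the dispatch, using a hypothetical helper (pick_add_kernel is not part of the patch; the member and type names are the ones that appear in the diff below):

// Hypothetical helper, for illustration only: the patch writes the same
// selection inline in ggml_cl_add (and likewise for sub/mul/div).
static cl_kernel pick_add_kernel(ggml_backend_opencl_context * backend_ctx,
                                 const ggml_tensor * src0, bool bcast_row) {
    if (bcast_row) {
        return src0->type == GGML_TYPE_F32 ? backend_ctx->kernel_add_row
                                           : backend_ctx->kernel_add_row_f16;
    }
    return src0->type == GGML_TYPE_F32 ? backend_ctx->kernel_add
                                       : backend_ctx->kernel_add_f16;
}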
				
					
				
ggml-opencl.cpp

@@ -400,10 +400,10 @@ struct ggml_backend_opencl_context {
     cl_program program_mul_mm_f32_f32_l4_lm;
     cl_program program_mul_mm_f16_f32_l4_lm;
 
-    cl_kernel kernel_add, kernel_add_row;
-    cl_kernel kernel_mul, kernel_mul_row;
-    cl_kernel kernel_div, kernel_div_row;
-    cl_kernel kernel_sub, kernel_sub_row;
+    cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
+    cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
+    cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
+    cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
     cl_kernel kernel_scale;
     cl_kernel kernel_silu, kernel_silu_4;
     cl_kernel kernel_gelu, kernel_gelu_4;
@@ -674,8 +674,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         backend_ctx->program_add =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_add     = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
-        CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_add         = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
+        CL_CHECK((backend_ctx->kernel_add_row     = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_add_f16     = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -1089,8 +1091,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         backend_ctx->program_mul =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_mul     = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
-        CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul         = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_row     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_f16     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -1288,11 +1292,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("div.cl");
 #endif
+        std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-finite-math-only ";
+
         backend_ctx->program_div =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_div     = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
-        CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_div         = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row     = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_div_f16     = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -1308,8 +1317,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         backend_ctx->program_sub =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_sub     = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
-        CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub         = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub_row     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub_f16     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
         GGML_LOG_CONT(".");
     }
 
@@ -2447,12 +2458,15 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 default:
                     return false;
             }
-        case GGML_OP_ADD:
         case GGML_OP_SCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
+        case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_SUB:
-            return op->src[0]->type == GGML_TYPE_F32;
+            return (op->src[0]->type == op->src[1]->type) &&
+                   (op->src[0]->type == op->type) &&
+                   (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
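After this hunk, ADD, MUL, DIV and SUB are reported as supported only when src0, src1 and dst all have the same type and that type is F32 or F16; mixed F16/F32 combinations are not claimed by the OpenCL backend and are left to other backends (typically the CPU). A hedged restatement of the new predicate as a standalone helper (illustrative only, not code from the patch):

// Same condition as the return statement added above, pulled out for readability.
static bool opencl_binary_op_supported(const struct ggml_tensor * op) {
    const enum ggml_type t = op->src[0]->type;
    return t == op->src[1]->type &&
           t == op->type &&
           (t == GGML_TYPE_F32 || t == GGML_TYPE_F16);
}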
@@ -3680,35 +3694,39 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
     GGML_ASSERT(dst);
     GGML_ASSERT(dst->extra);
 
-    const int  ne00 = src0 ? src0->ne[0] : 0;
-    const int  ne01 = src0 ? src0->ne[1] : 0;
-    const int  ne02 = src0 ? src0->ne[2] : 0;
-    const int  ne03 = src0 ? src0->ne[3] : 0;
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 
-    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    const int  ne00 = src0->ne[0];
+    const int  ne01 = src0->ne[1];
+    const int  ne02 = src0->ne[2];
+    const int  ne03 = src0->ne[3];
 
-    const int  ne10 = src1 ? src1->ne[0] : 0;
-    const int  ne11 = src1 ? src1->ne[1] : 0;
-    const int  ne12 = src1 ? src1->ne[2] : 0;
-    const int  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
 
-    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
-    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
-    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
-    const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+    const int  ne10 = src1->ne[0];
+    const int  ne11 = src1->ne[1];
+    const int  ne12 = src1->ne[2];
+    const int  ne13 = src1->ne[3]; UNUSED(ne13);
 
-    const int  ne0  = dst ? dst->ne[0] : 0;
-    const int  ne1  = dst ? dst->ne[1] : 0;
-    const int  ne2  = dst ? dst->ne[2] : 0;
-    const int  ne3  = dst ? dst->ne[3] : 0;
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
 
-    const cl_ulong nb0  = dst ? dst->nb[0] : 0;
-    const cl_ulong nb1  = dst ? dst->nb[1] : 0;
-    const cl_ulong nb2  = dst ? dst->nb[2] : 0;
-    const cl_ulong nb3  = dst ? dst->nb[3] : 0;
+    const int  ne0  = dst->ne[0];
+    const int  ne1  = dst->ne[1];
+    const int  ne2  = dst->ne[2];
+    const int  ne3  = dst->ne[3];
+
+    const cl_ulong nb0  = dst->nb[0];
+    const cl_ulong nb1  = dst->nb[1];
+    const cl_ulong nb2  = dst->nb[2];
+    const cl_ulong nb3  = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
@@ -3731,7 +3749,12 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
 
         bcast_row = true;
         int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_add_row;
+
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_add_row;
+        } else {
+            kernel = backend_ctx->kernel_add_row_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3741,7 +3764,11 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
     } else {
-        kernel = backend_ctx->kernel_add;
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_add;
+        } else {
+            kernel = backend_ctx->kernel_add_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@@ -3803,35 +3830,39 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     GGML_ASSERT(dst);
     GGML_ASSERT(dst->extra);
 
-    const int ne00 = src0 ? src0->ne[0] : 0;
-    const int ne01 = src0 ? src0->ne[1] : 0;
-    const int ne02 = src0 ? src0->ne[2] : 0;
-    const int ne03 = src0 ? src0->ne[3] : 0;
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 
-    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
-    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
-    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
-    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
 
-    const int ne10 = src1 ? src1->ne[0] : 0;
-    const int ne11 = src1 ? src1->ne[1] : 0;
-    const int ne12 = src1 ? src1->ne[2] : 0;
-    const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
 
-    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
-    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
-    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
-    const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3]; UNUSED(ne13);
 
-    const int ne0  = dst ? dst->ne[0] : 0;
-    const int ne1  = dst ? dst->ne[1] : 0;
-    const int ne2  = dst ? dst->ne[2] : 0;
-    const int ne3  = dst ? dst->ne[3] : 0;
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
 
-    const cl_ulong nb0  = dst ? dst->nb[0] : 0;
-    const cl_ulong nb1  = dst ? dst->nb[1] : 0;
-    const cl_ulong nb2  = dst ? dst->nb[2] : 0;
-    const cl_ulong nb3  = dst ? dst->nb[3] : 0;
+    const int ne0  = dst->ne[0];
+    const int ne1  = dst->ne[1];
+    const int ne2  = dst->ne[2];
+    const int ne3  = dst->ne[3];
+
+    const cl_ulong nb0  = dst->nb[0];
+    const cl_ulong nb1  = dst->nb[1];
+    const cl_ulong nb2  = dst->nb[2];
+    const cl_ulong nb3  = dst->nb[3];
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
@@ -3854,7 +3885,12 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
 
         bcast_row = true;
         int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_mul_row;
+
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_mul_row;
+        } else {
+            kernel = backend_ctx->kernel_mul_row_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3864,7 +3900,11 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
     } else {
-        kernel = backend_ctx->kernel_mul;
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_mul;
+        } else {
+            kernel = backend_ctx->kernel_mul_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@@ -3926,6 +3966,10 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
     GGML_ASSERT(dst);
     GGML_ASSERT(dst->extra);
 
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+
     const int ne00 = src0->ne[0];
     const int ne01 = src0->ne[1];
     const int ne02 = src0->ne[2];
@@ -3974,7 +4018,12 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
 
         bcast_row = true;
         int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_div_row;
+
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_div_row;
+        } else {
+            kernel = backend_ctx->kernel_div_row_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -3984,7 +4033,11 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
     } else {
-        kernel = backend_ctx->kernel_div;
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_div;
+        } else {
+            kernel = backend_ctx->kernel_div_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
@@ -4034,6 +4087,10 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
     GGML_ASSERT(dst);
     GGML_ASSERT(dst->extra);
 
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+
     const int ne00 = src0->ne[0];
     const int ne01 = src0->ne[1];
     const int ne02 = src0->ne[2];
@@ -4082,7 +4139,12 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
 
         bcast_row = true;
         int ne = ne00 / 4;
-        kernel = backend_ctx->kernel_sub_row;
+
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_sub_row;
+        } else {
+            kernel = backend_ctx->kernel_sub_row_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -4092,7 +4154,11 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
     } else {
-        kernel = backend_ctx->kernel_sub;
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_sub;
+        } else {
+            kernel = backend_ctx->kernel_sub_f16;
+        }
 
         CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
         CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));

add.cl

@@ -81,3 +81,76 @@ kernel void kernel_add_row(
     uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
     dst[gid] = src0[gid] + src1[idx1];
 }
+
+kernel void kernel_add_f16(
+        global char * src0,
+        ulong  offset0,
+        global char * src1,
+        ulong  offset1,
+        global char * dst,
+        ulong  offsetd,
+        int   ne00,
+        int   ne01,
+        int   ne02,
+        int   ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int   ne10,
+        int   ne11,
+        int   ne12,
+        int   ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int   ne0,
+        int   ne1,
+        int   ne2,
+        int   ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) + *((global half *)(src1_ptr + i10*nb10));
+    }
+}
+
+kernel void kernel_add_row_f16(
+        global half4 * src0,
+        ulong  offset0,
+        global half4 * src1,
+        ulong  offset1,
+        global half4 * dst,
+        ulong  offsetd,
+        int ne
+) {
+    src0 = (global half4*)((global char*)src0 + offset0);
+    src1 = (global half4*)((global char*)src1 + offset1);
+    dst = (global half4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] + src1[idx1];
+}
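Both *_row_f16 kernels reuse the broadcast-index trick of their F32 counterparts: idx1 = gid - (gid/ne)*ne is plain integer arithmetic for gid % ne, which the in-kernel comment notes performs better than the % operator here. A small standalone check of that identity (plain C++, not part of the patch):

#include <cassert>

int main() {
    // gid - (gid/ne)*ne equals gid % ne for unsigned values,
    // e.g. gid = 10, ne = 4: 10 - (10/4)*4 = 10 - 8 = 2 = 10 % 4.
    for (unsigned ne = 1; ne <= 8; ++ne) {
        for (unsigned gid = 0; gid < 64; ++gid) {
            assert(gid - (gid/ne)*ne == gid % ne);
        }
    }
    return 0;
}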

div.cl

@@ -70,3 +70,69 @@ kernel void kernel_div_row(
     uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
     dst[gid] = src0[gid] / src1[idx1];
 }
+
+kernel void kernel_div_f16(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) / *((global half *)(src1_ptr + i10*nb10));
+    }
+}
+
+kernel void kernel_div_row_f16(
+        global half4 * src0,
+        ulong offset0,
+        global half4 * src1,
+        ulong offset1,
+        global half4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global half4*)((global char*)src0 + offset0);
+    src1 = (global half4*)((global char*)src1 + offset1);
+    dst = (global half4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] / src1[idx1];
+}

mul.cl

@@ -77,3 +77,76 @@ kernel void kernel_mul_row(
     uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
     dst[gid] = src0[gid] * src1[idx1];
 }
+
+kernel void kernel_mul_f16(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        int ne1,
+        int ne2,
+        int ne3,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) * *((global half *)(src1_ptr + i10*nb10));
+    }
+}
+
+kernel void kernel_mul_row_f16(
+        global half4 * src0,
+        ulong offset0,
+        global half4 * src1,
+        ulong offset1,
+        global half4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global half4*)((global char*)src0 + offset0);
+    src1 = (global half4*)((global char*)src1 + offset1);
+    dst = (global half4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] * src1[idx1];
+}

sub.cl

@@ -70,3 +70,69 @@ kernel void kernel_sub_row(
     uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
     dst[gid] = src0[gid] - src1[idx1];
 }
+
+kernel void kernel_sub_f16(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global half *)(dst_ptr + i0*nb0)) = *((global half *)(src0_ptr + i0*nb00)) - *((global half *)(src1_ptr + i10*nb10));
+    }
+}
+
+kernel void kernel_sub_row_f16(
+        global half4 * src0,
+        ulong offset0,
+        global half4 * src1,
+        ulong offset1,
+        global half4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global half4*)((global char*)src0 + offset0);
+    src1 = (global half4*)((global char*)src1 + offset1);
+    dst = (global half4*)((global char*)dst + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] - src1[idx1];
+}