	Add Conv2d for CPU (#14388)
* Conv2D: Add CPU version
* Half decent
* Tiled approach for F32
* remove file
* Fix tests
* Support F16 operations
* add assert about size
* Review: further formatting fixes, add assert and use CPU version of fp32->fp16
ggml/include/ggml.h

@@ -482,6 +482,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
@@ -1813,6 +1814,17 @@ extern "C" {
             struct ggml_tensor  * b,
             int                   stride);
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor  * b,   // input data [W, H, C, N]
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
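Editor's note: a minimal sketch of driving the new op from user code on CPU. The shapes and thread count here are illustrative, not from the commit; error handling is omitted.

```cpp
// Build and run a small graph with the new direct conv op (illustrative sizes).
#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 256 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // kernel [KW, KH, IC, OC] and input [W, H, C, N]; IC must match C
    struct ggml_tensor * knl = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,   3,   3, 64, 128);
    struct ggml_tensor * inp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 224, 224, 64,   1);

    // stride 1, padding 1, dilation 1 -> "same" output [224, 224, 128, 1]
    struct ggml_tensor * out = ggml_conv_2d_direct(ctx, knl, inp, 1, 1, 1, 1, 1, 1);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);

    ggml_free(ctx);
    return 0;
}
```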
ggml/src/ggml-cpu/ggml-cpu.c

@@ -1193,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     }
 }
 
-static void ggml_compute_forward_mul_mat(
+void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
 
@@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2228,6 +2232,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2746,6 +2751,10 @@ struct ggml_cplan ggml_graph_plan(
                             GGML_ABORT("fatal error");
                         }
                     } break;
+                case GGML_OP_CONV_2D:
+                    {
+                        cur = GGML_IM2COL_WORK_SIZE;
+                    } break;
                 case GGML_OP_CONV_TRANSPOSE_2D:
                     {
                         const int64_t ne00 = node->src[0]->ne[0]; // W
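Editor's note: ggml_graph_plan reserves a fixed 16 MiB scratch buffer for CONV_2D (GGML_IM2COL_WORK_SIZE, defined in ops.h below), and the kernel tiles the im2col patches so that each tile plus its GEMM output fits in that buffer. A back-of-the-envelope sketch of the sizing logic, reusing the illustrative shapes from the example above but with an F16 kernel:

```cpp
// Sketch of the tile sizing in ggml_compute_forward_conv_2d_impl
// (illustrative numbers; the real code uses the tensors' actual shapes).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t wsize     = 16 * 1024 * 1024;  // GGML_IM2COL_WORK_SIZE
    const int64_t knl_n     = 3 * 3 * 64;        // KW * KH * IC = 576
    const int64_t c_out     = 128;               // output channels
    const int64_t type_size = 2;                 // F16 kernel -> F16 patch rows

    // each patch needs one im2col row plus one F32 GEMM output row
    const int64_t space_per_patch = knl_n * type_size + c_out * sizeof(float); // 1664 B
    int64_t patches = wsize / space_per_patch;               // 10082
    patches = patches > 8 ? (patches / 8) * 8 : patches;     // round down to 8 -> 10080
    printf("%lld patches per tile\n", (long long) patches);
    return 0;
}
```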
ggml/src/ggml-cpu/ops.cpp

@@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "binary-ops.h"
+#include "ggml.h"
 #include "unary-ops.h"
 #include "vec.h"
 
@@ -6545,6 +6546,186 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+                              void * a, void * b, float * c) {
+    const ggml_type_traits * traits = ggml_get_type_traits(type);
+    struct ggml_tensor src1 = {};
+    src1.type  = type;
+    src1.ne[0] = k;
+    src1.ne[1] = m;
+    src1.ne[2] = 1;
+    src1.ne[3] = 1;
+    src1.nb[0] = traits->type_size;
+    src1.nb[1] = k * traits->type_size;
+    src1.nb[2] = src1.nb[1];
+    src1.nb[3] = src1.nb[2];
+    src1.data  = a;
+
+    struct ggml_tensor src0 = {};
+    src0.type  = type;
+    src0.ne[0] = k;
+    src0.ne[1] = n;
+    src0.ne[2] = 1;
+    src0.ne[3] = 1;
+    src0.nb[0] = traits->type_size;
+    src0.nb[1] = k * traits->type_size;
+    src0.nb[2] = src0.nb[1];
+    src0.nb[3] = src0.nb[2];
+    src0.data  = b;
+
+    struct ggml_tensor dst = {};
+    dst.ne[0] = n;
+    dst.ne[1] = m;
+    dst.ne[2] = 1;
+    dst.ne[3] = 1;
+    dst.nb[0] = sizeof(float);
+    dst.nb[1] = n * sizeof(float);
+    dst.nb[2] = dst.nb[1];
+    dst.nb[3] = dst.nb[2];
+    dst.data  = c;
+    dst.src[0] = &src0;
+    dst.src[1] = &src1;
+
+    ggml_compute_forward_mul_mat(params, &dst);
+}
+
+// ggml_compute_forward_conv_2d
+
+static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
+                                              const ggml_tensor *         kernel,  // [KW, KH, IC, OC]
+                                              const ggml_tensor *         src,     // [W, H, C, N]
+                                              ggml_tensor *               dst,     // [OW, OH, OC, N]
+                                              ggml_type                   kernel_type) {
+
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32);
+    GGML_ASSERT(kernel->type == kernel_type);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
+
+    const int32_t stride_x   = dst->op_params[0];
+    const int32_t stride_y   = dst->op_params[1];
+    const int32_t pad_x      = dst->op_params[2];
+    const int32_t pad_y      = dst->op_params[3];
+    const int32_t dilation_x = dst->op_params[4];
+    const int32_t dilation_y = dst->op_params[5];
+
+    const int64_t c_in  = src->ne[2];
+    const int64_t c_out = kernel->ne[3];
+    GGML_ASSERT(c_in == kernel->ne[2]);
+
+    const int64_t src_w = src->ne[0];
+    const int64_t src_h = src->ne[1];
+    const int64_t knl_w = kernel->ne[0];
+    const int64_t knl_h = kernel->ne[1];
+    const int64_t dst_w = dst->ne[0];
+    const int64_t dst_h = dst->ne[1];
+
+    const float * src_data = (float *) src->data;
+    void  * knl_data       = kernel->data;
+    float * dst_data       = (float *) dst->data;
+
+    const int64_t knl_n           = knl_w * knl_h * c_in;
+    const int64_t patch_total     = dst->ne[3] * dst_w * dst_h;
+
+    const int64_t space_per_patch   = knl_n * traits->type_size + c_out * sizeof(float);
+    const int64_t batch_size        = params->wsize / space_per_patch;
+    const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+    const int64_t batch_n           = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+    GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+    void * tmp = params->wdata;
+
+    for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+
+        const int64_t patch_start_batch = batch_i * patches_per_batch;
+        const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch,
+                                              patch_total);
+        const int64_t patch_n           = patch_end_batch - patch_start_batch;
+
+        const int64_t patch_per_thread  = (patch_n + params->nth - 1) / params->nth;
+        const int64_t patch_start       = patch_start_batch + params->ith * patch_per_thread;
+        const int64_t patch_end         = std::min(patch_start + patch_per_thread, patch_end_batch);
+
+        //im2col for a patch
+        for (int64_t p = patch_start; p < patch_end; ++p) {
+            const int64_t  batch_n     =  p / (dst_w * dst_h);
+            const int64_t  src_x       = (p / dst_w) % dst_h;
+            const int64_t  src_y       =  p % dst_w;
+
+            const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]);
+            char *        dst_row  = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size;
+
+            for (int64_t ic = 0; ic < c_in; ++ic) {
+                for (int64_t ky = 0; ky < knl_h; ++ky) {
+                    for (int64_t kx = 0; kx < knl_w; ++kx) {
+                        const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y;
+                        const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x;
+
+                        int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
+
+                        float src_val;
+                        if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+                            src_val = 0.0f;
+                        } else {
+                            const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            src_val               = *src_ptr;
+                        }
+
+                        char * element_ptr = dst_row + dst_idx * traits->type_size;
+                        if (kernel_type == GGML_TYPE_F32) {
+                            *(float *) element_ptr = src_val;
+                        } else if (kernel_type == GGML_TYPE_F16) {
+                            *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val);
+                        }
+                    }
+                }
+            }
+        }   // patches handled by this thread
+
+        ggml_barrier(params->threadpool);
+
+        float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size);
+
+        GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize);
+
+        // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
+        ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output);
+
+        ggml_barrier(params->threadpool);
+
+
+        //permute back [OC, N, OH, OW] to [N, OC, OH, OW]
+        const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
+        const int64_t permute_start = params->ith * permute_per_thread;
+        const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);
+
+        for (int64_t i = permute_start; i < permute_end; ++i) {
+            const int64_t p       = patch_start_batch + i;
+            const int64_t batch_n = p / (dst_w * dst_h);
+            const int64_t dst_y   = (p / dst_w) % dst_h;
+            const int64_t dst_x   = p % dst_w;
+
+            for (int64_t oc = 0; oc < c_out; ++oc) {
+                const float value = gemm_output[i * c_out + oc];
+                float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]);
+                *dst_ptr = value;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
+}
+
 // ggml_compute_forward_conv_transpose_2d
 
 void ggml_compute_forward_conv_transpose_2d(
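Editor's note: one subtlety in the im2col loop above is that, despite the names, src_x holds the output row ((p / dst_w) % dst_h) and src_y the output column (p % dst_w). A self-contained sketch of how I read the flat patch index p, with illustrative sizes:

```cpp
// Decompose a flat patch index p into (image, output row, output column)
// and derive the source sample read for one kernel tap (illustrative sizes).
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t dst_w = 5, dst_h = 4;   // OW, OH
    const int64_t s = 2, d = 1, pad = 1;  // stride, dilation, padding

    const int64_t p_examples[] = {0, 7, 23};
    for (int64_t p : p_examples) {
        const int64_t n  = p / (dst_w * dst_h); // image index in the batch
        const int64_t oy = (p / dst_w) % dst_h; // output row    ("src_x" in the loop)
        const int64_t ox = p % dst_w;           // output column ("src_y" in the loop)

        // source coordinate for kernel tap (kx = 0, ky = 0); taps falling
        // outside the image are written as 0.0f, which implements zero padding
        const int64_t sy = oy * s + 0 * d - pad;
        const int64_t sx = ox * s + 0 * d - pad;
        printf("p=%2lld -> n=%lld oy=%lld ox=%lld reads (sy=%lld, sx=%lld)\n",
               (long long) p, (long long) n, (long long) oy, (long long) ox,
               (long long) sy, (long long) sx);
    }
    return 0;
}
```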
ggml/src/ggml-cpu/ops.h

@@ -20,6 +20,9 @@
 
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
 }
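Editor's note: the newly exported ggml_compute_forward_mul_mat consumes tensors in ggml's usual mul_mat layout, which is what ggml_call_mul_mat in ops.cpp assembles by hand: both operands share ne[0] = K as the contiguous axis, and the destination comes out as [c_out, patch_n] in F32. A sketch of the shape bookkeeping, with hypothetical sizes:

```cpp
// Shape bookkeeping for the GEMM inside the conv op (hypothetical sizes).
// ggml dims are listed fastest-varying first: ne[0] is the contiguous axis.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t patch_n = 10080;     // M: im2col patch rows in this tile
    const int64_t c_out   = 128;       // N: output channels
    const int64_t knl_n   = 3 * 3 * 64;// K: KW*KH*IC, the reduced dimension

    // ggml_call_mul_mat sets up:
    //   src1 (patches) : ne = [knl_n, patch_n] -> an M x K matrix
    //   src0 (kernel)  : ne = [knl_n, c_out]   -> an N x K matrix
    //   dst  (output)  : ne = [c_out, patch_n] -> M x N in F32
    // i.e. dst = src1 * src0^T, each row holding c_out channel values.
    printf("GEMM: [%lld x %lld] * [%lld x %lld]^T = [%lld x %lld]\n",
           (long long) patch_n, (long long) knl_n,
           (long long) c_out,   (long long) knl_n,
           (long long) patch_n, (long long) c_out);
    return 0;
}
```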
ggml/src/ggml.c

@@ -945,6 +945,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CONV_TRANSPOSE_1D",
     "IM2COL",
     "IM2COL_BACK",
+    "CONV_2D",
     "CONV_2D_DW",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
@@ -986,7 +987,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
+static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1044,6 +1045,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "conv_transpose_1d(x)",
     "im2col(x)",
     "im2col_back(x)",
+    "conv_2d(x)",
     "conv_2d_dw(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
@@ -1085,7 +1087,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
+static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4291,6 +4293,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
     return result;
 }
 
+// ggml_conv_2d_direct
+
+struct ggml_tensor * ggml_conv_2d_direct(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+        struct ggml_tensor  * b,   // input data [W, H, C, N]
+        int                   s0,  // stride dimension 0
+        int                   s1,  // stride dimension 1
+        int                   p0,  // padding dimension 0
+        int                   p1,  // padding dimension 1
+        int                   d0,  // dilation dimension 0
+        int                   d1) {// dilation dimension 1
+
+    GGML_ASSERT(a->ne[2] == b->ne[2]);
+    //GGML_ASSERT(a->type == b->type);
+
+    int64_t ne[4];
+    ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+    ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
+    ne[2] = a->ne[3];
+    ne[3] = b->ne[3];
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
+
+    ggml_set_op_params_i32(result, 0, s0);
+    ggml_set_op_params_i32(result, 1, s1);
+    ggml_set_op_params_i32(result, 2, p0);
+    ggml_set_op_params_i32(result, 3, p1);
+    ggml_set_op_params_i32(result, 4, d0);
+    ggml_set_op_params_i32(result, 5, d1);
+
+    result->op = GGML_OP_CONV_2D;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
 // ggml_conv_transpose_2d_p0
 
 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
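Editor's note: ggml_calc_conv_output_size, already present in ggml.c, implements the standard convolution output-size arithmetic. A sketch of that formula with worked numbers (the helper name and the sizes below are illustrative):

```cpp
// Standard conv output-size arithmetic, per spatial dimension:
//   out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1
#include <cstdint>
#include <cstdio>

static int64_t conv_out_size(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}

int main() {
    // 224 input, 3x3 kernel, stride 1, padding 1, dilation 1 -> 224 ("same")
    printf("%lld\n", (long long) conv_out_size(224, 3, 1, 1, 1));
    // stride 2 roughly halves it -> 112
    printf("%lld\n", (long long) conv_out_size(224, 3, 2, 1, 1));
    return 0;
}
```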
Aman Gupta