clip : refactor + bug fixes (#4696)

Author: Georgi Gerganov

* clip : refactor + bug fixes

ggml-ci

* server : add log message
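The headline change is the image API in clip.h/clip.cpp: the exposed structs lose their raw `data`/`size` members in favor of `std::vector` buffers, and creation/destruction goes through matching `clip_image_u8_init()`/`clip_image_u8_free()` pairs (and `_f32_` counterparts). A minimal sketch of the resulting call sequence — the model path, image path, and thread count below are placeholders, not taken from the commit:

```cpp
#include "clip.h"

#include <cstdlib>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity =*/ 1);
    if (!ctx) {
        return 1;
    }

    struct clip_image_u8  * img     = clip_image_u8_init();  // RGB uint8 input
    struct clip_image_f32 * img_res = clip_image_f32_init(); // normalized float32 output

    if (clip_image_load_from_file("input.jpg", img) &&
        clip_image_preprocess(ctx, img, img_res, /*pad2square =*/ true)) {
        float * vec = (float *) malloc(clip_embd_nbytes(ctx));
        if (clip_image_encode(ctx, /*n_threads =*/ 4, img_res, vec)) {
            // vec now holds the projected image embedding
        }
        free(vec);
    }

    // freeing is a plain delete now; the pixel buffers are owned by the std::vector members
    clip_image_f32_free(img_res);
    clip_image_u8_free(img);
    clip_free(ctx);

    return 0;
}
```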
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -146,6 +146,27 @@ static std::string get_ftype(int ftype) {
     }
 }
 
+//
+// image data
+//
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 //
 // clip layers
 //
@@ -207,13 +228,18 @@ struct clip_ctx {
     bool has_text_encoder    = false;
     bool has_vision_encoder  = false;
     bool has_llava_projector = false;
+
     struct clip_vision_model vision_model;
+
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
     int32_t ftype = 1;
-    struct ggml_context * ctx;
+
     struct gguf_context * ctx_gguf;
+    struct ggml_context * ctx_data;
+
+    std::vector<uint8_t> buf_compute_meta;
 
     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer = NULL;
@@ -222,7 +248,7 @@ struct clip_ctx {
     ggml_allocr * compute_alloc = NULL;
 };
 
-static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -246,9 +272,10 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1);
     }
+
     struct ggml_init_params params = {
-        /*.mem_size =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
+        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
+        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
         /*.no_alloc   =*/ true,
     };
 
@@ -272,7 +299,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
                 for (int k = 0; k < 3; k++) {
                     for (int y = 0; y < ny; y++) {
                         for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
+                            data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
                         }
                     }
                 }
@@ -413,7 +440,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
         ggml_allocr_alloc(ctx->compute_alloc, patches);
         if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
             int* patches_data = (int*)malloc(ggml_nbytes(patches));
-            for (int i = 0; i < num_positions; i++) {
+            for (int i = 0; i < num_patches; i++) {
                 patches_data[i] = i + 1;
             }
             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
@@ -561,8 +588,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             /*.no_alloc =*/ true,
         };
 
-        new_clip->ctx = ggml_init(params);
-        if (!new_clip->ctx) {
+        new_clip->ctx_data = ggml_init(params);
+        if (!new_clip->ctx_data) {
             fprintf(stderr, "%s: ggml_init() failed\n", __func__);
             clip_free(new_clip);
             return nullptr;
@@ -579,7 +606,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name(ctx, i);
             struct ggml_tensor * t = ggml_get_tensor(meta, name);
-            struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
+            struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
             ggml_set_name(cur, name);
         }
 
@@ -588,7 +615,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name(ctx, i);
-            struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name);
+            struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
             ggml_allocr_alloc(alloc, cur);
             const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
             fin.seekg(offset, std::ios::beg);
@@ -644,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             printf("v_n_layer          %d\n", hparams.n_layer);
         }
 
-        vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
-        vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
-        vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
-        vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
-        vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
-        vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
-        vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
-        vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
-        vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
+        vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+        vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+        vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+        vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+        vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+        vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+        vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
 
         vision_model.layers.resize(hparams.n_layer);
         for (int il = 0; il < hparams.n_layer; ++il) {
             auto & layer = vision_model.layers[il];
-            layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
-            layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
-            layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
-            layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
-            layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
-            layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
-            layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
-            layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
-            layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
-            layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
-            layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
-            layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
-            layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
-            layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
-            layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
-            layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
+            layer.k_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_K,      "v", il, "weight"));
+            layer.q_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q,      "v", il, "weight"));
+            layer.v_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_V,      "v", il, "weight"));
+            layer.o_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
+            layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1,        "v", il, "weight"));
+            layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2,        "v", il, "weight"));
+            layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN,    "v", il, "weight"));
+            layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP,      "v", il, "weight"));
+            layer.k_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_K,      "v", il, "bias"));
+            layer.q_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q,      "v", il, "bias"));
+            layer.v_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_V,      "v", il, "bias"));
+            layer.o_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
+            layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1,        "v", il, "bias"));
+            layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2,        "v", il, "bias"));
+            layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN,    "v", il, "bias"));
+            layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP,      "v", il, "bias"));
         }
     }
 
@@ -682,6 +709,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
     // measure mem requirement and allocate
     {
+        new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
         new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
         clip_image_f32_batch batch;
         batch.size = 1;
@@ -697,26 +725,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
 
-clip_image_u8 * make_clip_image_u8() {
-    auto img = new clip_image_u8();
-    return img;
+struct clip_image_u8 * clip_image_u8_init() {
+    return new clip_image_u8();
 }
-clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
 
-void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
-void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
+struct clip_image_f32 * clip_image_f32_init() {
+    return new clip_image_f32();
+}
+
+void clip_image_u8_free (struct clip_image_u8  * img) { delete img; }
+void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 
 static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
     img->nx = nx;
     img->ny = ny;
-    img->size = nx * ny * 3;
-    img->data = new uint8_t[img->size]();
-    memcpy(img->data, data, img->size);
+    img->buf.resize(3 * nx * ny);
+    memcpy(img->buf.data(), data, img->buf.size());
 }
 
 bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     int nx, ny, nc;
-    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
+    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
     if (!data) {
         fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
         return false;
@@ -728,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
 
 bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
     int nx, ny, nc;
-    auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
+    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
         fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
         return false;
@@ -740,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 
 // normalize: x = (x - mean) / std
 // TODO: implement bicubic interpolation instead of linear.
-bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -749,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
 
-    clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
+    clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
     if (pad2square && img->nx != img->ny) {
         int longer_side = std::max(img->nx, img->ny);
         temp->nx = longer_side;
         temp->ny = longer_side;
-        temp->size = 3 * longer_side * longer_side;
-        temp->data = new uint8_t[temp->size]();
-        uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
+        temp->buf.resize(3 * longer_side * longer_side);
+        const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
 
         // fill with background color
-        for (size_t i = 0; i < temp->size; i++) {
-            temp->data[i] = bc[i % 3];
+        for (size_t i = 0; i < temp->buf.size(); i++) {
+            temp->buf[i] = bc[i % 3];
         }
 
         // copy from the input image
@@ -768,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
             for (int x = 0; x < img->nx; x++) {
                 const int i = 3 * (y * img->nx + x);
                 const int j = 3 * (y * temp->nx + x);
-                temp->data[j] = img->data[i];
-                temp->data[j+1] = img->data[i+1];
-                temp->data[j+2] = img->data[i+2];
+                temp->buf[j]   = img->buf[i];
+                temp->buf[j+1] = img->buf[i+1];
+                temp->buf[j+2] = img->buf[i+2];
             }
         }
     } else {
         temp->nx = img->nx;
        temp->ny = img->ny;
-        temp->size = img->size;
-        temp->data = new uint8_t[temp->size]();
-        memcpy(&temp->data[0], &img->data[0], temp->size); // copy
+        temp->buf.resize(img->buf.size());
+        memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
     }
 
     const int nx = temp->nx;
@@ -789,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
 
     res->nx = nx2;
     res->ny = ny2;
-    res->size = 3 * nx2 * ny2;
-    res->data = new float[res->size]();
+    res->buf.resize(3 * nx2 * ny2);
 
     const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
 
@@ -821,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
                 const int j10 = 3 * (y1 * nx + x0) + c;
                 const int j11 = 3 * (y1 * nx + x1) + c;
 
-                const float v00 = temp->data[j00];
-                const float v01 = temp->data[j01];
-                const float v10 = temp->data[j10];
-                const float v11 = temp->data[j11];
+                const float v00 = temp->buf[j00];
+                const float v01 = temp->buf[j01];
+                const float v10 = temp->buf[j10];
+                const float v11 = temp->buf[j11];
 
                 const float v0 = v00 * (1.0f - dx) + v01 * dx;
                 const float v1 = v10 * (1.0f - dx) + v11 * dx;
@@ -835,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
 
                 const int i = 3 * (y * nx3 + x) + c;
 
-                res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
+                res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
             }
         }
     }
@@ -845,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
 }
 
 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx);
+    ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
+
     delete ctx;
 }
 
-bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
+bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -862,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
 
-bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
-
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -926,11 +952,12 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             return false;
     };
 
-    auto ctx_clip = clip_model_load(fname_inp, 2);
-    const auto & ctx_src = ctx_clip->ctx_gguf;
-    const auto & ctx_data = ctx_clip->ctx;
+    auto * ctx_clip = clip_model_load(fname_inp, 2);
 
-    auto ctx_out = gguf_init_empty();
+    const auto & ctx_src = ctx_clip->ctx_gguf;
+    const auto & ctx_data = ctx_clip->ctx_data;
+
+    auto * ctx_out = gguf_init_empty();
     gguf_set_kv(ctx_out, ctx_src);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", itype);
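Two fixes ride along with the `ctx` → `ctx_data` rename in clip.cpp. First, the `patches` tensor holds `num_patches` elements, so the old fill loop bounded by `num_positions` (which also counts the class token) appears to have written one `int` past the allocation; the new bound matches the tensor size. Second, `clip_image_build_graph()` no longer sizes a fresh metadata arena on every call: graph metadata is carved out of the persistent `buf_compute_meta`, sized once at load time, which is also why the builder and the encode entry points lose their `const clip_ctx *` qualifier. A sketch of that reuse pattern, reduced to its essentials (the `scratch` struct and function are illustrative, not the commit's types):

```cpp
#include "ggml.h"

#include <cstdint>
#include <vector>

struct scratch {
    std::vector<uint8_t> buf_compute_meta;
};

static struct ggml_cgraph * build_graph(scratch & s) {
    // sized once, e.g. at model load time
    if (s.buf_compute_meta.empty()) {
        s.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
    }

    // every build reuses the same storage instead of allocating a new arena
    struct ggml_init_params params = {
        /*.mem_size   =*/ s.buf_compute_meta.size(),
        /*.mem_buffer =*/ s.buf_compute_meta.data(),
        /*.no_alloc   =*/ true, // metadata only; tensor data lives in backend buffers
    };

    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph  * gf   = ggml_new_graph(ctx0);

    // ... create tensors in ctx0 and expand gf with the model's ops ...

    ggml_free(ctx0); // releases the context object; gf itself lives in buf_compute_meta
    return gf;
}
```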
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -35,31 +35,14 @@ struct clip_vision_hparams {
     float eps;
 };
 
-/** load mmproj model */
-CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
-/** free mmproj model */
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
-size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-int clip_n_patches(const struct clip_ctx * ctx);
-int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-    uint8_t * data = NULL;
-    size_t size;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-    float * data = NULL;
-    size_t size;
-};
+CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
@@ -71,21 +54,22 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-struct clip_image_u8 * make_clip_image_u8();
-struct clip_image_f32 * make_clip_image_f32();
-CLIP_API void clip_image_u8_free(clip_image_u8 * img);
-CLIP_API void clip_image_f32_free(clip_image_f32 * img);
+CLIP_API struct clip_image_u8  * clip_image_u8_init ();
+CLIP_API struct clip_image_f32 * clip_image_f32_init();
+
+CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
+CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
+
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
-bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
-bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
-
-bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
-                             float * vec);
-
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
+CLIP_API bool clip_image_preprocess  (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
+CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 #ifdef __cplusplus
 }
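In clip.h, every public function now carries `CLIP_API` and spells out its `struct` tags, keeping the header consistent for C callers and shared-library exports, and `clip_image_batch_encode` gains the export macro too. As the clip.cpp context lines above show, the single-image encode is just a batch of one; roughly (a paraphrase of the diff — `encode_one` is a hypothetical name, not part of the API):

```cpp
#include "clip.h"

// shape of the single-image wrapper visible in the clip.cpp hunk above
static bool encode_one(struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec) {
    clip_image_f32_batch imgs;
    imgs.size = 1;
    imgs.data = img;
    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
}
```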
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -10,7 +10,7 @@
 #include "base64.hpp"
 
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    clip_image_f32 * img_res = make_clip_image_f32();
+    clip_image_f32 * img_res = clip_image_f32_init();
     if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
         clip_image_f32_free(img_res);
@@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
 }
 
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
-    clip_image_u8 * img = make_clip_image_u8();
+    clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
         fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
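The two llava.cpp call sites only track the renamed initializers; the llava.h surface is unchanged. For orientation, this is roughly how the byte-level entry point is meant to be driven (assuming the llava.h signatures of this revision; `eval_one_image` and the thread count are illustrative):

```cpp
#include "clip.h"
#include "llava.h"
#include "llama.h"

// decode image bytes, embed them with CLIP, and feed the embedding into a llama context
static bool eval_one_image(struct clip_ctx * ctx_clip, struct llama_context * ctx_llama,
                           const unsigned char * bytes, int n_bytes, int n_batch, int * n_past) {
    struct llava_image_embed * embed =
        llava_image_embed_make_with_bytes(ctx_clip, /*n_threads =*/ 4, bytes, n_bytes);
    if (!embed) {
        return false;
    }

    const bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);

    llava_image_embed_free(embed);
    return ok;
}
```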
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -82,7 +82,7 @@ static inline bool is_base64(uint8_t c)
     return (isalnum(c) || (c == '+') || (c == '/'));
 }
 
-static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
+static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
 {
     int i = 0;
     int j = 0;
@@ -212,7 +212,7 @@ struct slot_image
     float * image_embedding = nullptr;
     int32_t image_tokens = 0;
 
-    clip_image_u8 img_data;
+    clip_image_u8 * img_data;
 
     std::string prefix_prompt; // before of this image
 };
@@ -437,7 +437,9 @@ struct llama_client_slot
         for (slot_image & img : images)
         {
             free(img.image_embedding);
-            delete[] img.img_data.data;
+            if (img.img_data) {
+                clip_image_u8_free(img.img_data);
+            }
             img.prefix_prompt = "";
         }
 
@@ -851,24 +853,17 @@ struct llama_server_context
             {
                 for (const auto &img : *images_data)
                 {
-                    std::string data_b64 = img["data"].get<std::string>();
+                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
+
                     slot_image img_sl;
                     img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    int width, height, channels;
-                    std::vector<uint8_t> image_buffer = base64_decode(data_b64);
-                    data_b64.clear();
-                    auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
-                    if (!data) {
+                    img_sl.img_data = clip_image_u8_init();
+                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
+                    {
                         LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
                         return false;
                     }
-                    LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
-                    img_sl.img_data.nx = width;
-                    img_sl.img_data.ny = height;
-                    img_sl.img_data.size = width * height * 3;
-                    img_sl.img_data.data = new uint8_t[width * height * 3]();
-                    memcpy(img_sl.img_data.data, data, width * height * 3);
-                    stbi_image_free(data);
+                    LOG_TEE("slot %i - loaded image\n", slot->id);
                     img_sl.request_encode_image = true;
                     slot->images.push_back(img_sl);
                 }
@@ -1143,8 +1138,8 @@ struct llama_server_context
             {
                 continue;
             }
-            clip_image_f32 img_res;
-            if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
+            clip_image_f32 * img_res = clip_image_f32_init();
+            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
             {
                 LOG_TEE("Error processing the given image");
                 clip_free(clp_ctx);
@@ -1159,11 +1154,12 @@ struct llama_server_context
                 return false;
             }
             LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-            if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
+            if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
             {
                 LOG_TEE("Unable to encode image\n");
                 return false;
             }
+            clip_image_f32_free(img_res);
             img.request_encode_image = false;
         }
 
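The server changes close two leaks: `slot_image` now owns its input as a heap `clip_image_u8 *` released through `clip_image_u8_free()`, and the preprocessed `img_res` is freed right after encoding (the old stack-allocated `clip_image_f32` never released its float buffer). Since the callers are C++, a `std::unique_ptr` with custom deleters would enforce the same discipline at compile time; a sketch of that alternative (a hardening idea, not what the commit does):

```cpp
#include "clip.h"

#include <memory>

// unique_ptr wrappers over the C-style init/free pairs from clip.h
struct clip_image_u8_deleter  { void operator()(clip_image_u8  * p) const { clip_image_u8_free(p);  } };
struct clip_image_f32_deleter { void operator()(clip_image_f32 * p) const { clip_image_f32_free(p); } };

using clip_image_u8_ptr  = std::unique_ptr<clip_image_u8,  clip_image_u8_deleter>;
using clip_image_f32_ptr = std::unique_ptr<clip_image_f32, clip_image_f32_deleter>;

static bool encode_slot_image(struct clip_ctx * clp_ctx, int n_threads,
                              const clip_image_u8 * img_data, float * embedding) {
    clip_image_f32_ptr img_res(clip_image_f32_init());
    if (!clip_image_preprocess(clp_ctx, img_data, img_res.get(), /*pad2square =*/ true)) {
        return false; // img_res is freed automatically on every exit path
    }
    return clip_image_encode(clp_ctx, n_threads, img_res.get(), embedding);
}
```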