llava : support v1.6 (#5267)
* Create llava-survery-v2.py
* Update convert-image-encoder-to-gguf.py
* Update convert-image-encoder-to-gguf.py
* Rename llava-survery-v2.py to llava-surgery-v2.py
* Update convert-image-encoder-to-gguf.py: will now search for projector
* Update convert-image-encoder-to-gguf.py: whoops
* Update llava-surgery-v2.py
* Clip: bugfix for normalization (it did not load the 3 std and mean values)
  Clip: bicubic resize function
  Clip: added save-to-bmp/pil for debugging and conversion from/to 32/8 images
  Clip: added normalization with FP16 precision simulation (image tensors match the HF implementation, can be switched off, only used for llava-1.6)
  Clip: added newline tensor, mergetype kv, image-grid kv, new resize-pad function with resolution from gridpoints
  Clip: clip_image_preprocess now returns a float * vector instead of float, so that both llava-1.5 and llava-1.6 are supported
  llava: added ggml cpu graph for embedding patching, added preliminary spatial_unpad support, added a lot of comments that need to be cleaned when all is final
  convert-image-encoder: fixed image-grid flattening
* whitespace corrections
* ws
* Tensors are now properly permuted. Before, the embeddings were inserted 1:1; now they are split into the 24x24 patches as in the reference.
* ws
* added verbose_prompt support into cli
  added stopwords for llava-1.6 into cli
* moved llava functions to llava.cpp, made clip.h a C-compatible API, replaced vector-style functions with pointers, added a debug define to remove functions from compilation while not needed
* ws
* convert : skip unknown tensors (needed for LLaVA)
* llava : update readme
* llava : fix compile warnings
* llava : style
* convert : add --skip-unknown CLI arg
* server : remove clip structs
* bugfix for non-llava-1.6: it should now work with llava-1.5 as well
* clip : minor code rearrange
* llava : update readme a bit

---------

Co-authored-by: John <cmt-nct@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
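The diff below changes `clip_image_preprocess` to fill a `clip_image_f32_batch`: a single normalized tensor for llava-1.5, or one tensor per grid patch plus the resized original for llava-1.6's "spatial_unpad"/"anyres" path. The following is a minimal caller sketch, not part of the commit: it only uses functions and fields visible in the diff (`clip_image_load_from_bytes`, `clip_image_preprocess`, `clip_image_encode`, `clip_embd_nbytes`, the batch's `data`/`size` pair), it assumes the `clip_image_u8`/`clip_image_f32` layouts shown in the diff are visible to the caller, and it assumes the trailing destination parameter of `clip_image_load_from_bytes` that the hunk header truncates. The commit's real consumer of this path, including the image_newline tensor and the spatial-unpad embedding graph, lives in llava.cpp and is not reproduced here.

```cpp
// Hypothetical sketch of driving the new batch preprocessing API (not part of the commit).
#include "clip.h"

#include <cstddef>
#include <vector>

static bool encode_image_patches(struct clip_ctx * ctx_clip,
                                 const unsigned char * image_bytes, size_t n_bytes,
                                 int n_threads,
                                 std::vector<std::vector<float>> & embeddings) {
    clip_image_u8 * img = clip_image_u8_init();
    // destination image parameter assumed; the diff only shows the first two arguments
    if (!clip_image_load_from_bytes(image_bytes, n_bytes, img)) {
        clip_image_u8_free(img);
        return false;
    }

    // llava-1.5 yields one normalized tensor; llava-1.6 ("spatial_unpad" + "anyres")
    // yields one tensor per grid patch plus the downscaled original image
    clip_image_f32_batch batch;
    batch.size = 0;
    batch.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, batch)) {
        clip_image_u8_free(img);
        return false;
    }

    const size_t n_embd_floats = clip_embd_nbytes(ctx_clip) / sizeof(float);
    embeddings.resize(batch.size);

    bool ok = true;
    for (size_t i = 0; i < batch.size && ok; ++i) {
        embeddings[i].resize(n_embd_floats);
        ok = clip_image_encode(ctx_clip, n_threads, &batch.data[i], embeddings[i].data());
    }

    // cleanup mirrors the stale-batch handling inside clip_image_preprocess in this diff
    for (size_t i = 0; i < batch.size; ++i) {
        clip_image_f32_free(&batch.data[i]);
    }
    delete[] batch.data;
    clip_image_u8_free(img);
    return ok;
}
```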
							| @@ -1173,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM | ||||
|             for (name, tensor) in model.items()} | ||||
|  | ||||
|  | ||||
| def convert_model_names(model: LazyModel, params: Params) -> LazyModel: | ||||
| def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel: | ||||
|     tmap = gguf.TensorNameMap(ARCH, params.n_layer) | ||||
|     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) | ||||
|  | ||||
| @@ -1199,7 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: | ||||
|     for name, lazy_tensor in model.items(): | ||||
|         tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) | ||||
|         if name_new is None: | ||||
|             raise Exception(f"Unexpected tensor name: {name}") | ||||
|             if skip_unknown: | ||||
|                 print(f"Unexpected tensor name: {name} - skipping") | ||||
|                 continue | ||||
|             else: | ||||
|                 raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") | ||||
|  | ||||
|         if tensor_type in should_skip: | ||||
|             print(f"skipping tensor {name_new}") | ||||
| @@ -1377,19 +1381,20 @@ def main(args_in: list[str] | None = None) -> None: | ||||
|         output_choices.append("q8_0") | ||||
|     vocab_types = ["spm", "bpe", "hfft"] | ||||
|     parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") | ||||
|     parser.add_argument("--awq-path",    type=Path,              help="Path to scale awq cache file", default=None) | ||||
|     parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model") | ||||
|     parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file") | ||||
|     parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab") | ||||
|     parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") | ||||
|     parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file") | ||||
|     parser.add_argument("--vocab-type",  choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") | ||||
|     parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input") | ||||
|     parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") | ||||
|     parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)") | ||||
|     parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) | ||||
|     parser.add_argument("--big-endian",  action="store_true",    help="model is executed on big endian machine") | ||||
|     parser.add_argument("--pad-vocab",   action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides") | ||||
|     parser.add_argument("--awq-path",     type=Path,              help="Path to scale awq cache file", default=None) | ||||
|     parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model") | ||||
|     parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file") | ||||
|     parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab") | ||||
|     parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") | ||||
|     parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file") | ||||
|     parser.add_argument("--vocab-type",   choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") | ||||
|     parser.add_argument("--outfile",      type=Path,              help="path to write to; default: based on input") | ||||
|     parser.add_argument("model",          type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") | ||||
|     parser.add_argument("--ctx",          type=int,               help="model training context (default: based on input)") | ||||
|     parser.add_argument("--concurrency",  type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) | ||||
|     parser.add_argument("--big-endian",   action="store_true",    help="model is executed on big endian machine") | ||||
|     parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides") | ||||
|     parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing") | ||||
|  | ||||
|     args = parser.parse_args(args_in) | ||||
|     if args.awq_path: | ||||
| @@ -1461,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None: | ||||
|     print(f"Special vocab info: {special_vocab}") | ||||
|  | ||||
|     model   = model_plus.model | ||||
|     model   = convert_model_names(model, params) | ||||
|     model   = convert_model_names(model, params, args.skip_unknown) | ||||
|     ftype   = pick_output_type(model, args.outtype) | ||||
|     model   = convert_to_output_type(model, ftype) | ||||
|     outfile = args.outfile or default_outfile(model_plus.paths, ftype) | ||||
|   | ||||
| @@ -19,9 +19,9 @@ After building, run: `./llava-cli` to see the usage. For example: | ||||
|  | ||||
| **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. | ||||
|  | ||||
| ## Model conversion | ||||
| ## LLaVA 1.5 | ||||
|  | ||||
| - Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally: | ||||
| - Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: | ||||
|  | ||||
| ```sh | ||||
| git clone https://huggingface.co/liuhaotian/llava-v1.5-7b | ||||
| @@ -55,8 +55,14 @@ python ./convert.py ../llava-v1.5-7b | ||||
|  | ||||
| Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. | ||||
|  | ||||
| ## LLaVA 1.6 | ||||
|  | ||||
| - Use `llava-surgery-v2.py` | ||||
|  | ||||
| - TODO: add detailed instructions | ||||
|  | ||||
| ## TODO | ||||
|  | ||||
| - [ ] Support non-CPU backend for the image encoding part. | ||||
| - [x] Support non-CPU backend for the image encoding part. | ||||
| - [ ] Support different sampling methods. | ||||
| - [ ] Support more model variants. | ||||
|   | ||||
| @@ -1,7 +1,7 @@ | ||||
| // NOTE: This is modified from clip.cpp only for LLaVA, | ||||
| // so there might be still unnecessary artifacts hanging around | ||||
| // I'll gradually clean and extend it | ||||
|  | ||||
| // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch | ||||
| #include "clip.h" | ||||
| #include "ggml.h" | ||||
| #include "ggml-alloc.h" | ||||
| @@ -30,6 +30,26 @@ | ||||
| #include <vector> | ||||
| #include <sstream> | ||||
| #include <cinttypes> | ||||
| #include <limits> | ||||
|  | ||||
| //#define CLIP_DEBUG_FUNCTIONS | ||||
|  | ||||
| // RGB uint8 image | ||||
| struct clip_image_u8 { | ||||
|     int nx; | ||||
|     int ny; | ||||
|  | ||||
|     std::vector<uint8_t> buf; | ||||
| }; | ||||
|  | ||||
| // RGB float32 image (NHWC) | ||||
| // Memory layout: RGBRGBRGB... | ||||
| struct clip_image_f32 { | ||||
|     int nx; | ||||
|     int ny; | ||||
|  | ||||
|     std::vector<float> buf; | ||||
| }; | ||||
|  | ||||
| static std::string format(const char * fmt, ...) { | ||||
|     va_list ap; | ||||
| @@ -50,50 +70,56 @@ static std::string format(const char * fmt, ...) { | ||||
| // key constants | ||||
| // | ||||
|  | ||||
| #define KEY_FTYPE "general.file_type" | ||||
| #define KEY_NAME "general.name" | ||||
| #define KEY_DESCRIPTION "general.description" | ||||
| #define KEY_HAS_TEXT_ENC "clip.has_text_encoder" | ||||
| #define KEY_HAS_VIS_ENC "clip.has_vision_encoder" | ||||
| #define KEY_FTYPE          "general.file_type" | ||||
| #define KEY_NAME           "general.name" | ||||
| #define KEY_DESCRIPTION    "general.description" | ||||
| #define KEY_HAS_TEXT_ENC   "clip.has_text_encoder" | ||||
| #define KEY_HAS_VIS_ENC    "clip.has_vision_encoder" | ||||
| #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" | ||||
| #define KEY_USE_GELU "clip.use_gelu" | ||||
| #define KEY_N_EMBD "clip.%s.embedding_length" | ||||
| #define KEY_N_FF "clip.%s.feed_forward_length" | ||||
| #define KEY_N_BLOCK "clip.%s.block_count" | ||||
| #define KEY_N_HEAD "clip.%s.attention.head_count" | ||||
| #define KEY_USE_GELU       "clip.use_gelu" | ||||
| #define KEY_N_EMBD         "clip.%s.embedding_length" | ||||
| #define KEY_N_FF           "clip.%s.feed_forward_length" | ||||
| #define KEY_N_BLOCK        "clip.%s.block_count" | ||||
| #define KEY_N_HEAD         "clip.%s.attention.head_count" | ||||
| #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" | ||||
| #define KEY_PROJ_DIM "clip.%s.projection_dim" | ||||
| #define KEY_TOKENS "tokenizer.ggml.tokens" | ||||
| #define KEY_N_POSITIONS "clip.text.context_length" | ||||
| #define KEY_IMAGE_SIZE "clip.vision.image_size" | ||||
| #define KEY_PATCH_SIZE "clip.vision.patch_size" | ||||
| #define KEY_IMAGE_MEAN "clip.vision.image_mean" | ||||
| #define KEY_IMAGE_STD "clip.vision.image_std" | ||||
| #define KEY_PROJ_TYPE "clip.projector_type" | ||||
| #define KEY_PROJ_DIM       "clip.%s.projection_dim" | ||||
| #define KEY_TOKENS         "tokenizer.ggml.tokens" | ||||
| #define KEY_N_POSITIONS    "clip.text.context_length" | ||||
| #define KEY_IMAGE_SIZE     "clip.vision.image_size" | ||||
| #define KEY_PATCH_SIZE     "clip.vision.patch_size" | ||||
| #define KEY_IMAGE_MEAN     "clip.vision.image_mean" | ||||
| #define KEY_IMAGE_STD      "clip.vision.image_std" | ||||
| #define KEY_PROJ_TYPE      "clip.projector_type" | ||||
|  | ||||
| #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type" | ||||
| #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints" | ||||
| #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" | ||||
|  | ||||
|  | ||||
| // | ||||
| // tensor name constants | ||||
| // | ||||
|  | ||||
| #define TN_TOKEN_EMBD "%s.token_embd.weight" | ||||
| #define TN_POS_EMBD "%s.position_embd.weight" | ||||
| #define TN_CLASS_EMBD "v.class_embd" | ||||
| #define TN_PATCH_EMBD "v.patch_embd.weight" | ||||
| #define TN_ATTN_K "%s.blk.%d.attn_k.%s" | ||||
| #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" | ||||
| #define TN_ATTN_V "%s.blk.%d.attn_v.%s" | ||||
| #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" | ||||
| #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" | ||||
| #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" | ||||
| #define TN_LN_1 "%s.blk.%d.ln1.%s" | ||||
| #define TN_LN_2 "%s.blk.%d.ln2.%s" | ||||
| #define TN_LN_PRE "%s.pre_ln.%s" | ||||
| #define TN_LN_POST "%s.post_ln.%s" | ||||
| #define TN_TEXT_PROJ "text_projection.weight" | ||||
| #define TN_VIS_PROJ "visual_projection.weight" | ||||
| #define TN_LLAVA_PROJ "mm.%d.%s" | ||||
| #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" | ||||
| #define TN_TOKEN_EMBD      "%s.token_embd.weight" | ||||
| #define TN_POS_EMBD        "%s.position_embd.weight" | ||||
| #define TN_CLASS_EMBD      "v.class_embd" | ||||
| #define TN_PATCH_EMBD      "v.patch_embd.weight" | ||||
| #define TN_ATTN_K          "%s.blk.%d.attn_k.%s" | ||||
| #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s" | ||||
| #define TN_ATTN_V          "%s.blk.%d.attn_v.%s" | ||||
| #define TN_ATTN_OUTPUT     "%s.blk.%d.attn_out.%s" | ||||
| #define TN_FFN_DOWN        "%s.blk.%d.ffn_down.%s" | ||||
| #define TN_FFN_UP          "%s.blk.%d.ffn_up.%s" | ||||
| #define TN_LN_1            "%s.blk.%d.ln1.%s" | ||||
| #define TN_LN_2            "%s.blk.%d.ln2.%s" | ||||
| #define TN_LN_PRE          "%s.pre_ln.%s" | ||||
| #define TN_LN_POST         "%s.post_ln.%s" | ||||
| #define TN_TEXT_PROJ       "text_projection.weight" | ||||
| #define TN_VIS_PROJ        "visual_projection.weight" | ||||
| #define TN_LLAVA_PROJ      "mm.%d.%s" | ||||
| #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s" | ||||
| #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" | ||||
| #define TN_IMAGE_NEWLINE   "model.image_newline" | ||||
|  | ||||
|  | ||||
| enum projector_type { | ||||
| @@ -104,8 +130,8 @@ enum projector_type { | ||||
| }; | ||||
|  | ||||
| static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = { | ||||
|     { PROJECTOR_TYPE_MLP,           "mlp"     }, | ||||
|     { PROJECTOR_TYPE_LDP,          "ldp"    }, | ||||
|     { PROJECTOR_TYPE_MLP, "mlp" }, | ||||
|     { PROJECTOR_TYPE_LDP, "ldp" }, | ||||
| }; | ||||
|  | ||||
|  | ||||
| @@ -165,7 +191,6 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| static void replace_all(std::string & s, const std::string & search, const std::string & replace) { | ||||
|     std::string result; | ||||
|     for (size_t pos = 0; ; pos += search.length()) { | ||||
| @@ -217,7 +242,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") { | ||||
| static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { | ||||
|     size_t tensor_size = ggml_nbytes(tensor); | ||||
|     printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", | ||||
|             prefix, ggml_n_dims(tensor), tensor->name, tensor_size, | ||||
| @@ -233,31 +258,136 @@ static projector_type clip_projector_type_from_string(const std::string & name) | ||||
|     return PROJECTOR_TYPE_UNKNOWN; | ||||
| } | ||||
|  | ||||
| // | ||||
| // image data | ||||
| // | ||||
| #ifdef CLIP_DEBUG_FUNCTIONS | ||||
| static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { | ||||
|     std::ofstream file(filename, std::ios::binary); | ||||
|     if (!file.is_open()) { | ||||
|         std::cerr << "Failed to open file for writing: " << filename << std::endl; | ||||
|         return; | ||||
|     } | ||||
|  | ||||
| // RGB uint8 image | ||||
| struct clip_image_u8 { | ||||
|     int nx; | ||||
|     int ny; | ||||
|     // PPM header: P6 format, width, height, and max color value | ||||
|     file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; | ||||
|  | ||||
|     std::vector<uint8_t> buf; | ||||
| }; | ||||
|     // Write pixel data | ||||
|     for (size_t i = 0; i < img.buf.size(); i += 3) { | ||||
|         // PPM expects binary data in RGB format, which matches our image buffer | ||||
|         file.write(reinterpret_cast<const char*>(&img.buf[i]), 3); | ||||
|     } | ||||
|  | ||||
| // RGB float32 image (NHWC) | ||||
| // Memory layout: RGBRGBRGB... | ||||
| struct clip_image_f32 { | ||||
|     int nx; | ||||
|     int ny; | ||||
|     file.close(); | ||||
| } | ||||
|  | ||||
| static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { | ||||
|     std::ofstream file(filename, std::ios::binary); | ||||
|     if (!file.is_open()) { | ||||
|         std::cerr << "Failed to open file for writing: " << filename << std::endl; | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data | ||||
|     int bytesPerPixel = 3; | ||||
|     int widthInBytes = img.nx * bytesPerPixel; | ||||
|     int paddingAmount = (4 - (widthInBytes % 4)) % 4; | ||||
|     int stride = widthInBytes + paddingAmount; | ||||
|  | ||||
|     // Bitmap file header | ||||
|     unsigned char fileHeader[14] = { | ||||
|         'B','M',     // Signature | ||||
|         0,0,0,0,    // Image file size in bytes | ||||
|         0,0,0,0,    // Reserved | ||||
|         54,0,0,0    // Start of pixel array | ||||
|     }; | ||||
|  | ||||
|     // Total file size | ||||
|     fileSize = 54 + (stride * img.ny); | ||||
|     fileHeader[2] = (unsigned char)(fileSize); | ||||
|     fileHeader[3] = (unsigned char)(fileSize >> 8); | ||||
|     fileHeader[4] = (unsigned char)(fileSize >> 16); | ||||
|     fileHeader[5] = (unsigned char)(fileSize >> 24); | ||||
|  | ||||
|     // Bitmap information header (BITMAPINFOHEADER) | ||||
|     unsigned char infoHeader[40] = { | ||||
|         40,0,0,0,   // Size of this header (40 bytes) | ||||
|         0,0,0,0,    // Image width | ||||
|         0,0,0,0,    // Image height | ||||
|         1,0,        // Number of color planes | ||||
|         24,0,       // Bits per pixel | ||||
|         0,0,0,0,    // No compression | ||||
|         0,0,0,0,    // Image size (can be 0 for no compression) | ||||
|         0,0,0,0,    // X pixels per meter (not specified) | ||||
|         0,0,0,0,    // Y pixels per meter (not specified) | ||||
|         0,0,0,0,    // Total colors (color table not used) | ||||
|         0,0,0,0     // Important colors (all are important) | ||||
|     }; | ||||
|  | ||||
|     // Width and height in the information header | ||||
|     infoHeader[4] = (unsigned char)(img.nx); | ||||
|     infoHeader[5] = (unsigned char)(img.nx >> 8); | ||||
|     infoHeader[6] = (unsigned char)(img.nx >> 16); | ||||
|     infoHeader[7] = (unsigned char)(img.nx >> 24); | ||||
|     infoHeader[8] = (unsigned char)(img.ny); | ||||
|     infoHeader[9] = (unsigned char)(img.ny >> 8); | ||||
|     infoHeader[10] = (unsigned char)(img.ny >> 16); | ||||
|     infoHeader[11] = (unsigned char)(img.ny >> 24); | ||||
|  | ||||
|     // Write file headers | ||||
|     file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader)); | ||||
|     file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader)); | ||||
|  | ||||
|     // Pixel data | ||||
|     std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row | ||||
|     for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top | ||||
|         for (int x = 0; x < img.nx; ++x) { | ||||
|             // Each pixel | ||||
|             size_t pixelIndex = (y * img.nx + x) * 3; | ||||
|             unsigned char pixel[3] = { | ||||
|                 img.buf[pixelIndex + 2], // BMP stores pixels in BGR format | ||||
|                 img.buf[pixelIndex + 1], | ||||
|                 img.buf[pixelIndex] | ||||
|             }; | ||||
|             file.write(reinterpret_cast<char*>(pixel), 3); | ||||
|         } | ||||
|         // Write padding for the row | ||||
|         file.write(reinterpret_cast<char*>(padding.data()), paddingAmount); | ||||
|     } | ||||
|  | ||||
|     file.close(); | ||||
| } | ||||
|  | ||||
| // debug function to convert f32 to u8 | ||||
| static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { | ||||
|     dst.nx = src.nx; | ||||
|     dst.ny = src.ny; | ||||
|     dst.buf.resize(3 * src.nx * src.ny); | ||||
|     for (size_t i = 0; i < src.buf.size(); ++i) { | ||||
|         dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); | ||||
|     } | ||||
| } | ||||
| #endif | ||||
|  | ||||
|     std::vector<float> buf; | ||||
| }; | ||||
|  | ||||
| // | ||||
| // clip layers | ||||
| // | ||||
|  | ||||
| struct clip_hparams { | ||||
|     int32_t image_size; | ||||
|     int32_t patch_size; | ||||
|     int32_t hidden_size; | ||||
|     int32_t n_intermediate; | ||||
|     int32_t projection_dim; | ||||
|     int32_t n_head; | ||||
|     int32_t n_layer; | ||||
|  | ||||
|     float eps; | ||||
|  | ||||
|     char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) | ||||
|  | ||||
|     int32_t image_grid_pinpoints[32]; | ||||
|     int32_t image_crop_resolution; | ||||
| }; | ||||
|  | ||||
| struct clip_layer { | ||||
|     // attention | ||||
|     struct ggml_tensor * k_w; | ||||
| @@ -287,7 +417,7 @@ struct clip_layer { | ||||
| }; | ||||
|  | ||||
| struct clip_vision_model { | ||||
|     struct clip_vision_hparams hparams; | ||||
|     struct clip_hparams hparams; | ||||
|  | ||||
|     // embeddings | ||||
|     struct ggml_tensor * class_embedding; | ||||
| @@ -310,6 +440,8 @@ struct clip_vision_model { | ||||
|     struct ggml_tensor * mm_2_w = NULL; | ||||
|     struct ggml_tensor * mm_2_b = NULL; | ||||
|  | ||||
|     struct ggml_tensor * image_newline = NULL; | ||||
|  | ||||
|     // Yi type models with mlp+normalization projection | ||||
|     struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4 | ||||
|     struct ggml_tensor * mm_1_b = NULL; | ||||
| @@ -364,9 +496,10 @@ struct clip_ctx { | ||||
|     std::vector<uint8_t> buf_compute_meta; | ||||
|  | ||||
|     // memory buffers to evaluate the model | ||||
|     ggml_backend_buffer_t params_buffer = NULL; | ||||
|     ggml_backend_buffer_t params_buffer  = NULL; | ||||
|     ggml_backend_buffer_t compute_buffer = NULL; | ||||
|     ggml_backend_t backend = NULL; | ||||
|  | ||||
|     ggml_backend_t backend       = NULL; | ||||
|     ggml_gallocr_t compute_alloc = NULL; | ||||
| }; | ||||
|  | ||||
| @@ -379,18 +512,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 | ||||
|     const auto & model = ctx->vision_model; | ||||
|     const auto & hparams = model.hparams; | ||||
|  | ||||
|     const int image_size = hparams.image_size; | ||||
|     const int patch_size = hparams.patch_size; | ||||
|     const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); | ||||
|     const int num_positions = num_patches + 1; | ||||
|     const int hidden_size = hparams.hidden_size; | ||||
|     const int n_head = hparams.n_head; | ||||
|     const int d_head = hidden_size / n_head; | ||||
|     const int n_layer = hparams.n_layer; | ||||
|     //const int n_intermediate = hparams.n_intermediate; | ||||
|     //const int projection_dim = hparams.projection_dim; | ||||
|     const float eps = hparams.eps; | ||||
|     int batch_size = imgs->size; | ||||
|     const int image_size           = hparams.image_size; | ||||
|     const int patch_size           = hparams.patch_size; | ||||
|     const int num_patches          = ((image_size / patch_size) * (image_size / patch_size)); | ||||
|     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); | ||||
|     const int num_positions        = num_patches + 1; | ||||
|     const int hidden_size          = hparams.hidden_size; | ||||
|     const int n_head               = hparams.n_head; | ||||
|     const int d_head               = hidden_size / n_head; | ||||
|     const int n_layer              = hparams.n_layer; | ||||
|     const float eps                = hparams.eps; | ||||
|  | ||||
|     const int batch_size = imgs->size; | ||||
|  | ||||
|     if (ctx->has_llava_projector) { | ||||
|         GGML_ASSERT(batch_size == 1); | ||||
|     } | ||||
| @@ -540,7 +674,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 | ||||
|             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); | ||||
|  | ||||
|             embeddings = ggml_gelu(ctx0, embeddings); | ||||
|  | ||||
|             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); | ||||
|             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); | ||||
|  | ||||
| @@ -791,10 +924,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { | ||||
|         if (idx != -1) { | ||||
|             const std::string proj_type = gguf_get_val_str(ctx, idx); | ||||
|             new_clip->proj_type = clip_projector_type_from_string(proj_type); | ||||
|         } | ||||
|         else { | ||||
|         } else { | ||||
|             new_clip->proj_type = PROJECTOR_TYPE_MLP; | ||||
|         } | ||||
|  | ||||
|         if (new_clip->proj_type == PROJECTOR_TYPE_MLP) { | ||||
|             if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) { | ||||
|                 new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM; | ||||
| @@ -920,11 +1053,41 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { | ||||
|         hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision")); | ||||
|         hparams.eps            = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision")); | ||||
|  | ||||
|         try { | ||||
|             int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); | ||||
|             int n = gguf_get_arr_n(ctx, idx); | ||||
|             const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); | ||||
|             for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { | ||||
|                 hparams.image_grid_pinpoints[i] = pinpoints[i]; | ||||
|             } | ||||
|             if (n < 32) | ||||
|                 hparams.image_grid_pinpoints[n] = 0; | ||||
|         } catch (std::runtime_error & e) { | ||||
|             hparams.image_grid_pinpoints[0]=0; | ||||
|         } | ||||
|  | ||||
|         try { | ||||
|             int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); | ||||
|             strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); | ||||
|         } catch (std::runtime_error & e) { | ||||
|             strcpy(hparams.mm_patch_merge_type, "flat"); | ||||
|         } | ||||
|  | ||||
|         try { | ||||
|             hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 | ||||
|         } catch(const std::exception& e) { | ||||
|             hparams.image_crop_resolution = hparams.image_size; | ||||
|         } | ||||
|  | ||||
|         int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN); | ||||
|         int idx_std  = get_key_idx(ctx, KEY_IMAGE_STD); | ||||
|  | ||||
|         const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean); | ||||
|         const float * std_data  = (const float *)gguf_get_arr_data(ctx, idx_std); | ||||
|  | ||||
|         for (int i = 0; i < 3; ++i) { | ||||
|             new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean)); | ||||
|             new_clip->image_std[i]  = *((const float *)gguf_get_arr_data(ctx, idx_std)); | ||||
|             new_clip->image_mean[i] = mean_data[i]; | ||||
|             new_clip->image_std[i]  = std_data[i]; | ||||
|         } | ||||
|  | ||||
|         if (verbosity >= 2) { | ||||
| @@ -936,13 +1099,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { | ||||
|             printf("v_projection_dim   %d\n", hparams.projection_dim); | ||||
|             printf("v_n_head           %d\n", hparams.n_head); | ||||
|             printf("v_n_layer          %d\n", hparams.n_layer); | ||||
|             printf("v_eps              %f\n", hparams.eps); | ||||
|             printf("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); | ||||
|             printf("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); | ||||
|             printf("v_image_grid_pinpoints: "); | ||||
|             for (int i = 0; i < 32 & hparams.image_grid_pinpoints[i]!=0; ++i) { | ||||
|                 printf("%d ", hparams.image_grid_pinpoints[i]); | ||||
|             } | ||||
|             printf("\n"); | ||||
|             printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); | ||||
|  | ||||
|         } | ||||
|  | ||||
|         vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); | ||||
|         vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); | ||||
|         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); | ||||
|         vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); | ||||
|         vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); | ||||
|         try { | ||||
|             vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); | ||||
|             vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); | ||||
|             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); | ||||
|             vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); | ||||
|             vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); | ||||
|         } catch(const std::exception& e) { | ||||
|             fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); | ||||
|         } | ||||
|  | ||||
|         // LLaVA projection | ||||
|         if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) { | ||||
| @@ -968,40 +1145,43 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { | ||||
|                 vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); | ||||
|                 vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); | ||||
|             } catch (std::runtime_error & e) {  } | ||||
|         } | ||||
|         else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { | ||||
|             try { | ||||
|                 vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); | ||||
|                 // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); | ||||
|             } catch (std::runtime_error & e) {  } | ||||
|         } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { | ||||
|             // MobileVLM projection | ||||
|             vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); | ||||
|             vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); | ||||
|             vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); | ||||
|             vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); | ||||
|             vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); | ||||
|             vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); | ||||
|             vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); | ||||
|             vision_model.mm_model_mlp_1_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); | ||||
|             vision_model.mm_model_mlp_1_b               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias")); | ||||
|             vision_model.mm_model_mlp_3_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight")); | ||||
|             vision_model.mm_model_mlp_3_b               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias")); | ||||
|             vision_model.mm_model_block_1_block_0_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); | ||||
|             vision_model.mm_model_block_1_block_0_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); | ||||
|             vision_model.mm_model_block_1_block_0_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); | ||||
|             vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); | ||||
|             vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); | ||||
|             vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); | ||||
|             vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); | ||||
|             vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); | ||||
|             vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); | ||||
|             vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); | ||||
|             vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); | ||||
|             vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); | ||||
|             vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); | ||||
|             vision_model.mm_model_block_1_block_2_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); | ||||
|             vision_model.mm_model_block_1_block_2_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); | ||||
|             vision_model.mm_model_block_1_block_2_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); | ||||
|             vision_model.mm_model_block_2_block_0_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); | ||||
|             vision_model.mm_model_block_2_block_0_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); | ||||
|             vision_model.mm_model_block_2_block_0_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); | ||||
|             vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); | ||||
|             vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); | ||||
|             vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); | ||||
|             vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); | ||||
|             vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); | ||||
|             vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); | ||||
|             vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); | ||||
|         } | ||||
|         else { | ||||
|             vision_model.mm_model_block_2_block_2_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); | ||||
|             vision_model.mm_model_block_2_block_2_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); | ||||
|             vision_model.mm_model_block_2_block_2_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); | ||||
|         } else { | ||||
|             std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; | ||||
|             throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); | ||||
|         } | ||||
|  | ||||
|         vision_model.layers.resize(hparams.n_layer); | ||||
|  | ||||
|         for (int il = 0; il < hparams.n_layer; ++il) { | ||||
|             auto & layer = vision_model.layers[il]; | ||||
|             layer.k_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_K,      "v", il, "weight")); | ||||
| @@ -1084,24 +1264,255 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| // normalize: x = (x - mean) / std | ||||
| // TODO: implement bicubic interpolation instead of linear. | ||||
| bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) { | ||||
| // Linear interpolation between two points | ||||
| inline float lerp(float s, float e, float t) { | ||||
|     return s + (e - s) * t; | ||||
| } | ||||
| // Bilinear resize function | ||||
| static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { | ||||
|     dst.nx = target_width; | ||||
|     dst.ny = target_height; | ||||
|     dst.buf.resize(3 * target_width * target_height); | ||||
|  | ||||
|     float x_ratio = static_cast<float>(src.nx - 1) / target_width; | ||||
|     float y_ratio = static_cast<float>(src.ny - 1) / target_height; | ||||
|  | ||||
|     for (int y = 0; y < target_height; y++) { | ||||
|         for (int x = 0; x < target_width; x++) { | ||||
|             float px = x_ratio * x; | ||||
|             float py = y_ratio * y; | ||||
|             int x_floor = static_cast<int>(px); | ||||
|             int y_floor = static_cast<int>(py); | ||||
|             float x_lerp = px - x_floor; | ||||
|             float y_lerp = py - y_floor; | ||||
|  | ||||
|             for (int c = 0; c < 3; c++) { | ||||
|                 float top = lerp( | ||||
|                     static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]), | ||||
|                     static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), | ||||
|                     x_lerp | ||||
|                 ); | ||||
|                 float bottom = lerp( | ||||
|                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), | ||||
|                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), | ||||
|                     x_lerp | ||||
|                 ); | ||||
|                 dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not | ||||
| static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { | ||||
|     dst->nx = src->nx; | ||||
|     dst->ny = src->ny; | ||||
|     dst->buf.resize(src->buf.size()); | ||||
|  | ||||
|     for (size_t i = 0; i < src->buf.size(); ++i) { | ||||
|         int c = i % 3; // rgb | ||||
|         dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c]; | ||||
|     } | ||||
| } | ||||
|  | ||||
| inline float clip(float x, float lower, float upper) { | ||||
|     return std::max(lower, std::min(x, upper)); | ||||
| } | ||||
|  | ||||
| static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { | ||||
|     const int nx = img.nx; | ||||
|     const int ny = img.ny; | ||||
|  | ||||
|     dst.nx = target_width; | ||||
|     dst.ny = target_height; | ||||
|     dst.buf.resize(3 * target_width * target_height); | ||||
|  | ||||
|     float Cc; | ||||
|     float C[5]; | ||||
|     float d0, d2, d3, a0, a1, a2, a3; | ||||
|     int i, j, k, jj; | ||||
|     int x, y; | ||||
|     float dx, dy; | ||||
|     float tx, ty; | ||||
|  | ||||
|     tx = (float)nx / (float)target_width; | ||||
|     ty = (float)ny / (float)target_height; | ||||
|  | ||||
|     // Bicubic interpolation; adapted from ViT.cpp, inspired from : | ||||
|     //    -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 | ||||
|     //    -> https://en.wikipedia.org/wiki/Bicubic_interpolation | ||||
|  | ||||
|     for (i = 0; i < target_height; i++) { | ||||
|         for (j = 0; j < target_width; j++) { | ||||
|             x = (int)(tx * j); | ||||
|             y = (int)(ty * i); | ||||
|  | ||||
|             dx = tx * j - x; | ||||
|             dy = ty * i - y; | ||||
|  | ||||
|             for (k = 0; k < 3; k++) { | ||||
|                 for (jj = 0; jj <= 3; jj++) { | ||||
|                     d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; | ||||
|                     d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; | ||||
|                     d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; | ||||
|                     a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; | ||||
|  | ||||
|                     a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; | ||||
|                     a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2; | ||||
|                     a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3; | ||||
|  | ||||
|                     C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; | ||||
|  | ||||
|                     d0 = C[0] - C[1]; | ||||
|                     d2 = C[2] - C[1]; | ||||
|                     d3 = C[3] - C[1]; | ||||
|                     a0 = C[1]; | ||||
|                     a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; | ||||
|                     a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2; | ||||
|                     a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3; | ||||
|                     Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; | ||||
|  | ||||
|                     const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); | ||||
|                     dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| // llava-1.6 type of resize_and_pad (black) | ||||
| static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) { | ||||
|     int target_width = target_resolution.first; | ||||
|     int target_height = target_resolution.second; | ||||
|  | ||||
|     float scale_w = static_cast<float>(target_width) / image.nx; | ||||
|     float scale_h = static_cast<float>(target_height) / image.ny; | ||||
|  | ||||
|     int new_width, new_height; | ||||
|  | ||||
|     if (scale_w < scale_h) { | ||||
|         new_width = target_width; | ||||
|         new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height); | ||||
|     } else { | ||||
|         new_height = target_height; | ||||
|         new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width); | ||||
|     } | ||||
|  | ||||
|     clip_image_u8 resized_image; | ||||
|     // bilinear_resize(image, resized_image, new_width, new_height); | ||||
|     bicubic_resize(image, resized_image, new_width, new_height); | ||||
|  | ||||
|     clip_image_u8 padded_image; | ||||
|     padded_image.nx = target_width; | ||||
|     padded_image.ny = target_height; | ||||
|     padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black | ||||
|  | ||||
|     // Calculate padding offsets | ||||
|     int pad_x = (target_width - new_width) / 2; | ||||
|     int pad_y = (target_height - new_height) / 2; | ||||
|  | ||||
|     // Copy the resized image into the center of the padded buffer | ||||
|     for (int y = 0; y < new_height; ++y) { | ||||
|         for (int x = 0; x < new_width; ++x) { | ||||
|             for (int c = 0; c < 3; ++c) { | ||||
|                 padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     image_output = std::move(padded_image); | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * Selects the best resolution from a list of possible resolutions based on the original size. | ||||
|  * | ||||
|  * @param original_size The original size of the image in the format (width, height). | ||||
|  * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. | ||||
|  * @return The best fit resolution in the format (width, height). | ||||
|  */ | ||||
| static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) { | ||||
|     int original_width = original_size.first; | ||||
|     int original_height = original_size.second; | ||||
|     std::pair<int, int> best_fit; | ||||
|     int max_effective_resolution = 0; | ||||
|     int min_wasted_resolution = std::numeric_limits<int>::max(); | ||||
|  | ||||
|     for (const auto& resolution : possible_resolutions) { | ||||
|         int width = resolution.first; | ||||
|         int height = resolution.second; | ||||
|         float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height); | ||||
|         int downscaled_width = static_cast<int>(original_width * scale); | ||||
|         int downscaled_height = static_cast<int>(original_height * scale); | ||||
|         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); | ||||
|         int wasted_resolution = (width * height) - effective_resolution; | ||||
|         // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); | ||||
|         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { | ||||
|             max_effective_resolution = effective_resolution; | ||||
|             min_wasted_resolution = wasted_resolution; | ||||
|             best_fit = resolution; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return best_fit; | ||||
| } | ||||
|  | ||||
| static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) { | ||||
|     std::vector<clip_image_u8*> patches; | ||||
|     int width = image.nx; | ||||
|     int height = image.ny; | ||||
|     for (int i = 0; i < height; i += patch_size) { | ||||
|         for (int j = 0; j < width; j += patch_size) { | ||||
|             clip_image_u8 *patch = clip_image_u8_init(); | ||||
|             patch->nx = std::min(patch_size, width - j); | ||||
|             patch->ny = std::min(patch_size, height - i); | ||||
|             patch->buf.resize(3 * patch->nx * patch->ny); | ||||
|             for (int y = 0; y < patch->ny; ++y) { | ||||
|                 for (int x = 0; x < patch->nx; ++x) { | ||||
|                     for (int c = 0; c < 3; ++c) { | ||||
|                         patch->buf[3 * (y * patch->nx + x) + c] = image.buf[3 * ((i + y) * width + (j + x)) + c]; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             patches.push_back(patch); | ||||
|         } | ||||
|     } | ||||
|     return patches; | ||||
| } | ||||
|  | ||||
| // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector | ||||
| // res_imgs memory is being allocated here, previous allocations will be freed if found | ||||
| bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { | ||||
|     bool pad_to_square = true; | ||||
|     if (!ctx->has_vision_encoder) { | ||||
|         printf("This gguf file seems to have no vision encoder\n"); | ||||
|         return false; | ||||
|     } | ||||
|     auto & params = ctx->vision_model.hparams; | ||||
|     // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing | ||||
|     if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) { | ||||
|         pad_to_square = false; | ||||
|     } | ||||
|     // free the previous res_imgs if any set | ||||
|     if (res_imgs.size > 0 && res_imgs.size < 100) { | ||||
|         for (size_t i = 0; i < res_imgs.size; i++) { | ||||
|             clip_image_f32_free(&(res_imgs.data[i])); | ||||
|         } | ||||
|         delete[] res_imgs.data; | ||||
|     } | ||||
|     res_imgs.data = nullptr; | ||||
|     res_imgs.size = 0; | ||||
|  | ||||
|     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) | ||||
|     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 | ||||
|  | ||||
|     clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily | ||||
|     if (pad2square && img->nx != img->ny) { | ||||
|     if (pad_to_square && img->nx != img->ny) { | ||||
|         int longer_side = std::max(img->nx, img->ny); | ||||
|         temp->nx = longer_side; | ||||
|         temp->ny = longer_side; | ||||
|         temp->buf.resize(3 * longer_side * longer_side); | ||||
|         const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA | ||||
|         const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255) | ||||
|  | ||||
|         // fill with background color | ||||
|         for (size_t i = 0; i < temp->buf.size(); i++) { | ||||
| @@ -1119,18 +1530,63 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli | ||||
|             } | ||||
|         } | ||||
|     } else { | ||||
|         temp->nx = img->nx; | ||||
|         temp->ny = img->ny; | ||||
|         temp->buf.resize(img->buf.size()); | ||||
|         memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); | ||||
|         if (params.image_grid_pinpoints[0] != 0) { | ||||
|             // "spatial_unpad" with "anyres" processing for llava-1.6 | ||||
|             std::vector<std::pair<int, int>> possible_resolutions; | ||||
|             for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { | ||||
|                 possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); | ||||
|             } | ||||
|             std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); | ||||
|             // clip_image_save_to_bmp(*img, "input.bmp"); | ||||
|             resize_and_pad_image(*img, *temp, best_resolution);  // we do not pad with mean-bg color anymore in llava-1.6 | ||||
|             // clip_image_save_to_bmp(*temp, "resized.bmp"); | ||||
|             // visually verify normalized image: | ||||
|             // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); | ||||
|             // { | ||||
|             //     clip_image_u8 * temp2 = clip_image_u8_init(); | ||||
|             //     clip_image_convert_f32_to_u8(*res, *temp2); | ||||
|             //     clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp"); | ||||
|             //     clip_image_u8_free(temp2); | ||||
|             // } | ||||
|  | ||||
|             std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6) | ||||
|  | ||||
|             clip_image_u8 *image_original_resize = clip_image_u8_init(); | ||||
|             // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square | ||||
|             bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square | ||||
|             patches.insert(patches.begin(), image_original_resize); | ||||
|             // clip_image_f32_batch_init(patches.size()); | ||||
|             res_imgs.size = patches.size(); | ||||
|             res_imgs.data = new clip_image_f32[res_imgs.size]; | ||||
|             int num=0; | ||||
|             for (auto& patch : patches) { | ||||
|                 normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std); | ||||
|                 num++; | ||||
|             } | ||||
|  | ||||
|             for (size_t i = 0; i < patches.size(); i++) { | ||||
|                 // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); | ||||
|                 clip_image_u8_free(patches[i]); | ||||
|             } | ||||
|  | ||||
|             clip_image_u8_free(temp); | ||||
|  | ||||
|             return true; | ||||
|         } else { | ||||
|             temp->nx = img->nx; | ||||
|             temp->ny = img->ny; | ||||
|             temp->buf.resize(img->buf.size()); | ||||
|             memcpy(temp->buf.data(), img->buf.data(), temp->buf.size()); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     const int nx = temp->nx; | ||||
|     const int ny = temp->ny; | ||||
|     // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); | ||||
|  | ||||
|     const int nx2 = ctx->vision_model.hparams.image_size; | ||||
|     const int ny2 = ctx->vision_model.hparams.image_size; | ||||
|  | ||||
|     clip_image_f32 * res = clip_image_f32_init(); | ||||
|     res->nx = nx2; | ||||
|     res->ny = ny2; | ||||
|     res->buf.resize(3 * nx2 * ny2); | ||||
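The anyres branch earlier in this hunk turns one input image into a small batch: the image is resized and padded to the best grid resolution, cut into image_size x image_size tiles, and a bicubic downscale of the full image is prepended at index 0. A minimal Python sketch of the resulting batch size, assuming image_size = 336 and a selected grid resolution of 672x672 (values taken from the llava-1.6 defaults elsewhere in this diff):

```python
# Sketch only: how many clip_image_f32 entries the anyres path emits.
image_size      = 336                       # params.image_size for llava-1.6
best_resolution = (672, 672)                # as returned by select_best_resolution()

grid_cols = best_resolution[0] // image_size
grid_rows = best_resolution[1] // image_size
n_tiles   = grid_cols * grid_rows           # divide_to_patches_u8() output: 2 * 2 = 4

res_imgs_size = n_tiles + 1                 # +1 for the bicubic-resized full image at index 0
print(res_imgs_size)                        # 5
```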
| @@ -1184,9 +1640,25 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli | ||||
|     } | ||||
|     clip_image_u8_free(temp); | ||||
|  | ||||
|     // { | ||||
|     //     clip_image_u8 * temp2 = clip_image_u8_init(); | ||||
|     //     clip_image_convert_f32_to_u8(*res, *temp2); | ||||
|     //     clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp"); | ||||
|     //     clip_image_u8_free(temp2); | ||||
|     // } | ||||
|     // res_imgs.push_back(res); | ||||
|  | ||||
|     res_imgs.size = 1; | ||||
|     res_imgs.data = new clip_image_f32[res_imgs.size]; | ||||
|     res_imgs.data[0] = std::move(*res); | ||||
|  | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { | ||||
|     return ctx->vision_model.image_newline; | ||||
| } | ||||
|  | ||||
| void clip_free(clip_ctx * ctx) { | ||||
|     ggml_free(ctx->ctx_data); | ||||
|     gguf_free(ctx->ctx_gguf); | ||||
| @@ -1194,6 +1666,42 @@ void clip_free(clip_ctx * ctx) { | ||||
|     delete ctx; | ||||
| } | ||||
|  | ||||
| size_t clip_embd_nbytes(const struct clip_ctx * ctx) { | ||||
|     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); | ||||
| } | ||||
|  | ||||
| int32_t clip_image_size(const struct clip_ctx * ctx) { | ||||
|     return ctx->vision_model.hparams.image_size; | ||||
| } | ||||
|  | ||||
| int32_t clip_patch_size(const struct clip_ctx * ctx) { | ||||
|     return ctx->vision_model.hparams.patch_size; | ||||
| } | ||||
|  | ||||
| int32_t clip_hidden_size(const struct clip_ctx * ctx) { | ||||
|     return ctx->vision_model.hparams.hidden_size; | ||||
| } | ||||
|  | ||||
| const char * clip_patch_merge_type(const struct clip_ctx * ctx) { | ||||
|     return ctx->vision_model.hparams.mm_patch_merge_type; | ||||
| } | ||||
|  | ||||
| const int32_t * clip_image_grid(const struct clip_ctx * ctx) { | ||||
|     return ctx->vision_model.hparams.image_grid_pinpoints; | ||||
| } | ||||
|  | ||||
| int clip_n_patches(const struct clip_ctx * ctx) { | ||||
|     const auto & params = ctx->vision_model.hparams; | ||||
|  | ||||
|     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); | ||||
|  | ||||
|     if (ctx->proj_type == PROJECTOR_TYPE_LDP) { | ||||
|         n_patches /= 4; | ||||
|     } | ||||
|  | ||||
|     return n_patches; | ||||
| } | ||||
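The getters above reduce to simple arithmetic. A hedged sketch with the llava-1.6 ViT-L/14 at 336px numbers; the 4096-dim projector output is assumed here and corresponds to what clip_n_mmproj_embd() returns for the MLP projector:

```python
# Sketch of the sizes behind clip_n_patches() / clip_embd_nbytes().
image_size = 336
patch_size = 14
n_mmproj   = 4096                                  # assumed MLP projector output dim

n_patches   = (image_size // patch_size) ** 2      # 24 * 24 = 576
embd_nbytes = n_patches * n_mmproj * 4             # float32: 576 * 4096 * 4 = 9437184 bytes
print(n_patches, embd_nbytes)
```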
|  | ||||
| bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { | ||||
|     if (!ctx->has_vision_encoder) { | ||||
|         printf("This gguf file seems to have no vision encoder\n"); | ||||
| @@ -1213,7 +1721,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima | ||||
|     } | ||||
|  | ||||
|     int batch_size = imgs->size; | ||||
|     if(ctx->has_llava_projector) { | ||||
|     if (ctx->has_llava_projector) { | ||||
|         GGML_ASSERT(batch_size == 1); // TODO: support multiple images | ||||
|     } | ||||
|  | ||||
| @@ -1224,9 +1732,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima | ||||
|     // set inputs | ||||
|     const auto & model = ctx->vision_model; | ||||
|     const auto & hparams = model.hparams; | ||||
|     const int image_size = hparams.image_size; | ||||
|     const int patch_size = hparams.patch_size; | ||||
|     const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); | ||||
|  | ||||
|     const int image_size    = hparams.image_size; | ||||
|     const int patch_size    = hparams.patch_size; | ||||
|     const int num_patches   = ((image_size / patch_size) * (image_size / patch_size)); | ||||
|     const int num_positions = num_patches + 1; | ||||
|  | ||||
|     { | ||||
| @@ -1301,11 +1810,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima | ||||
|  | ||||
|     // copy the embeddings to the location passed by the user | ||||
|     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); | ||||
|  | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { | ||||
|  | ||||
|     ggml_type type = GGML_TYPE_Q4_1; | ||||
|  | ||||
|     assert(itype < GGML_TYPE_COUNT); | ||||
| @@ -1494,26 +2003,13 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { | ||||
|     if (ctx->proj_type == PROJECTOR_TYPE_LDP) { | ||||
|         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; | ||||
|     } | ||||
|     else if (ctx->proj_type == PROJECTOR_TYPE_MLP) { | ||||
|     if (ctx->proj_type == PROJECTOR_TYPE_MLP) { | ||||
|         return ctx->vision_model.mm_2_b->ne[0]; | ||||
|     } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { | ||||
|     } | ||||
|     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { | ||||
|         return ctx->vision_model.mm_3_b->ne[0]; | ||||
|     } | ||||
|     else { | ||||
|         std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; | ||||
|         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); | ||||
|     } | ||||
| } | ||||
|  | ||||
| int clip_n_patches(const struct clip_ctx * ctx) { | ||||
|     auto & params = ctx->vision_model.hparams; | ||||
|     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); | ||||
|     if (ctx->proj_type == PROJECTOR_TYPE_LDP) { | ||||
|         n_patches /= 4; | ||||
|     } | ||||
|     return n_patches; | ||||
| } | ||||
|  | ||||
| size_t clip_embd_nbytes(const struct clip_ctx * ctx) { | ||||
|     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); | ||||
|     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; | ||||
|     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); | ||||
| } | ||||
|   | ||||
| @@ -24,25 +24,7 @@ struct clip_ctx; | ||||
| extern "C" { | ||||
| #endif | ||||
|  | ||||
| struct clip_vision_hparams { | ||||
|     int32_t image_size; | ||||
|     int32_t patch_size; | ||||
|     int32_t hidden_size; | ||||
|     int32_t n_intermediate; | ||||
|     int32_t projection_dim; | ||||
|     int32_t n_head; | ||||
|     int32_t n_layer; | ||||
|     float eps; | ||||
| }; | ||||
|  | ||||
| CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); | ||||
|  | ||||
| CLIP_API void clip_free(struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API int clip_n_patches    (const struct clip_ctx * ctx); | ||||
| CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); | ||||
| struct clip_ctx; | ||||
|  | ||||
| struct clip_image_u8_batch { | ||||
|     struct clip_image_u8 * data; | ||||
| @@ -54,10 +36,29 @@ struct clip_image_f32_batch { | ||||
|     size_t size; | ||||
| }; | ||||
|  | ||||
| CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity); | ||||
| CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); | ||||
|  | ||||
| CLIP_API void clip_free(struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); | ||||
| CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); | ||||
| CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); | ||||
|  | ||||
| // TODO: should be enum, not string | ||||
| CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API int clip_n_patches    (const struct clip_ctx * ctx); | ||||
| CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API struct clip_image_u8  * clip_image_u8_init (); | ||||
| CLIP_API struct clip_image_f32 * clip_image_f32_init(); | ||||
|  | ||||
| CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); | ||||
| CLIP_API void clip_image_u8_free (struct clip_image_u8  * img); | ||||
| CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); | ||||
|  | ||||
| CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); | ||||
| @@ -65,7 +66,11 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 | ||||
| /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ | ||||
| CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); | ||||
|  | ||||
| CLIP_API bool clip_image_preprocess  (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square); | ||||
| /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ | ||||
| CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); | ||||
|  | ||||
| CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); | ||||
|  | ||||
| CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); | ||||
| CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); | ||||
|  | ||||
|   | ||||
| @@ -78,18 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False, | ||||
|                 help="Save a text-only model. It can't be used to encode images") | ||||
| ap.add_argument("--vision-only", action="store_true", required=False, | ||||
|                 help="Save a vision-only model. It can't be used to encode texts") | ||||
| ap.add_argument("--clip_model_is_vision", action="store_true", required=False, | ||||
| ap.add_argument("--clip-model-is-vision", action="store_true", required=False, | ||||
|                 help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") | ||||
| ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, | ||||
|                 help="The clip model is from openclip (for ViT-SO400M type))") | ||||
| ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") | ||||
| ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") | ||||
| ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") | ||||
| ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") | ||||
| ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) | ||||
| # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 | ||||
| # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 | ||||
| default_image_mean = [0.48145466, 0.4578275, 0.40821073] | ||||
| default_image_std = [0.26862954, 0.26130258, 0.27577711] | ||||
| ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) | ||||
| ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) | ||||
| ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) | ||||
| ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) | ||||
|  | ||||
| # with proper | ||||
| args = ap.parse_args() | ||||
| @@ -105,7 +106,7 @@ if args.use_f32: | ||||
| # output in the same directory as the model if output_dir is None | ||||
| dir_model = args.model_dir | ||||
|  | ||||
| if args.clip_model_is_vision: | ||||
| if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: | ||||
|     vocab = None | ||||
|     tokens = None | ||||
| else: | ||||
| @@ -133,7 +134,7 @@ ftype = 1 | ||||
| if args.use_f32: | ||||
|     ftype = 0 | ||||
|  | ||||
| if args.clip_model_is_vision: | ||||
| if args.clip_model_is_vision or args.clip_model_is_openclip: | ||||
|     model = CLIPVisionModel.from_pretrained(dir_model) | ||||
|     processor = None | ||||
| else: | ||||
| @@ -202,6 +203,57 @@ if has_vision_encoder: | ||||
|     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) | ||||
|     block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] | ||||
|     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) | ||||
|                             #     /** | ||||
|                             #      "image_grid_pinpoints": [ | ||||
|                             #         [ | ||||
|                             #         336, | ||||
|                             #         672 | ||||
|                             #         ], | ||||
|                             #         [ | ||||
|                             #         672, | ||||
|                             #         336 | ||||
|                             #         ], | ||||
|                             #         [ | ||||
|                             #         672, | ||||
|                             #         672 | ||||
|                             #         ], | ||||
|                             #         [ | ||||
|                             #         1008, | ||||
|                             #         336 | ||||
|                             #         ], | ||||
|                             #         [ | ||||
|                             #         336, | ||||
|                             #         1008 | ||||
|                             #         ] | ||||
|                             #     ], | ||||
|                             #     Flattened: | ||||
|                             #     [ | ||||
|                             #         336, 672, | ||||
|                             #         672, 336, | ||||
|                             #         672, 672, | ||||
|                             #         1008, 336, | ||||
|                             #         336, 1008 | ||||
|                             #     ] | ||||
|                             #  * | ||||
|                             #  */ | ||||
|     if "image_grid_pinpoints" in v_hparams: | ||||
|         # flatten it | ||||
|         image_grid_pinpoints = [] | ||||
|         for pinpoint in v_hparams["image_grid_pinpoints"]: | ||||
|             for p in pinpoint: | ||||
|                 image_grid_pinpoints.append(p) | ||||
|         fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) | ||||
|     if "image_crop_resolution" in v_hparams: | ||||
|         fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) | ||||
|     if "image_aspect_ratio" in v_hparams: | ||||
|         fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) | ||||
|     if "image_split_resolution" in v_hparams: | ||||
|         fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) | ||||
|     if "mm_patch_merge_type" in v_hparams: | ||||
|         fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) | ||||
|     if "mm_projector_type" in v_hparams: | ||||
|         fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) | ||||
|  | ||||
|  | ||||
|     if processor is not None: | ||||
|         image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean | ||||
|   | ||||
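The converter flattens the nested image_grid_pinpoints list from the HF config into a flat GGUF array, and the C++ side later reads it back two values at a time (see the decoding loop in clip.cpp). A round-trip sketch, assuming the array is zero-padded to 32 entries as the C++ loop condition implies:

```python
# Sketch: round-tripping clip.vision.image_grid_pinpoints between the nested HF
# form and the flattened GGUF array (values from the llava-1.6 config shown above).
nested = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]

# converter side: flatten
flat = [p for pinpoint in nested for p in pinpoint]
print(flat)                                   # [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]

# clip side: read back as (width, height) pairs, stopping at a 0 terminator (assumed padding)
padded = flat + [0] * (32 - len(flat))
pairs  = []
for i in range(0, 32, 2):
    if padded[i] == 0:
        break
    pairs.append((padded[i], padded[i + 1]))
print(pairs == [tuple(p) for p in nested])    # True
```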
| @@ -155,11 +155,29 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ | ||||
|         system_prompt = prompt.substr(0, image_pos); | ||||
|         user_prompt = prompt.substr(image_pos + std::string("<image>").length()); | ||||
|         printf("system_prompt: %s\n", system_prompt.c_str()); | ||||
|         if (params->verbose_prompt) { | ||||
|             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); | ||||
|             for (int i = 0; i < (int) tmp.size(); i++) { | ||||
|                 printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); | ||||
|             } | ||||
|         } | ||||
|         printf("user_prompt: %s\n", user_prompt.c_str()); | ||||
|         if (params->verbose_prompt) { | ||||
|             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); | ||||
|             for (int i = 0; i < (int) tmp.size(); i++) { | ||||
|                 printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); | ||||
|             } | ||||
|         } | ||||
|     } else { | ||||
|         // llava-1.5 native mode | ||||
|         system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; | ||||
|         user_prompt = prompt + "\nASSISTANT:"; | ||||
|         if (params->verbose_prompt) { | ||||
|             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); | ||||
|             for (int i = 0; i < (int) tmp.size(); i++) { | ||||
|                 printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); | ||||
| @@ -171,13 +189,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ | ||||
|     fprintf(stderr, "\n"); | ||||
|  | ||||
|     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); | ||||
|  | ||||
|     std::string response = ""; | ||||
|     for (int i = 0; i < max_tgt_len; i++) { | ||||
|         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); | ||||
|         response += tmp; | ||||
|         if (strcmp(tmp, "</s>") == 0) break; | ||||
|         if (strstr(tmp, "###")) break; // Yi-VL behavior | ||||
|  | ||||
|         printf("%s", tmp); | ||||
|         if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) | ||||
|         if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 | ||||
|         if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 | ||||
|  | ||||
|         fflush(stdout); | ||||
|     } | ||||
|  | ||||
|   | ||||
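The generation loop above stops on a handful of literal markers: "</s>" and "###" are checked against the latest sampled piece, while the llava-1.6 / Yi-34B markers are checked against the accumulated response. A simplified Python sketch of that check (the helper name is made up for illustration):

```python
# Simplified sketch of the stop conditions in the CLI generation loop above.
PIECE_STOPS    = ["</s>", "###"]                          # checked on the latest piece
RESPONSE_STOPS = ["<|im_end|>", "<|im_start|>", "USER:"]  # checked on the accumulated response

def should_stop(piece: str, response: str) -> bool:      # hypothetical helper
    if any(s in piece for s in PIECE_STOPS):
        return True
    return any(s in response for s in RESPONSE_STOPS)

print(should_stop("Hello", "Hello"))                   # False
print(should_stop(" world", "Hello world<|im_end|>"))  # True
```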
							
								
								
									
examples/llava/llava-surgery-v2.py  (167 lines, Normal file)
							| @@ -0,0 +1,167 @@ | ||||
| import argparse | ||||
| import glob | ||||
| import os | ||||
| import torch | ||||
| from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file | ||||
|  | ||||
| # Function to determine if file is a SafeTensor file | ||||
| def is_safetensor_file(file_path): | ||||
|     return file_path.endswith('.safetensors') | ||||
|  | ||||
|  | ||||
| # Unified loading function | ||||
| def load_model(file_path): | ||||
|     if is_safetensor_file(file_path): | ||||
|         tensors = {} | ||||
|         with safe_open(file_path, framework="pt", device="cpu") as f: | ||||
|             for key in f.keys(): | ||||
|                 tensors[key] = f.get_tensor(key).clone() | ||||
|                 # output shape | ||||
|                 print(f"{key} : {tensors[key].shape}") | ||||
|         return tensors, 'safetensor' | ||||
|     else: | ||||
|         return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' | ||||
|  | ||||
|  | ||||
| # Unified saving function | ||||
| def save_model(model, file_path, file_type): | ||||
|     if file_type == 'safetensor': | ||||
|         # safe_save(model, file_path) | ||||
|         save_file(model, file_path) | ||||
|     else: | ||||
|         torch.save(model, file_path) | ||||
|  | ||||
|  | ||||
| # Adapted function to clean vision tower from checkpoint | ||||
| def clean_vision_tower_from_checkpoint(checkpoint_path): | ||||
|     checkpoint, file_type = load_model(checkpoint_path) | ||||
|     # file_type = 'pytorch' | ||||
|     model_path = os.path.dirname(checkpoint_path) | ||||
|     print(f"Searching for vision tower tensors in {checkpoint_path}") | ||||
|     clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))] | ||||
|  | ||||
|     if len(clip_tensors) > 0: | ||||
|         print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") | ||||
|         # Adapted for file type | ||||
|         clip_path = os.path.join(model_path, "llava.clip") | ||||
|  | ||||
|         if os.path.exists(clip_path): | ||||
|             print(f"Loading existing llava.clip from {clip_path}") | ||||
|             existing_clip, _ = load_model(clip_path) | ||||
|         else: | ||||
|             print(f"Creating new llava.clip at {clip_path}") | ||||
|             existing_clip = {} | ||||
|         # Update existing_clip with new tensors, avoid duplicates | ||||
|         for name in clip_tensors: | ||||
|             simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name | ||||
|             print(f"Adding {simple_name} to llava.clip") | ||||
|             if simple_name not in existing_clip: | ||||
|                 existing_clip[simple_name] = checkpoint[name] | ||||
|  | ||||
|         # Save the updated clip tensors back to llava.clip | ||||
|         save_model(existing_clip, clip_path, 'pytorch') | ||||
|  | ||||
|         # Remove the tensors from the original checkpoint | ||||
|         for name in clip_tensors: | ||||
|             del checkpoint[name] | ||||
|  | ||||
|         # Save the updated checkpoint | ||||
|         checkpoint_path = checkpoint_path | ||||
|         save_model(checkpoint, checkpoint_path, file_type) | ||||
|         return True | ||||
|     return False | ||||
|  | ||||
| def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): | ||||
|     newline_checkpoint_path = None | ||||
|     projector_checkpoint_path = None | ||||
|  | ||||
|     for path in checkpoint_paths: | ||||
|         checkpoint, _ = load_model(path) | ||||
|         if newline_criteria(checkpoint) and newline_checkpoint_path is None: | ||||
|             newline_checkpoint_path = path | ||||
|         if projector(checkpoint): | ||||
|             projector_checkpoint_path = path | ||||
|  | ||||
|     return newline_checkpoint_path, projector_checkpoint_path | ||||
|  | ||||
| def newline_criteria(checkpoint): | ||||
|     return any(k.startswith("model.image_newline") for k in checkpoint.keys()) | ||||
|  | ||||
| def proj_criteria(checkpoint): | ||||
|     return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) | ||||
|  | ||||
|  | ||||
| # Command-line interface setup | ||||
| ap = argparse.ArgumentParser() | ||||
| ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") | ||||
| ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") | ||||
| args = ap.parse_args() | ||||
|  | ||||
| if args.clean_vision_tower: | ||||
|     # Generalized to handle both PyTorch and SafeTensors models | ||||
|     model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) | ||||
|     # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] | ||||
|     checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] | ||||
|     for projector_checkpoint_path in checkpoint_paths: | ||||
|         print(f"Cleaning {projector_checkpoint_path}") | ||||
|         if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): | ||||
|             print(f"No vision tower found in {projector_checkpoint_path}") | ||||
|             # we break once none is found, so far all models append them at the end | ||||
|             # break | ||||
|     print("Done! All vision tower tensors are removed from the model files and stored in llava.clip file.") | ||||
|  | ||||
| # Now we look for the projector in the last checkpoint | ||||
| model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) | ||||
| checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] | ||||
| # last_checkpoint_path = checkpoint_paths[0] | ||||
| # first_checkpoint_path = checkpoint_paths[-1] | ||||
| newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) | ||||
|  | ||||
| print(f"Taking projector from {projector_checkpoint_path}") | ||||
| first_mm_tensors = [] | ||||
| first_checkpoint = None | ||||
| if newline_checkpoint_path is not None: | ||||
|     print(f"Taking newline from {newline_checkpoint_path}") | ||||
|     first_checkpoint, file_type = load_model(newline_checkpoint_path) | ||||
|     first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] | ||||
|  | ||||
| # Load the checkpoint | ||||
| mm_tensors = [] | ||||
| last_checkpoint = None | ||||
| if projector_checkpoint_path is not None: | ||||
|     last_checkpoint, file_type = load_model(projector_checkpoint_path) | ||||
|     mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] | ||||
|  | ||||
| if len(mm_tensors) == 0: | ||||
|     if last_checkpoint is not None: | ||||
|         for k, v in last_checkpoint.items(): | ||||
|             print(k) | ||||
|     print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") | ||||
|     print("No tensors found. Is this a LLaVA model?") | ||||
|     exit() | ||||
|  | ||||
| print(f"Found {len(mm_tensors)} tensors to extract.") | ||||
| print(f"Found additional {len(first_mm_tensors)} tensors to extract.") | ||||
| # projector = {name: checkpoint.[name].float() for name in mm_tensors} | ||||
| projector = {} | ||||
| for name in mm_tensors: | ||||
|     projector[name] = last_checkpoint[name].float() | ||||
| for name in first_mm_tensors: | ||||
|     projector[name] = first_checkpoint[name].float() | ||||
|  | ||||
| if len(projector) > 0: | ||||
|     save_model(projector, f"{args.model}/llava.projector", 'pytorch') | ||||
|  | ||||
| for name in mm_tensors: | ||||
|     del last_checkpoint[name] | ||||
| for name in first_mm_tensors: | ||||
|     del first_checkpoint[name] | ||||
|  | ||||
| if len(mm_tensors) > 0: | ||||
|     save_model(last_checkpoint, projector_checkpoint_path, file_type) | ||||
| if len(first_mm_tensors) > 0: | ||||
|     save_model(first_checkpoint, newline_checkpoint_path, file_type) | ||||
|  | ||||
| print("Done!") | ||||
| print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") | ||||
| print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") | ||||
| @@ -2,32 +2,296 @@ | ||||
| #include "common.h" | ||||
| #include "llama.h" | ||||
| #include "llava.h" | ||||
| #include "base64.hpp" | ||||
|  | ||||
| #include <cstdio> | ||||
| #include <cstdlib> | ||||
| #include <vector> | ||||
| #include <numeric> | ||||
|  | ||||
| // RGB uint8 image | ||||
| struct clip_image_u8 { | ||||
|     int nx; | ||||
|     int ny; | ||||
|  | ||||
|     std::vector<uint8_t> buf; | ||||
| }; | ||||
|  | ||||
| // RGB float32 image (NHWC) | ||||
| // Memory layout: RGBRGBRGB... | ||||
| struct clip_image_f32 { | ||||
|     int nx; | ||||
|     int ny; | ||||
|  | ||||
|     std::vector<float> buf; | ||||
| }; | ||||
|  | ||||
| struct clip_image_grid_shape { | ||||
|     int first; | ||||
|     int second; | ||||
| }; | ||||
|  | ||||
| /** | ||||
|  * Selects the best resolution from a list of possible resolutions based on the original size. | ||||
|  * | ||||
|  * @param original_size The original size of the image in the format (width, height). | ||||
|  * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. | ||||
|  * @return The best fit resolution in the format (width, height). | ||||
|  */ | ||||
| static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) { | ||||
|     int original_width  = original_size.first; | ||||
|     int original_height = original_size.second; | ||||
|  | ||||
|     std::pair<int, int> best_fit; | ||||
|     int max_effective_resolution = 0; | ||||
|     int min_wasted_resolution = std::numeric_limits<int>::max(); | ||||
|  | ||||
|     for (const auto& resolution : possible_resolutions) { | ||||
|         int width = resolution.first; | ||||
|         int height = resolution.second; | ||||
|         float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height); | ||||
|         int downscaled_width  = static_cast<int>(original_width * scale); | ||||
|         int downscaled_height = static_cast<int>(original_height * scale); | ||||
|         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); | ||||
|         int wasted_resolution = (width * height) - effective_resolution; | ||||
|         // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); | ||||
|         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { | ||||
|             max_effective_resolution = effective_resolution; | ||||
|             min_wasted_resolution = wasted_resolution; | ||||
|             best_fit = resolution; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return best_fit; | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * @brief Get the anyres image grid shape object | ||||
|  * | ||||
|  * @param image_size | ||||
|  * @param grid_pinpoints | ||||
|  * @param image_patch_size | ||||
|  * @return <int, int> | ||||
|  */ | ||||
| static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) { | ||||
|     /** | ||||
|         Conversion from gguf flat array to vector: | ||||
|         std::vector<std::pair<int, int>> possible_resolutions; | ||||
|         for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { | ||||
|             possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); | ||||
|         } | ||||
|      */ | ||||
|     auto best_resolution = select_best_resolution(image_size, grid_pinpoints); | ||||
|     return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size}; | ||||
| } | ||||
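For reference, a rough Python port of the scoring above: maximize the effective (downscaled) resolution, then minimize wasted area, and derive the anyres grid shape by dividing by the patch resolution as get_anyres_image_grid_shape() does. The sample pinpoints are the llava-1.6 defaults listed in the converter comments:

```python
# Rough Python port of select_best_resolution() plus the grid-shape division.
def select_best_resolution(original, candidates):
    ow, oh = original
    best, max_eff, min_waste = None, 0, float("inf")
    for w, h in candidates:
        scale = min(w / ow, h / oh)
        eff   = min(int(ow * scale) * int(oh * scale), ow * oh)
        waste = w * h - eff
        if eff > max_eff or (eff == max_eff and waste < min_waste):
            best, max_eff, min_waste = (w, h), eff, waste
    return best

pinpoints = [(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]
best = select_best_resolution((640, 480), pinpoints)
print(best)                                  # (672, 672)
print(best[0] // 336, best[1] // 336)        # anyres grid shape: 2 2
```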
|  | ||||
| // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) | ||||
| static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { | ||||
|     struct { | ||||
|         struct ggml_tensor * newline; | ||||
|         struct ggml_context * ctx; | ||||
|     } model; | ||||
|  | ||||
|     const int32_t image_size = clip_image_size(ctx_clip); | ||||
|     const int32_t patch_size = clip_patch_size(ctx_clip); | ||||
|  | ||||
|     int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) | ||||
|  | ||||
|     int num_patches_width  = grid_shape.first;  // grid 1-4 | ||||
|     int num_patches_height = grid_shape.second; // grid 1-4 | ||||
|  | ||||
|     const size_t num_images = num_patches_width + num_patches_height + 1; | ||||
|  | ||||
|     // TODO: the required context size is not calculated precisely - it's only tens of MB | ||||
|     size_t ctx_size = 0; | ||||
|  | ||||
|     { | ||||
|         ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features | ||||
|         ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); | ||||
|     } | ||||
|  | ||||
|     struct ggml_init_params params { | ||||
|         /*.mem_size   =*/ ctx_size, | ||||
|         /*.mem_buffer =*/ NULL, | ||||
|         /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API | ||||
|     }; | ||||
|  | ||||
|     // Python reference code for full unpad: | ||||
|     /* | ||||
|         base_image_feature = image_feature[0] | ||||
|         image_feature = image_feature[1:] | ||||
|         image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() | ||||
|         image_feature = image_feature.flatten(1, 2).flatten(2, 3) | ||||
|         image_feature = unpad_image(image_feature, image_sizes[image_idx]) | ||||
|         image_feature = torch.cat(( | ||||
|             image_feature, | ||||
|             self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1) | ||||
|         ), dim=-1) | ||||
|         image_feature = image_feature.flatten(1, 2).transpose(0, 1) | ||||
|         image_feature = torch.cat((base_image_feature, image_feature), dim=0) | ||||
|     */ | ||||
|     // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval. | ||||
|     // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet. | ||||
|     // Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them. | ||||
|     // Once all images are processed, the base_image_features are prepended without any changes. | ||||
|  | ||||
|     // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling)) | ||||
|     /* | ||||
|         image_feature = image_feature.view(2, 2, 24, 24, 4096) | ||||
|         image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() | ||||
|         image_feature = image_feature.view(2, 24, 2, 24, 4096) | ||||
|         image_feature = image_feature.flatten(0, 3) | ||||
|  | ||||
|         // Reshape to 4D tensor by merging the last two dimensions | ||||
|         image_feature = image_feature.view(2, 2, 24, 24*4096) | ||||
|         image_feature = image_feature.permute(0, 2, 1, 3).contiguous() | ||||
|         image_feature = image_feature.view(-1, 4096) | ||||
|     */ | ||||
|  | ||||
|     model.ctx = ggml_init(params); | ||||
|  | ||||
|     ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip); | ||||
|     model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); | ||||
|     if (newline_tmp->backend != GGML_BACKEND_CPU) { | ||||
|         if (newline_tmp->buffer == NULL) { | ||||
|             printf("newline_tmp tensor buffer is NULL\n"); | ||||
|         } | ||||
|         ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); | ||||
|     } else { | ||||
|         model.newline->data = newline_tmp->data; | ||||
|         if (model.newline->data == NULL) { | ||||
|             printf("newline_tmp tensor data is NULL\n"); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 | ||||
|     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); | ||||
|     // fill it with the image embeddings, ignoring the base | ||||
|     for (size_t i = 1; i < num_images; i++) { | ||||
|         size_t offset = (i-1) * clip_embd_nbytes(ctx_clip); | ||||
|         memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip)); | ||||
|     } | ||||
|  | ||||
|     struct ggml_cgraph  * gf = ggml_new_graph(model.ctx); | ||||
|     size_t size_ele = ggml_type_size(GGML_TYPE_F32); | ||||
|  | ||||
|     struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, | ||||
|                                                                 num_patches_per_side * clip_n_mmproj_embd(ctx_clip), | ||||
|                                                                 num_patches_per_side, | ||||
|                                                                 num_patches_width, | ||||
|                                                                 num_patches_height, | ||||
|                                                                 size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip), | ||||
|                                                                 size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side, | ||||
|                                                                 size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0); | ||||
|     // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false); | ||||
|     struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3)); | ||||
|     /** | ||||
|      At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings | ||||
|          image_feature = torch.cat(( | ||||
|         image_feature, | ||||
|         self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device) | ||||
|     ), dim=-1) | ||||
|      * | ||||
|      */ | ||||
|  | ||||
|     // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false); | ||||
|     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,  size_ele * clip_n_mmproj_embd(ctx_clip), 0); | ||||
|     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); | ||||
|     ggml_build_forward_expand(gf, flatten); | ||||
|     ggml_graph_compute_with_ctx(model.ctx, gf, 1); | ||||
|     struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; | ||||
|  | ||||
|     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context | ||||
|     // append without newline tokens (default behavior in llava_arch when not using unpad ): | ||||
|     memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches | ||||
|     *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip)); | ||||
|  | ||||
|     // Debug: Test single segments | ||||
|     // Current findings: sending base image, sending a segment embedding all works similar to python | ||||
|     // However, permuted embeddings do not work yet (stride issue?) | ||||
|     // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context | ||||
|     // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context | ||||
|     // *n_img_pos_out=576; | ||||
|  | ||||
|     ggml_free(model.ctx); | ||||
|     return true; | ||||
| } | ||||
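The ggml view/permute/cont/view sequence above mirrors the simplified PyTorch reference quoted in the comments. A numpy sketch of the same reshuffle for a 2x2 grid of 24x24 patches with 4096-dim embeddings (shapes only, random data; the mapping to ggml's reversed dimension order is glossed over):

```python
import numpy as np

# Numpy equivalent of the simplified PyTorch reference above (shapes only).
grid_h, grid_w, side, embd = 2, 2, 24, 4096

# encoder output for the grid tiles: (num_images - 1, n_patches, n_embd)
feats = np.random.rand(grid_h * grid_w, side * side, embd).astype(np.float32)

x = feats.reshape(grid_h, grid_w, side, side * embd)  # ~ the 4D view over image_features
x = x.transpose(0, 2, 1, 3)                           # ~ permute(0, 2, 1, 3)
x = np.ascontiguousarray(x)                           # ~ ggml_cont()
out = x.reshape(-1, embd)                             # ~ the final 2D flatten view

print(out.shape)   # (2304, 4096) == grid_h * side * grid_w * side rows
```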
|  | ||||
| #include "base64.hpp" | ||||
|  | ||||
| static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { | ||||
|     clip_image_f32 * img_res = clip_image_f32_init(); | ||||
|     if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) { | ||||
|     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 | ||||
|     clip_image_f32_batch img_res_v; | ||||
|     img_res_v.size = 0; | ||||
|     img_res_v.data = nullptr; | ||||
|     if (!clip_image_preprocess(ctx_clip, img, img_res_v)) { | ||||
|         fprintf(stderr, "%s: unable to preprocess image\n", __func__); | ||||
|         clip_image_f32_free(img_res); | ||||
|         delete[] img_res_v.data; | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
|     *n_img_pos = clip_n_patches(ctx_clip); | ||||
|  | ||||
|     const int64_t t_img_enc_start_us = ggml_time_us(); | ||||
|     bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); | ||||
|     clip_image_f32_free(img_res); | ||||
|     if (!encoded) { | ||||
|         fprintf(stderr, "Unable to encode image\n"); | ||||
|  | ||||
|         return false; | ||||
|     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); | ||||
|  | ||||
|     if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { | ||||
|         // flat / default llava-1.5 type embedding | ||||
|         *n_img_pos = clip_n_patches(ctx_clip); | ||||
|         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 | ||||
|         delete[] img_res_v.data; | ||||
|         if (!encoded) { | ||||
|             fprintf(stderr, "Unable to encode image\n"); | ||||
|  | ||||
|             return false; | ||||
|         } | ||||
|     } else { | ||||
|         // spatial_unpad llava-1.6 type embedding | ||||
|         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working | ||||
|         std::vector<float *> image_embd_v; | ||||
|         image_embd_v.resize(img_res_v.size); | ||||
|         for (size_t i = 0; i < img_res_v.size; i++) { | ||||
|             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 | ||||
|             const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside | ||||
|             if (!encoded) { | ||||
|                 fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|         const int64_t t_img_enc_batch_us = ggml_time_us(); | ||||
|         printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); | ||||
|  | ||||
|         const int32_t * image_grid = clip_image_grid(ctx_clip); | ||||
|  | ||||
|         std::vector<std::pair<int, int>> grid_pinpoints; | ||||
|         for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) { | ||||
|             grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); | ||||
|         } | ||||
|  | ||||
|         // free all img_res_v - not needed anymore | ||||
|         delete[] img_res_v.data; | ||||
|         img_res_v.size = 0; | ||||
|         img_res_v.data = nullptr; | ||||
|  | ||||
|         const int32_t image_size = clip_image_size(ctx_clip); | ||||
|  | ||||
|         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); | ||||
|  | ||||
|         int n_img_pos_out; | ||||
|         clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); | ||||
|         *n_img_pos = n_img_pos_out; | ||||
|  | ||||
|         for (size_t i = 0; i < image_embd_v.size(); i++) { | ||||
|             free(image_embd_v[i]); | ||||
|         } | ||||
|         image_embd_v.clear(); | ||||
|  | ||||
|         // debug image/segment/normalization content: | ||||
|         // clip_image_u8 * tmp = clip_image_u8_init(); | ||||
|         // clip_image_convert_f32_to_u8(*image_feature, *tmp); | ||||
|         // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); | ||||
|     } | ||||
|  | ||||
|     printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); | ||||
|  | ||||
|     const int64_t t_img_enc_end_us = ggml_time_us(); | ||||
|     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; | ||||
|  | ||||
| @@ -48,7 +312,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * | ||||
| } | ||||
|  | ||||
| static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { | ||||
|     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)); | ||||
|     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model | ||||
|     if (!image_embd) { | ||||
|         fprintf(stderr, "Unable to allocate memory for image embeddings\n"); | ||||
|         free(image_embd); | ||||
| @@ -85,7 +349,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { | ||||
| struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) { | ||||
|     clip_image_u8 * img = clip_image_u8_init(); | ||||
|     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { | ||||
|         clip_image_u8_free(img); | ||||
| @@ -142,7 +406,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { | ||||
| struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { | ||||
|     unsigned char* image_bytes; | ||||
|     long image_bytes_length; | ||||
|     auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); | ||||
| @@ -151,13 +415,13 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct | ||||
|         return NULL; | ||||
|     } | ||||
|  | ||||
|     auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); | ||||
|     llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length); | ||||
|     free(image_bytes); | ||||
|  | ||||
|     return embed; | ||||
| } | ||||
|  | ||||
| LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) { | ||||
| void llava_image_embed_free(struct llava_image_embed * embed) { | ||||
|     free(embed->embed); | ||||
|     free(embed); | ||||
| } | ||||
|   | ||||
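The llava_image_embed_make_with_clip_img() change above over-allocates the embedding buffer with a fixed factor (clip_embd_nbytes(ctx_clip) * 6) rather than deriving it from the grid, per its TODO. A sketch of why a factor of 6 is sufficient for the default llava-1.6 pinpoints (numbers assumed from the rest of this diff):

```python
# Sketch: worst-case image count (base image + grid tiles) for the default pinpoints.
image_size = 336
pinpoints  = [(336, 672), (672, 336), (672, 672), (1008, 336), (336, 1008)]

worst = max((w // image_size) * (h // image_size) for w, h in pinpoints) + 1
print(worst)   # 5, so a 6x buffer covers the base image plus any grid in this list
```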
| @@ -3,7 +3,6 @@ | ||||
|  | ||||
| #include "ggml.h" | ||||
|  | ||||
|  | ||||
| #ifdef LLAMA_SHARED | ||||
| #    if defined(_WIN32) && !defined(__MINGW32__) | ||||
| #        ifdef LLAMA_BUILD | ||||
| @@ -42,7 +41,6 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); | ||||
| /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ | ||||
| LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); | ||||
|  | ||||
|  | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
|   | ||||
| @@ -968,13 +968,20 @@ struct llama_server_context | ||||
|             { | ||||
|                 continue; | ||||
|             } | ||||
|             clip_image_f32 * img_res = clip_image_f32_init(); | ||||
|             if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true)) | ||||
|             clip_image_f32_batch img_res_v; | ||||
|             img_res_v.size = 0; | ||||
|             img_res_v.data = nullptr; | ||||
|             if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v)) | ||||
|             { | ||||
|                 LOG_TEE("Error processing the given image"); | ||||
|                 clip_free(clp_ctx); | ||||
|                 clip_image_f32_free(img_res_v.data); | ||||
|                 return false; | ||||
|             } | ||||
|  | ||||
|             // note: assumes only one image was returned by clip_image_preprocess | ||||
|             clip_image_f32 * img_res = img_res_v.data; | ||||
|  | ||||
|             img.image_tokens = clip_n_patches(clp_ctx); | ||||
|             img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx)); | ||||
|             if (!img.image_embedding) | ||||
| @@ -989,7 +996,9 @@ struct llama_server_context | ||||
|                 LOG_TEE("Unable to encode image\n"); | ||||
|                 return false; | ||||
|             } | ||||
|             clip_image_f32_free(img_res); | ||||
|  | ||||
|             clip_image_f32_free(img_res_v.data); | ||||
|  | ||||
|             img.request_encode_image = false; | ||||
|         } | ||||
|  | ||||
|   | ||||