	Revert "llava : add support for moondream vision language model (#6899)"
This reverts commit 46e12c4692.
			
			
This commit is contained in:
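The revert removes the Moondream entry from the supported-models list in the README and strips the optional-tensor plumbing that moondream support added to the llava CLIP code (clip.cpp): four has_* flags on clip_ctx and per-tensor try/catch probing in clip_model_load. A minimal sketch of that probe-and-flag pattern, the one being deleted below (stand-in types and tensor store, not the real gguf loader):

    // Sketch of the pattern this revert deletes: moondream support made several
    // ViT tensors optional by catching the lookup failure and recording a has_*
    // flag; the revert returns to loading them unconditionally in one try/catch.
    #include <map>
    #include <stdexcept>
    #include <string>

    struct tensor { /* stand-in for ggml_tensor */ };

    static std::map<std::string, tensor> weights; // stand-in for the gguf contents

    static tensor & get_tensor(const std::string & name) {
        auto it = weights.find(name);
        if (it == weights.end()) {
            throw std::runtime_error("tensor not found: " + name); // like get_tensor() in clip.cpp
        }
        return it->second;
    }

    int main() {
        bool has_patch_bias = false;    // the flag style being removed
        try {
            tensor & patch_bias = get_tensor("v.patch_embd.bias"); // TN_PATCH_BIAS
            (void) patch_bias;
            has_patch_bias = true;      // tensor present -> graph adds the bias
        } catch (const std::exception &) {
            has_patch_bias = false;     // tensor absent -> graph skips the ggml_add
        }
        return has_patch_bias ? 0 : 1;
    }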
@@ -140,7 +140,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
-- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 
 **HTTP server**
 
@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"
-#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -426,7 +425,6 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
-    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
 
     struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;
 
-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;
 
@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size           = hparams.patch_size;
     const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions        = num_patches + 1;
     const int hidden_size          = hparams.hidden_size;
     const int n_head               = hparams.n_head;
     const int d_head               = hidden_size / n_head;
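The reverted line hard-codes the class-token slot again: num_positions is always num_patches + 1, where moondream's CLS-less encoder needed the conditional. A worked example of the two formulas above, with assumed illustrative values (a 336 px image and 14 px patches, as in the CLIP-ViT-L/14-336 encoder commonly used with llava; not read from any model file):

    #include <cassert>

    int main() {
        const int image_size = 336; // assumed for illustration
        const int patch_size = 14;  // assumed for illustration

        const int num_patches   = (image_size / patch_size) * (image_size / patch_size); // 24 * 24 = 576
        const int num_positions = num_patches + 1; // 576 patch slots + 1 class-token slot = 577

        assert(num_positions == 577);
        return 0;
    }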
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
-    if (ctx->has_patch_bias) {
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
     // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
     ggml_set_name(embeddings, "embeddings");
     ggml_set_input(embeddings);
+
+    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+    embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
 
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
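The restored code assembles CLS-plus-patches by writing into one preallocated tensor at byte offsets: ggml_acc places model.class_embedding at offset 0 and the patch embeddings (inp) one row further in, at model.class_embedding->nb[1]. A plain-C++ sketch of the same row-offset concatenation (illustrative buffer code only; ggml records these as graph nodes rather than copying eagerly):

    #include <cstring>
    #include <vector>

    int main() {
        const int hidden_size   = 4; // toy sizes for illustration
        const int num_patches   = 3;
        const int num_positions = num_patches + 1;

        std::vector<float> class_embedding(hidden_size, 1.0f);
        std::vector<float> patches(num_patches * hidden_size, 2.0f);
        std::vector<float> embeddings(num_positions * hidden_size, 0.0f);

        const size_t nb1 = hidden_size * sizeof(float); // bytes per row, like tensor->nb[1]

        // ggml_acc(..., model.class_embedding, ..., /*offset=*/0): CLS goes to row 0
        std::memcpy(embeddings.data(), class_embedding.data(), nb1);

        // ggml_acc(..., inp, ..., /*offset=*/class_embedding->nb[1]): patches start at row 1
        std::memcpy(reinterpret_cast<char *>(embeddings.data()) + nb1,
                    patches.data(), num_patches * nb1);
        return 0;
    }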
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
 
     // pre-layernorm
-    if (ctx->has_pre_norm) {
+    {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");
 
@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }
 
-    // post-layernorm
-    if (ctx->has_post_norm) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
-
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-    }
-
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
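Both the (again unconditional) pre-layernorm and the deleted post-layernorm are the same three ops: ggml_norm normalizes each row to zero mean and unit variance with epsilon, and the learned weight and bias are then applied as an elementwise ggml_mul plus ggml_add. A reference sketch of that computation in plain C++ (illustrative; ggml evaluates it as graph nodes):

    #include <cmath>
    #include <vector>

    // Mirrors ggml_add(ggml_mul(ggml_norm(x, eps), w), b) for one row.
    std::vector<float> layer_norm(const std::vector<float> & x,
                                  const std::vector<float> & w,  // e.g. pre_ln_w / post_ln_w
                                  const std::vector<float> & b,  // e.g. pre_ln_b / post_ln_b
                                  float eps = 1e-5f) {           // stand-in for hparams eps
        const size_t n = x.size();

        float mean = 0.0f;
        for (float v : x) mean += v;
        mean /= n;

        float var = 0.0f;
        for (float v : x) var += (v - mean) * (v - mean);
        var /= n;

        std::vector<float> out(n);
        const float inv = 1.0f / std::sqrt(var + eps);
        for (size_t i = 0; i < n; ++i) {
            out[i] = (x[i] - mean) * inv * w[i] + b[i];
        }
        return out;
    }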
@@ -1170,39 +1148,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
         }
 
-        try {
-            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
-            new_clip->has_class_embedding = false;
-        }
-
-        try {
-            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_pre_norm = false;
-        }
-
-        try {
-            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_post_norm = false;
-        }
-
-        try {
-            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
-            new_clip->has_patch_bias = false;
-        }
-
         try {
             vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+            vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
         } catch(const std::exception& e) {
             LOG_TEE("%s: failed to load vision model tensors\n", __func__);
         }