mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	optimize deepstack feature saving
This commit is contained in:
		@@ -846,9 +846,12 @@ struct clip_graph {
 | 
				
			|||||||
        GGML_ASSERT(model.patch_bias != nullptr);
 | 
					        GGML_ASSERT(model.patch_bias != nullptr);
 | 
				
			||||||
        GGML_ASSERT(model.position_embeddings != nullptr);
 | 
					        GGML_ASSERT(model.position_embeddings != nullptr);
 | 
				
			||||||
        GGML_ASSERT(model.class_embedding == nullptr);
 | 
					        GGML_ASSERT(model.class_embedding == nullptr);
 | 
				
			||||||
 | 
					        GGML_ASSERT(hparams.spatial_merge_size == 2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        const int batch_size       = 1;
 | 
					        const int batch_size       = 1;
 | 
				
			||||||
 | 
					        const int merge_factor     = 4;
 | 
				
			||||||
        const int n_pos            = n_patches;
 | 
					        const int n_pos            = n_patches;
 | 
				
			||||||
 | 
					        const int n_pos_merged     = n_pos / merge_factor;
 | 
				
			||||||
        const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
 | 
					        const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        norm_type norm_t = NORM_TYPE_NORMAL;
 | 
					        norm_type norm_t = NORM_TYPE_NORMAL;
 | 
				
			||||||
@@ -911,9 +914,23 @@ struct clip_graph {
 | 
				
			|||||||
            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
 | 
					            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
 | 
					        int deepstack_layer_idx = 1; // begin with 1 to jump main feature
 | 
				
			||||||
        ggml_tensor * deepstack_features = nullptr;
 | 
					        const int llm_n_embd = model.mm_1_w->ne[1]; // llm token dim
 | 
				
			||||||
        const int merge_factor = hparams.spatial_merge_size > 0 ? hparams.spatial_merge_size * hparams.spatial_merge_size : 4; // default 2x2=4 for qwen3vl
 | 
					        const int n_deepstack_layers = std::count(hparams.is_deepstack_layers.begin(), hparams.is_deepstack_layers.end(), true);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        const size_t element_size = ggml_type_size(inpL->type);
 | 
				
			||||||
 | 
					        const size_t slice_offsets = llm_n_embd * n_pos_merged * batch_size * element_size;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        ggml_tensor * final_embedding = ggml_new_tensor_3d(ctx0, inpL->type,
 | 
				
			||||||
 | 
					            llm_n_embd * (n_deepstack_layers + 1), n_pos_merged, batch_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        auto make_deepstack_slice = [&](int idx) {
 | 
				
			||||||
 | 
					            return ggml_view_3d(ctx0, final_embedding,
 | 
				
			||||||
 | 
					                llm_n_embd, n_pos_merged, batch_size,
 | 
				
			||||||
 | 
					                llm_n_embd * element_size,
 | 
				
			||||||
 | 
					                slice_offsets,
 | 
				
			||||||
 | 
					                idx * slice_offsets);
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // loop over layers
 | 
					        // loop over layers
 | 
				
			||||||
        for (int il = 0; il < n_layer; il++) {
 | 
					        for (int il = 0; il < n_layer; il++) {
 | 
				
			||||||
@@ -990,13 +1007,7 @@ struct clip_graph {
 | 
				
			|||||||
                    nullptr, nullptr,
 | 
					                    nullptr, nullptr,
 | 
				
			||||||
                    layer.deepstack_fc2_w, layer.deepstack_fc2_b,
 | 
					                    layer.deepstack_fc2_w, layer.deepstack_fc2_b,
 | 
				
			||||||
                    ffn_op_type::FFN_GELU, il);
 | 
					                    ffn_op_type::FFN_GELU, il);
 | 
				
			||||||
 | 
					                ggml_cpy(ctx0, feat, make_deepstack_slice(deepstack_layer_idx++));
 | 
				
			||||||
                if(!deepstack_features) {
 | 
					 | 
				
			||||||
                    deepstack_features = feat;
 | 
					 | 
				
			||||||
                } else {
 | 
					 | 
				
			||||||
                    // concat along the feature dimension
 | 
					 | 
				
			||||||
                    deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            inpL = cur;
 | 
					            inpL = cur;
 | 
				
			||||||
@@ -1017,7 +1028,7 @@ struct clip_graph {
 | 
				
			|||||||
            model.mm_1_w, model.mm_1_b,
 | 
					            model.mm_1_w, model.mm_1_b,
 | 
				
			||||||
            ffn_op_type::FFN_GELU, -1);
 | 
					            ffn_op_type::FFN_GELU, -1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
 | 
					        ggml_cpy(ctx0, embeddings, make_deepstack_slice(0));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // build the graph
 | 
					        // build the graph
 | 
				
			||||||
        ggml_build_forward_expand(gf, embeddings);
 | 
					        ggml_build_forward_expand(gf, embeddings);
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user