	llama : add support for control vectors (#5970)
* control vector api and implementation

* control-vectors : minor code style updates

* disable control vector when data == nullptr
  use -1 for disabled range (also on init) in case we ever support controlling layer 0 (embeddings)

---------

Author: Theia Vogel
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
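For orientation before the diff: a minimal usage sketch of the new public API, assuming an already-loaded model and context. The signature of llama_control_vector_apply and the buffer layout (one n_embd row per layer, starting at layer 1) are taken from the diff below; the enable_control_vector wrapper and its data source are hypothetical.

// usage sketch (the wrapper is illustrative; this PR only defines how the
// floats are consumed, not where they come from)
#include "llama.h"
#include <vector>

bool enable_control_vector(llama_context * ctx, const llama_model * model,
                           const std::vector<float> & data) {
    const int32_t n_embd  = llama_n_embd(model);
    const int32_t n_layer = llama_n_layer(model);   // accessor added by this PR

    // data holds one n_embd row per layer, starting at layer 1 (no row for
    // the embedding layer); steer the inclusive range [1, n_layer - 1]
    return llama_control_vector_apply(ctx, data.data(), data.size(),
                                      n_embd, 1, n_layer - 1) == 0;
}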
 llama.cpp | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
@@ -1894,6 +1894,31 @@ struct llama_kv_cache {
     }
 };
 
+struct llama_control_vector {
+    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_backend_buffer_t> bufs;
+
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
+
+    ggml_tensor * tensor_for(int il) const {
+        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+            return nullptr;
+        }
+        return tensors[il];
+    }
+
+    ~llama_control_vector() {
+        for (struct ggml_context * ctx : ctxs) {
+            ggml_free(ctx);
+        }
+        for (ggml_backend_buffer_t buf : bufs) {
+            ggml_backend_buffer_free(buf);
+        }
+    }
+};
+
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
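tensor_for() is a pure range check: the [layer_start, layer_end] window is inclusive, the -1/-1 default disables every layer, and out-of-range indices fall through to nullptr. A standalone sketch of just that logic, with ints standing in for ggml_tensor pointers:

// behavioral sketch of tensor_for(), extracted for illustration
#include <cstddef>
#include <cstdio>
#include <vector>

struct cvec_sketch {
    std::vector<int> tensors;   // stand-in for per-layer tensor pointers; 0 == none
    int layer_start = -1;       // -1/-1 means disabled, as after init
    int layer_end   = -1;

    int tensor_for(int il) const {
        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
            return 0;
        }
        return tensors[il];
    }
};

int main() {
    cvec_sketch c;
    c.tensors = {0, 11, 22, 33};   // layer 0 never has a tensor
    c.layer_start = 2;
    c.layer_end   = 3;             // window is inclusive
    std::printf("%d %d %d %d\n", c.tensor_for(1), c.tensor_for(2),
                c.tensor_for(3), c.tensor_for(4));   // prints: 0 22 33 0
}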
@@ -2108,6 +2133,9 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask;    // F32 [1, kv_size]
     struct ggml_tensor * inp_s_seq;     // I32 [kv_size, n_batch]
 
+    // control vectors
+    struct llama_control_vector cvec;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
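Note that the control vector hangs off llama_context, not llama_model, so steering is per-context state and its buffers are released by ~llama_control_vector when the context goes away. A hedged sketch (demo_per_context, cparams, and data are illustrative; the llama.h calls are real):

// two contexts over one model: one steered, one not (error handling elided)
#include "llama.h"
#include <vector>

void demo_per_context(llama_model * model, llama_context_params cparams,
                      const std::vector<float> & data) {
    llama_context * ctx_plain   = llama_new_context_with_model(model, cparams);
    llama_context * ctx_steered = llama_new_context_with_model(model, cparams);

    llama_control_vector_apply(ctx_steered, data.data(), data.size(),
                               llama_n_embd(model), 1, llama_n_layer(model) - 1);

    // ctx_plain decodes unmodified; freeing each context frees its own
    // cvec tensors and buffers via ~llama_control_vector
    llama_free(ctx_plain);
    llama_free(ctx_steered);
}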
@@ -5931,6 +5959,12 @@ struct llm_build_context {
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
             cb(cur, "l_out", il);
 
             // input for next layer
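The graph edit itself is one optional ggml_add at the tail of each block: after the FFN residual join, the layer's steering vector, if any, is added to the hidden state as a constant bias, broadcast over all token positions by ggml_add. The per-token effect, in plain C++ (a sketch, not the ggml code):

// per-token effect of the added node: h <- h + v, with the same v applied
// at every position in the batch
#include <vector>

void apply_layer_dir(std::vector<float> & h /* n_embd, one token */,
                     const std::vector<float> & v /* n_embd */) {
    for (size_t i = 0; i < h.size(); i++) {
        h[i] += v[i];
    }
}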
@@ -13366,6 +13400,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
 float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }
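llama_n_layer is exposed so callers can size and validate the flat control vector buffer without reaching into hparams. As the apply implementation below shows, the buffer carries n_layer - 1 rows of n_embd floats (layer 0 never has a row), and a shorter buffer simply leaves trailing layers untouched. A sizing sketch (make_empty_control_vector is illustrative):

// expected layout: data[(il - 1) * n_embd .. il * n_embd) steers layer il
#include "llama.h"
#include <vector>

std::vector<float> make_empty_control_vector(const llama_model * model) {
    const size_t n_embd  = (size_t) llama_n_embd(model);
    const size_t n_layer = (size_t) llama_n_layer(model);
    return std::vector<float>(n_embd * (n_layer - 1), 0.0f);  // no row for layer 0
}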
@@ -13465,6 +13503,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }
 
+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // count layer buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+        buft_layer_count[model.buft_layer[i].buft]++;
+    }
+
+    // allocate contexts
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (auto & it : buft_layer_count) {
+        int n_layers = it.second;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ n_layers * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return false;
+        }
+        ctx_map[it.first] = ctx;
+    }
+
+    // make tensors
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.ctxs.push_back(ctx);
+        cvec.bufs.push_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+    const llama_model & model = lctx->model;
+    llama_control_vector & cvec = lctx->cvec;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) model.hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
     struct llama_kv_cache_view result = {
         /*.n_cells            = */ 0,
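Two behaviors of llama_control_vector_apply worth calling out: data == nullptr only resets the layer window, leaving the tensors allocated with their last contents so re-enabling is cheap, and a buffer shorter than n_embd * (n_layer - 1) updates only the rows it covers (rows past len are not re-zeroed). A call-sequence sketch (demo_toggle is illustrative):

// toggling steering on and off over one context
#include "llama.h"
#include <vector>

void demo_toggle(llama_context * ctx, const llama_model * model,
                 const std::vector<float> & data) {
    const int32_t n_embd  = llama_n_embd(model);
    const int32_t n_layer = llama_n_layer(model);

    // on: upload rows and set the inclusive layer window
    llama_control_vector_apply(ctx, data.data(), data.size(), n_embd, 1, n_layer - 1);

    // off: nullptr resets the window to the -1 sentinel; buffers stay
    // allocated, so a later re-apply skips llama_control_vector_init
    llama_control_vector_apply(ctx, nullptr, 0, n_embd, -1, -1);
}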