llama : more consistent names of count variables (#5994)

Author: Georgi Gerganov

* llama : more consistent names of count variables

  ggml-ci

* llama : n_parallel -> n_seq_max
* common : fix param name
* examples : fix param name
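As a quick illustration of what the renames mean for callers (this sketch is not part of the commit; the helper name and chosen values are placeholders, and error handling is omitted), here is a minimal, hypothetical C++ usage of the renamed public symbols: the `n_seq_max` field of `llama_context_params` (formerly `n_parallel`), the getter `llama_n_seq_max()` (formerly `llama_n_max_seq()`), and the `n_seq_max` parameter of `llama_kv_cache_view_init()` (formerly `n_max_seq`). It assumes the backend is initialized and a `struct llama_model * model` has already been loaded; the full diff follows below.

```cpp
#include "llama.h"

// Hypothetical helper (not from the commit): create a context that can track
// up to 4 parallel sequences and inspect its KV cache, using the renamed API.
static void inspect_kv_cache(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 2048;
    cparams.n_batch   = 512;
    cparams.n_seq_max = 4;   // formerly cparams.n_parallel

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // formerly llama_n_max_seq(); per the diff it reports ctx->kv_self.size
    const uint32_t n_seq_max = llama_n_seq_max(ctx);

    // the KV cache view takes the renamed n_seq_max parameter (formerly n_max_seq)
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, (int32_t) n_seq_max);
    llama_kv_cache_view_update(ctx, &view);
    // ... inspect view.n_cells, view.used_cells, view.cells_sequences here ...
    llama_kv_cache_view_free(&view);

    llama_free(ctx);
}
```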
Changed files: llama.cpp (24 changed lines)
@@ -12538,7 +12538,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
-        /*.n_parallel                  =*/ 1,
+        /*.n_seq_max                   =*/ 1,
         /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
         /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
@@ -12700,7 +12700,7 @@ struct llama_context * llama_new_context_with_model(
     auto       & cparams = ctx->cparams;

     cparams.n_batch          = params.n_batch;
-    // TODO: maybe add n_parallel here too
+    // TODO: maybe add n_seq_max here too
     cparams.n_threads        = params.n_threads;
     cparams.n_threads_batch  = params.n_threads_batch;
     cparams.yarn_ext_factor  = params.yarn_ext_factor;
@@ -12767,7 +12767,7 @@ struct llama_context * llama_new_context_with_model(
     // Mamba only needs a constant number of KV cache cells per sequence
     if (model->arch == LLM_ARCH_MAMBA) {
         // Mamba needs at least as many KV cells as there are sequences kept at any time
-        kv_size = std::max((uint32_t) 1, params.n_parallel);
+        kv_size = std::max((uint32_t) 1, params.n_seq_max);
         // it's probably best to keep as much precision as possible for the states
         type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
         type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
@@ -13024,7 +13024,7 @@ uint32_t llama_n_batch(const struct llama_context * ctx) {
     return ctx->cparams.n_batch;
 }

-uint32_t llama_n_max_seq(const struct llama_context * ctx) {
+uint32_t llama_n_seq_max(const struct llama_context * ctx) {
     return ctx->kv_self.size;
 }

@@ -13188,10 +13188,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }

-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
     struct llama_kv_cache_view result = {
         /*.n_cells            = */ 0,
-        /*.n_max_seq          = */ n_max_seq,
+        /*.n_seq_max          = */ n_seq_max,
         /*.token_count        = */ 0,
         /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
         /*.max_contiguous     = */ 0,
@@ -13219,7 +13219,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
         void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
         view->cells = (struct llama_kv_cache_view_cell *)p;
-        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
         view->cells_sequences = (llama_seq_id *)p;
     }
@@ -13233,7 +13233,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
     uint32_t max_contig = 0;
     int32_t max_contig_idx = -1;

-    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
         const size_t curr_size = kv_cells[i].seq_id.size();
         token_count += curr_size;
         c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
@@ -13250,7 +13250,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k

         int seq_idx = 0;
         for (const llama_seq_id it : kv_cells[i].seq_id) {
-            if (seq_idx >= view->n_max_seq) {
+            if (seq_idx >= view->n_seq_max) {
                 break;
             }
             cs_curr[seq_idx] = it;
@@ -13259,7 +13259,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
         if (seq_idx != 0) {
             used_cells++;
         }
-        for (; seq_idx < view->n_max_seq; seq_idx++) {
+        for (; seq_idx < view->n_seq_max; seq_idx++) {
             cs_curr[seq_idx] = -1;
         }
     }
@@ -13921,12 +13921,12 @@ int32_t llama_tokenize(
                   const char * text,
                      int32_t   text_len,
                  llama_token * tokens,
-                     int32_t   n_max_tokens,
+                     int32_t   n_tokens_max,
                         bool   add_bos,
                         bool   special) {
     auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);

-    if (n_max_tokens < (int) res.size()) {
+    if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
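The last hunk also documents the `llama_tokenize()` calling convention: when the provided buffer is smaller than the token count, the function returns the negative of the required size. Below is a hedged sketch (not part of the commit; the helper name and initial buffer heuristic are made up) of how a caller might use the renamed `n_tokens_max` parameter together with that convention:

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Hypothetical helper: tokenize `text`, retrying once if the initial
// n_tokens_max guess is too small (signalled by a negative return value).
static std::vector<llama_token> tokenize_text(const struct llama_model * model,
                                              const std::string & text,
                                              bool add_bos) {
    // Initial guess for n_tokens_max; the heuristic is arbitrary for this sketch.
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));

    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               add_bos, /*special =*/ false);
    if (n < 0) {
        // Buffer too small: -n is the number of tokens actually required.
        tokens.resize((size_t) -n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(),
                           add_bos, /*special =*/ false);
    }

    tokens.resize(n > 0 ? (size_t) n : 0);
    return tokens;
}
```

If the second call still reports a shortfall, a real caller would treat that as an error; that handling is omitted here.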