mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	context : remove logits_all flag (#13284)
* context : remove logits_all flag ggml-ci * llama : remove logits_all flag + reorder llama_context_params ggml-ci
This commit is contained in:
		| @@ -2097,13 +2097,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex | |||||||
|             params.cache_type_v = kv_cache_type_from_str(value); |             params.cache_type_v = kv_cache_type_from_str(value); | ||||||
|         } |         } | ||||||
|     ).set_env("LLAMA_ARG_CACHE_TYPE_V")); |     ).set_env("LLAMA_ARG_CACHE_TYPE_V")); | ||||||
|     add_opt(common_arg( |  | ||||||
|         {"--perplexity", "--all-logits"}, |  | ||||||
|         string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), |  | ||||||
|         [](common_params & params) { |  | ||||||
|             params.logits_all = true; |  | ||||||
|         } |  | ||||||
|     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); |  | ||||||
|     add_opt(common_arg( |     add_opt(common_arg( | ||||||
|         {"--hellaswag"}, |         {"--hellaswag"}, | ||||||
|         "compute HellaSwag score over random tasks from datafile supplied with -f", |         "compute HellaSwag score over random tasks from datafile supplied with -f", | ||||||
|   | |||||||
| @@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & | |||||||
|     cparams.n_threads         = params.cpuparams.n_threads; |     cparams.n_threads         = params.cpuparams.n_threads; | ||||||
|     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ? |     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ? | ||||||
|                                 params.cpuparams.n_threads : params.cpuparams_batch.n_threads; |                                 params.cpuparams.n_threads : params.cpuparams_batch.n_threads; | ||||||
|     cparams.logits_all        = params.logits_all; |  | ||||||
|     cparams.embeddings        = params.embedding; |     cparams.embeddings        = params.embedding; | ||||||
|     cparams.rope_scaling_type = params.rope_scaling_type; |     cparams.rope_scaling_type = params.rope_scaling_type; | ||||||
|     cparams.rope_freq_base    = params.rope_freq_base; |     cparams.rope_freq_base    = params.rope_freq_base; | ||||||
|   | |||||||
| @@ -324,7 +324,6 @@ struct common_params { | |||||||
|     bool ctx_shift         = true;  // context shift on inifinite text generation |     bool ctx_shift         = true;  // context shift on inifinite text generation | ||||||
|  |  | ||||||
|     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix |     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix | ||||||
|     bool logits_all        = false; // return logits for all tokens in the batch |  | ||||||
|     bool use_mmap          = true;  // use mmap for faster loads |     bool use_mmap          = true;  // use mmap for faster loads | ||||||
|     bool use_mlock         = false; // use mlock to keep model in memory |     bool use_mlock         = false; // use mlock to keep model in memory | ||||||
|     bool verbose_prompt    = false; // print prompt tokens before generation |     bool verbose_prompt    = false; // print prompt tokens before generation | ||||||
|   | |||||||
| @@ -351,19 +351,17 @@ extern "C" { | |||||||
|         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] |         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] | ||||||
|         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] |         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] | ||||||
|  |  | ||||||
|         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. |  | ||||||
|         // TODO: move at the end of the struct |  | ||||||
|         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) |  | ||||||
|         bool embeddings;  // if true, extract embeddings (together with logits) |  | ||||||
|         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU |  | ||||||
|         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL] |  | ||||||
|         bool no_perf;     // whether to measure performance timings |  | ||||||
|  |  | ||||||
|         // Abort callback |         // Abort callback | ||||||
|         // if it returns true, execution of llama_decode() will be aborted |         // if it returns true, execution of llama_decode() will be aborted | ||||||
|         // currently works only with CPU execution |         // currently works only with CPU execution | ||||||
|         ggml_abort_callback abort_callback; |         ggml_abort_callback abort_callback; | ||||||
|         void *              abort_callback_data; |         void *              abort_callback_data; | ||||||
|  |  | ||||||
|  |         // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. | ||||||
|  |         bool embeddings;  // if true, extract embeddings (together with logits) | ||||||
|  |         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU | ||||||
|  |         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL] | ||||||
|  |         bool no_perf;     // whether to measure performance timings | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     // model quantization parameters |     // model quantization parameters | ||||||
|   | |||||||
| @@ -116,8 +116,6 @@ llama_context::llama_context( | |||||||
|                 __func__, n_ctx_per_seq, hparams.n_ctx_train); |                 __func__, n_ctx_per_seq, hparams.n_ctx_train); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     logits_all = params.logits_all; |  | ||||||
|  |  | ||||||
|     if (!hparams.vocab_only) { |     if (!hparams.vocab_only) { | ||||||
|         // GPU backends |         // GPU backends | ||||||
|         for (auto * dev : model.devices) { |         for (auto * dev : model.devices) { | ||||||
| @@ -890,7 +888,7 @@ int llama_context::decode(llama_batch & inp_batch) { | |||||||
|         for (uint32_t i = 0; i < n_tokens_all; ++i) { |         for (uint32_t i = 0; i < n_tokens_all; ++i) { | ||||||
|             n_outputs_all += batch.logits[i] != 0; |             n_outputs_all += batch.logits[i] != 0; | ||||||
|         } |         } | ||||||
|     } else if (logits_all || embd_pooled) { |     } else if (embd_pooled) { | ||||||
|         n_outputs_all = n_tokens_all; |         n_outputs_all = n_tokens_all; | ||||||
|     } else { |     } else { | ||||||
|         // keep last output only |         // keep last output only | ||||||
| @@ -1853,13 +1851,12 @@ llama_context_params llama_context_default_params() { | |||||||
|         /*.cb_eval_user_data           =*/ nullptr, |         /*.cb_eval_user_data           =*/ nullptr, | ||||||
|         /*.type_k                      =*/ GGML_TYPE_F16, |         /*.type_k                      =*/ GGML_TYPE_F16, | ||||||
|         /*.type_v                      =*/ GGML_TYPE_F16, |         /*.type_v                      =*/ GGML_TYPE_F16, | ||||||
|         /*.logits_all                  =*/ false, |         /*.abort_callback              =*/ nullptr, | ||||||
|  |         /*.abort_callback_data         =*/ nullptr, | ||||||
|         /*.embeddings                  =*/ false, |         /*.embeddings                  =*/ false, | ||||||
|         /*.offload_kqv                 =*/ true, |         /*.offload_kqv                 =*/ true, | ||||||
|         /*.flash_attn                  =*/ false, |         /*.flash_attn                  =*/ false, | ||||||
|         /*.no_perf                     =*/ true, |         /*.no_perf                     =*/ true, | ||||||
|         /*.abort_callback              =*/ nullptr, |  | ||||||
|         /*.abort_callback_data         =*/ nullptr, |  | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     return result; |     return result; | ||||||
|   | |||||||
| @@ -187,9 +187,6 @@ private: | |||||||
|  |  | ||||||
|     std::unique_ptr<llama_memory_i> memory; |     std::unique_ptr<llama_memory_i> memory; | ||||||
|  |  | ||||||
|     // TODO: remove |  | ||||||
|     bool logits_all = false; |  | ||||||
|  |  | ||||||
|     // decode output (2-dimensional array: [n_outputs][n_vocab]) |     // decode output (2-dimensional array: [n_outputs][n_vocab]) | ||||||
|     size_t  logits_size = 0; // capacity (of floats) for logits |     size_t  logits_size = 0; // capacity (of floats) for logits | ||||||
|     float * logits      = nullptr; |     float * logits      = nullptr; | ||||||
|   | |||||||
| @@ -585,7 +585,6 @@ int main(int argc, char ** argv) { | |||||||
|     params.out_file = "imatrix.dat" ; |     params.out_file = "imatrix.dat" ; | ||||||
|  |  | ||||||
|     params.n_ctx = 512; |     params.n_ctx = 512; | ||||||
|     params.logits_all = true; |  | ||||||
|     params.escape = false; |     params.escape = false; | ||||||
|  |  | ||||||
|     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { |     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { | ||||||
|   | |||||||
| @@ -99,14 +99,6 @@ int main(int argc, char ** argv) { | |||||||
|     console::init(params.simple_io, params.use_color); |     console::init(params.simple_io, params.use_color); | ||||||
|     atexit([]() { console::cleanup(); }); |     atexit([]() { console::cleanup(); }); | ||||||
|  |  | ||||||
|     if (params.logits_all) { |  | ||||||
|         LOG_ERR("************\n"); |  | ||||||
|         LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); |  | ||||||
|         LOG_ERR("************\n\n"); |  | ||||||
|  |  | ||||||
|         return 0; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     if (params.embedding) { |     if (params.embedding) { | ||||||
|         LOG_ERR("************\n"); |         LOG_ERR("************\n"); | ||||||
|         LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); |         LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); | ||||||
|   | |||||||
| @@ -1554,7 +1554,10 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par | |||||||
|             if (int(batch_indeces.size()) != num_answers) { |             if (int(batch_indeces.size()) != num_answers) { | ||||||
|                 batch_indeces.resize(num_answers); |                 batch_indeces.resize(num_answers); | ||||||
|             } |             } | ||||||
|             for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s; |  | ||||||
|  |             for (int s = 0; s < num_answers; ++s) { | ||||||
|  |                 batch_indeces[s] = s0 + s; | ||||||
|  |             } | ||||||
|  |  | ||||||
|             for (size_t i = 0; i < cur_task.common_prefix; ++i) { |             for (size_t i = 0; i < cur_task.common_prefix; ++i) { | ||||||
|                 //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); |                 //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); | ||||||
| @@ -1970,7 +1973,6 @@ int main(int argc, char ** argv) { | |||||||
|     common_params params; |     common_params params; | ||||||
|  |  | ||||||
|     params.n_ctx = 512; |     params.n_ctx = 512; | ||||||
|     params.logits_all = true; |  | ||||||
|     params.escape = false; |     params.escape = false; | ||||||
|  |  | ||||||
|     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { |     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov