diff --git a/common/common.cpp b/common/common.cpp
index e93af5b7a4..bd20af2336 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                     params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = false;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
diff --git a/include/llama.h b/include/llama.h
index 06c56395c1..e18e9b8da3 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -351,19 +351,17 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
     };
 
     // model quantization parameters
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 28cb94e975..dadb87517f 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1851,13 +1851,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data   =*/ nullptr,
         /*.type_k              =*/ GGML_TYPE_F16,
         /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
         /*.no_perf             =*/ true,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
     };
 
     return result;
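
Migration note: callers that previously set `logits_all` should request per-token logits through `llama_batch.logits`, as the deprecation comment on the removed field already indicated. Below is a minimal sketch of that pattern; the `ctx`, `tokens`, and `n_tokens` variables and the single-sequence setup are illustrative assumptions, not part of this change:

    // Request logits for every token of the batch explicitly,
    // instead of relying on the removed logits_all context flag.
    llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, /*n_seq_max =*/ 1);
    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token[i]     = tokens[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;     // single sequence, id 0 (assumption)
        batch.logits[i]    = true;  // previously implied by logits_all
    }
    batch.n_tokens = n_tokens;

    if (llama_decode(ctx, batch) != 0) {
        // decode failed or was aborted via abort_callback
    }
    llama_batch_free(batch);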