Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
Commit 8fcb563613
* llama : introduce llama_set_warmup() API call that controls warmup mode; use all MoE experts during warmup
* common : use new API to enable warmup mode during model warmup

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
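A minimal sketch of how the new call might be used around warmup decoding (only llama_set_warmup() is introduced by this commit; the helper name and the single-token batch are illustrative):

#include "llama.h"

// Illustrative helper (not part of this commit): run one dummy decode with
// warmup mode enabled so every MoE expert's weights are touched once.
static void warmup_model(llama_context * ctx, const llama_model * model) {
    llama_set_warmup(ctx, true); // all experts active during warmup

    llama_token bos   = llama_vocab_bos(llama_model_get_vocab(model));
    llama_batch batch = llama_batch_get_one(&bos, 1); // dummy single-token batch
    llama_decode(ctx, batch);

    llama_set_warmup(ctx, false); // restore normal expert routing
    // a full implementation would also reset the KV/memory state here
}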
39 lines · 951 B · C++
#pragma once

#include "llama.h"

#include <cstdint>

struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;         // logical maximum batch size
    uint32_t n_ubatch;        // physical maximum batch size
    uint32_t n_seq_max;       // maximum number of sequences
    int      n_threads;       // number of threads to use for generation
    int      n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;
    float rope_freq_scale;

    uint32_t n_ctx_orig_yarn;
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;
    float yarn_attn_factor;
    float yarn_beta_fast;
    float yarn_beta_slow;
    float defrag_thold;

    bool embeddings;
    bool causal_attn;
    bool offload_kqv;
    bool flash_attn;
    bool no_perf;
    bool warmup; // warmup mode: use all MoE experts (toggled via llama_set_warmup())

    enum llama_pooling_type pooling_type;

    ggml_backend_sched_eval_callback cb_eval;
    void * cb_eval_user_data;
};
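The commit message states that all MoE experts are used during warmup. A plausible sketch of how the warmup flag could be consumed when selecting the per-token expert count (the names n_expert and n_expert_used are assumptions following llama.cpp's hparams conventions; this is not the actual graph-building code):

#include <cstdint>

// Sketch: during warmup, route through every expert so that all expert
// weights are loaded/uploaded once; otherwise use the configured top-k.
static int64_t select_n_expert_used(const llama_cparams & cparams,
                                    int64_t n_expert,       // total experts in the model
                                    int64_t n_expert_used)  // experts normally active per token
{
    return cparams.warmup ? n_expert : n_expert_used;
}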