Mirror of https://github.com/ggml-org/llama.cpp.git
	llama : add llama_sampling API + move grammar in libllama
ggml-ci
@@ -77,8 +77,6 @@ struct cpu_params {
 };
 
 struct gpt_params {
-    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
-
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =     0; // context size
     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -120,8 +118,7 @@ struct gpt_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    // // sampling parameters
-    struct llama_sampling_params sparams;
+    struct gpt_sampling_params sparams;
 
     std::string model                = ""; // model path
     std::string model_draft          = ""; // draft model for speculative decoding
@@ -185,7 +182,6 @@ struct gpt_params {
     bool flash_attn        = false; // flash attention
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos        = false; // ignore generated EOS tokens
     bool logits_all        = false; // return logits for all tokens in the batch
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
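For context, a minimal caller-side sketch of what the struct change above implies. This is an illustration, not part of the commit: it assumes that the seed and ignore_eos options removed from gpt_params become fields of the new gpt_sampling_params struct (consistent with the commit title, though their new location is not shown in these hunks), and the model path is hypothetical.

// Sketch only: assumes common.h from this branch and that gpt_sampling_params
// now owns seed / ignore_eos (not shown in the hunks above).
#include "common.h"

int main() {
    gpt_params params;

    // generation / context settings remain directly on gpt_params
    params.n_predict = 128;
    params.n_ctx     = 4096;
    params.n_batch   = 2048;
    params.model     = "models/example-7b.gguf"; // hypothetical model path

    // sampling-related options are assumed to move into the nested sparams struct
    params.sparams.seed       = 1234;
    params.sparams.ignore_eos = true;

    return 0;
}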