	Merge branch 'master' into xsn/private_batch_api

@@ -1868,6 +1868,10 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
+            // force F16 KV cache for the draft model for extra performance
+            params_dft.cache_type_k = GGML_TYPE_F16;
+            params_dft.cache_type_v = GGML_TYPE_F16;
+
             llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model.get();
@@ -1888,10 +1892,6 @@ struct server_context {
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
-            // force F16 KV cache for the draft model for extra performance
-            cparams_dft.type_k = GGML_TYPE_F16;
-            cparams_dft.type_v = GGML_TYPE_F16;
-
             // the context is not needed - we will create one for each slot
             llama_init_dft.context.reset();
         }
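
Read together, the two hunks above move the forced-F16 KV cache from the draft context parameters (cparams_dft.type_k / type_v) to the draft model's load-time parameters (params_dft.cache_type_k / cache_type_v), so common_init_from_params() already applies it when loading the draft model, and the throwaway context it returns is released because each slot later creates its own draft context from cparams_dft. Below is a minimal sketch of the resulting flow, pulled out of the server class for readability; the function name and signature are illustrative and not part of the commit.

    // sketch only: mirrors the post-merge draft-model setup shown in the hunks above
    #include "common.h"   // common_params, common_init_result, common_init_from_params, ...
    #include "llama.h"

    static llama_model * init_draft_model(
            const common_params  & params_base,    // server's base parameters
            common_params         & params_dft,    // draft-model parameters (partially filled by the caller)
            llama_context_params  & cparams_dft,   // out: context params each slot will use
            common_init_result    & llama_init_dft) {
        params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
        params_dft.n_parallel   = 1;

        // force F16 KV cache for the draft model for extra performance
        params_dft.cache_type_k = GGML_TYPE_F16;
        params_dft.cache_type_v = GGML_TYPE_F16;

        llama_init_dft = common_init_from_params(params_dft);

        llama_model * model_dft = llama_init_dft.model.get();
        if (model_dft == nullptr) {
            return nullptr; // draft model failed to load
        }

        // context parameters that every slot will reuse for its own draft context
        // (the server additionally sets cparams_dft.n_batch = n_ctx_dft here)
        cparams_dft = common_context_params_to_llama(params_dft);

        // the context is not needed - we will create one for each slot
        llama_init_dft.context.reset();

        return model_dft;
    }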
@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(
     llama_params["chat_format"]      = static_cast<int>(chat_params.format);
     llama_params["prompt"]           = chat_params.prompt;
-    llama_params["grammar"]          = chat_params.grammar;
+    if (!chat_params.grammar.empty()) {
+        llama_params["grammar"] = chat_params.grammar;
+    }
     llama_params["grammar_lazy"]     = chat_params.grammar_lazy;
     auto grammar_triggers = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {
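
The last hunk makes the "grammar" field conditional: it is written into the JSON request parameters only when the chat-template handling actually produced a grammar, presumably so an empty string does not shadow a grammar supplied elsewhere in the request. A self-contained illustration of the same pattern with nlohmann::json follows; the values are made up and only the conditional assignment mirrors the diff.

    #include <iostream>
    #include <string>

    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    int main() {
        // stand-ins for chat_params.prompt / chat_params.grammar in the server code
        const std::string prompt  = "Hello, world";
        const std::string grammar = "";  // chat template produced no grammar

        json llama_params;

        llama_params["prompt"] = prompt;

        // only forward a grammar when one was actually generated;
        // an unconditional assignment would leave "grammar": "" in the parameters
        if (!grammar.empty()) {
            llama_params["grammar"] = grammar;
        }

        // prints {"prompt":"Hello, world"} - no "grammar" key is emitted
        std::cout << llama_params.dump() << std::endl;

        return 0;
    }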