Merge branch 'master' into xsn/private_batch_api
@@ -1868,6 +1868,10 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
+            // force F16 KV cache for the draft model for extra performance
+            params_dft.cache_type_k = GGML_TYPE_F16;
+            params_dft.cache_type_v = GGML_TYPE_F16;
+
             llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model.get();
@@ -1888,10 +1892,6 @@ struct server_context {
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
 
-            // force F16 KV cache for the draft model for extra performance
-            cparams_dft.type_k = GGML_TYPE_F16;
-            cparams_dft.type_v = GGML_TYPE_F16;
-
             // the context is not needed - we will create one for each slot
             llama_init_dft.context.reset();
         }
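Taken together, the two hunks move the F16 KV-cache override for the draft model from the derived llama_context_params (type_k/type_v) up to the common_params consumed by common_init_from_params (cache_type_k/cache_type_v). The override is therefore already in effect when the draft model is initialized, and any context params later derived via common_context_params_to_llama inherit it automatically. A minimal, self-contained sketch of that ordering follows; the struct and field names are borrowed from the diff above, but the types here are mocked-up stand-ins, not the real llama.cpp headers.

#include <cstdio>

// mocked-up stand-ins for the llama.cpp types named in the diff
enum ggml_type { GGML_TYPE_F32, GGML_TYPE_F16 };

struct common_params {
    ggml_type cache_type_k = GGML_TYPE_F32; // default KV cache types
    ggml_type cache_type_v = GGML_TYPE_F32;
};

struct llama_context_params {
    ggml_type type_k;
    ggml_type type_v;
};

// stand-in for common_context_params_to_llama(): context params are derived
// from common_params, so an override applied to common_params first is
// inherited by every context created afterwards
static llama_context_params to_llama_cparams(const common_params & p) {
    return { p.cache_type_k, p.cache_type_v };
}

int main() {
    common_params params_dft;

    // apply the override before any params-derived initialization,
    // mirroring the new placement in the first hunk
    params_dft.cache_type_k = GGML_TYPE_F16;
    params_dft.cache_type_v = GGML_TYPE_F16;

    // the second hunk can then drop its post-hoc patch: the derived
    // context params already carry F16
    llama_context_params cparams_dft = to_llama_cparams(params_dft);

    std::printf("draft KV cache is F16: %s\n",
                cparams_dft.type_k == GGML_TYPE_F16 &&
                cparams_dft.type_v == GGML_TYPE_F16 ? "yes" : "no");
    return 0;
}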
Xuan Son Nguyen