Mirror of https://github.com/ggml-org/llama.cpp.git — synced 2025-10-31 08:51:55 +00:00
			
		
		
		
	cont : fix speculative decoding initialization
This commit is contained in:
		| @@ -2385,6 +2385,10 @@ struct server_context { | ||||
|         llama_batch_free(batch); | ||||
|     } | ||||
|  | ||||
|     int32_t n_ctx_slot() const { | ||||
|         return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; | ||||
|     } | ||||
|  | ||||
|     bool load_model(const common_params & params) { | ||||
|         SRV_INF("loading model '%s'\n", params.model.path.c_str()); | ||||
|  | ||||
| @@ -2413,7 +2417,7 @@ struct server_context { | ||||
|  | ||||
|             params_dft.devices      = params_base.speculative.devices; | ||||
|             params_dft.model        = params_base.speculative.model; | ||||
|             params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx; | ||||
|             params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx; | ||||
|             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; | ||||
|             params_dft.n_parallel   = 1; | ||||
|             params_dft.cache_type_k = params_base.speculative.cache_type_k; | ||||
| @@ -2501,8 +2505,6 @@ struct server_context { | ||||
|     } | ||||
|  | ||||
|     void init() { | ||||
|         const int32_t n_ctx_slot = params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; | ||||
|  | ||||
|         SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); | ||||
|  | ||||
|         for (int i = 0; i < params_base.n_parallel; i++) { | ||||
| @@ -2510,7 +2512,7 @@ struct server_context { | ||||
|  | ||||
|             slot.id = i; | ||||
|             slot.ctx = ctx; | ||||
|             slot.n_ctx = n_ctx_slot; | ||||
|             slot.n_ctx = n_ctx_slot(); | ||||
|             slot.mctx = mctx; | ||||
|             slot.prompt.tokens.has_mtmd = mctx != nullptr; | ||||
|  | ||||
| @@ -2533,7 +2535,7 @@ struct server_context { | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); | ||||
|             SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); | ||||
|  | ||||
|             slot.callback_on_release = [this](int) { | ||||
|                 queue_tasks.pop_deferred_task(); | ||||
| @@ -3718,7 +3720,7 @@ struct server_context { | ||||
|                         slot.n_past = 0; | ||||
|                         slot.state = SLOT_STATE_PROCESSING_PROMPT; | ||||
|  | ||||
|                         SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", | ||||
|                         SLT_INF(slot, "new prompt, n_ctx = %d, n_keep = %d, n_prompt_tokens = %d\n", | ||||
|                                 slot.n_ctx, slot.task->params.n_keep, slot.n_prompt_tokens()); | ||||
|  | ||||
|                         // print prompt tokens (for debugging) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov