mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	cont : fix speculative decoding initialization
This commit is contained in:
		| @@ -2385,6 +2385,10 @@ struct server_context { | |||||||
|         llama_batch_free(batch); |         llama_batch_free(batch); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     int32_t n_ctx_slot() const { | ||||||
|  |         return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     bool load_model(const common_params & params) { |     bool load_model(const common_params & params) { | ||||||
|         SRV_INF("loading model '%s'\n", params.model.path.c_str()); |         SRV_INF("loading model '%s'\n", params.model.path.c_str()); | ||||||
|  |  | ||||||
| @@ -2413,7 +2417,7 @@ struct server_context { | |||||||
|  |  | ||||||
|             params_dft.devices      = params_base.speculative.devices; |             params_dft.devices      = params_base.speculative.devices; | ||||||
|             params_dft.model        = params_base.speculative.model; |             params_dft.model        = params_base.speculative.model; | ||||||
|             params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx; |             params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx; | ||||||
|             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; |             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; | ||||||
|             params_dft.n_parallel   = 1; |             params_dft.n_parallel   = 1; | ||||||
|             params_dft.cache_type_k = params_base.speculative.cache_type_k; |             params_dft.cache_type_k = params_base.speculative.cache_type_k; | ||||||
| @@ -2501,8 +2505,6 @@ struct server_context { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     void init() { |     void init() { | ||||||
|         const int32_t n_ctx_slot = params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; |  | ||||||
|  |  | ||||||
|         SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); |         SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); | ||||||
|  |  | ||||||
|         for (int i = 0; i < params_base.n_parallel; i++) { |         for (int i = 0; i < params_base.n_parallel; i++) { | ||||||
| @@ -2510,7 +2512,7 @@ struct server_context { | |||||||
|  |  | ||||||
|             slot.id = i; |             slot.id = i; | ||||||
|             slot.ctx = ctx; |             slot.ctx = ctx; | ||||||
|             slot.n_ctx = n_ctx_slot; |             slot.n_ctx = n_ctx_slot(); | ||||||
|             slot.mctx = mctx; |             slot.mctx = mctx; | ||||||
|             slot.prompt.tokens.has_mtmd = mctx != nullptr; |             slot.prompt.tokens.has_mtmd = mctx != nullptr; | ||||||
|  |  | ||||||
| @@ -2533,7 +2535,7 @@ struct server_context { | |||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); |             SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); | ||||||
|  |  | ||||||
|             slot.callback_on_release = [this](int) { |             slot.callback_on_release = [this](int) { | ||||||
|                 queue_tasks.pop_deferred_task(); |                 queue_tasks.pop_deferred_task(); | ||||||
| @@ -3718,7 +3720,7 @@ struct server_context { | |||||||
|                         slot.n_past = 0; |                         slot.n_past = 0; | ||||||
|                         slot.state = SLOT_STATE_PROCESSING_PROMPT; |                         slot.state = SLOT_STATE_PROCESSING_PROMPT; | ||||||
|  |  | ||||||
|                         SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", |                         SLT_INF(slot, "new prompt, n_ctx = %d, n_keep = %d, n_prompt_tokens = %d\n", | ||||||
|                                 slot.n_ctx, slot.task->params.n_keep, slot.n_prompt_tokens()); |                                 slot.n_ctx, slot.task->params.n_keep, slot.n_prompt_tokens()); | ||||||
|  |  | ||||||
|                         // print prompt tokens (for debugging) |                         // print prompt tokens (for debugging) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov