Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
cont : fix speculative decoding initialization
@@ -2385,6 +2385,10 @@ struct server_context {
         llama_batch_free(batch);
     }
 
+    int32_t n_ctx_slot() const {
+        return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel;
+    }
+
     bool load_model(const common_params & params) {
         SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
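For context, the new helper centralizes the per-slot context arithmetic that previously lived as a local inside init(). The standalone sketch below is not server code; the free function name and the example values are illustrative assumptions. It only demonstrates the formula from the hunk above: with a unified KV cache every slot sees the full n_ctx, otherwise the context is split evenly across the parallel slots.

// Minimal standalone sketch of the n_ctx_slot() arithmetic.
// The function name and the values below are illustrative only.
#include <cstdint>
#include <cstdio>

static int32_t per_slot_ctx(bool kv_unified, int32_t n_ctx, int32_t n_parallel) {
    // unified KV cache: all slots share the full context window
    // split KV cache:   the context is divided evenly across the slots
    return kv_unified ? n_ctx : n_ctx / n_parallel;
}

int main() {
    printf("split:   %d\n", (int) per_slot_ctx(false, 8192, 4)); // 2048 tokens per slot
    printf("unified: %d\n", (int) per_slot_ctx(true,  8192, 4)); // 8192 tokens per slot
    return 0;
}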
@@ -2413,7 +2417,7 @@ struct server_context {
 
             params_dft.devices = params_base.speculative.devices;
             params_dft.model = params_base.speculative.model;
-            params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx;
+            params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;
             params_dft.cache_type_k = params_base.speculative.cache_type_k;
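Note on the hunk above: the draft-model context previously came from slots.front().n_ctx, but the slots are only populated later in init() (see the hunks below), so reading slots.front() inside load_model() is presumably the initialization problem the commit title refers to. n_ctx_slot() derives the same value directly from params_base and n_ctx without touching the slots.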
@@ -2501,8 +2505,6 @@ struct server_context {
     }
 
     void init() {
-        const int32_t n_ctx_slot = params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel;
-
         SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
 
         for (int i = 0; i < params_base.n_parallel; i++) {
@@ -2510,7 +2512,7 @@ struct server_context {
 
             slot.id = i;
             slot.ctx = ctx;
-            slot.n_ctx = n_ctx_slot;
+            slot.n_ctx = n_ctx_slot();
             slot.mctx = mctx;
             slot.prompt.tokens.has_mtmd = mctx != nullptr;
 
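With the helper in place, the duplicated local n_ctx_slot inside init() is dropped and each slot's n_ctx is taken from the same n_ctx_slot() member, so the slot setup here and the draft-model setup above stay in agreement.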
@@ -2533,7 +2535,7 @@ struct server_context {
                 }
             }
 
-            SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
+            SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
 
             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -3718,7 +3720,7 @@ struct server_context {
                     slot.n_past = 0;
                     slot.state = SLOT_STATE_PROCESSING_PROMPT;
 
-                    SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n",
+                    SLT_INF(slot, "new prompt, n_ctx = %d, n_keep = %d, n_prompt_tokens = %d\n",
                             slot.n_ctx, slot.task->params.n_keep, slot.n_prompt_tokens());
 
                     // print prompt tokens (for debugging)
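The two log messages are updated to match: since the n_ctx_slot local no longer exists, the slot and prompt logs now report the value under the name n_ctx, which is the slot.n_ctx field they print.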