server : implement universal assisted decoding (#12635)

* llama-server : implement universal assisted decoding

* Erase prompt tail for kv-cache

* set vocab_dft_compatible in common_speculative

* rename ctx_main to ctx_tgt

* move vocab_dft_compatible to spec struct

* clear mem_dft, remove mem

* detokenize id_last for incompatible models

* update comment

* add --spec-replace flag

* accept special tokens when translating between draft/main models

* Escape spec-replace

* clamp draft result to size to params.n_draft

* fix comment

* clean up code

* restore old example

* log common_speculative_are_compatible in speculative example

* fix

* Update common/speculative.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update common/speculative.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update common/speculative.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
g2mt
2025-07-31 05:25:23 -07:00
committed by GitHub
parent c1dacaa99b
commit 94933c8c2e
6 changed files with 168 additions and 62 deletions

View File

@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
string_process_escapes(seq_breaker);
}
for (auto & pair : params.speculative.replacements) {
string_process_escapes(pair.first);
string_process_escapes(pair.second);
}
}
if (!params.kv_overrides.empty()) {
@@ -3249,6 +3253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"--spec-replace"}, "TARGET", "DRAFT",
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
[](common_params & params, const std::string & tgt, const std::string & dft) {
params.speculative.replacements.push_back({ tgt, dft });
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(