sampling : optimize samplers by reusing bucket sort (#15665)

* sampling : optimize sorting using bucket sort in more places

ggml-ci
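
(For illustration only: the sketch below shows the general bucket-sort idea being reused here — distribute candidates into probability buckets and sort only the highest buckets until the top k entries are collected, instead of fully sorting the whole vocabulary. The token_prob struct and top_k_bucket helper are invented names for this example, not llama.cpp code.)

#include <algorithm>
#include <cstdio>
#include <vector>

struct token_prob {
    int   id;
    float p; // probability in [0, 1]
};

// Collect the k most probable candidates in descending order without a full sort:
// distribute into buckets keyed by probability, then sort only the highest buckets
// until k elements have been gathered.
static std::vector<token_prob> top_k_bucket(const std::vector<token_prob> & cand, size_t k) {
    constexpr int n_buckets = 128;
    std::vector<std::vector<token_prob>> buckets(n_buckets);

    for (const auto & c : cand) {
        const int b = std::min(n_buckets - 1, (int) (c.p * n_buckets));
        buckets[b].push_back(c);
    }

    std::vector<token_prob> out;
    for (int b = n_buckets - 1; b >= 0 && out.size() < k; --b) {
        std::sort(buckets[b].begin(), buckets[b].end(),
                  [](const token_prob & lhs, const token_prob & rhs) { return lhs.p > rhs.p; });
        for (const auto & c : buckets[b]) {
            if (out.size() == k) {
                break;
            }
            out.push_back(c);
        }
    }
    return out;
}

int main() {
    const std::vector<token_prob> cand = {{0, 0.05f}, {1, 0.40f}, {2, 0.10f}, {3, 0.30f}, {4, 0.15f}};
    for (const auto & c : top_k_bucket(cand, 3)) {
        printf("token %d  p = %.2f\n", c.id, c.p);
    }
    return 0;
}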

* sampling : do not sort in dist sampler

ggml-ci
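
(Why the sort can be skipped here: the dist sampler draws from the categorical distribution over all candidates, and drawing via a cumulative-probability scan works just as well on unsorted data. The sketch below illustrates that, assuming normalized probabilities; token_prob and sample_dist are made-up names, not the actual sampler code.)

#include <cstdio>
#include <random>
#include <vector>

struct token_prob {
    int   id;
    float p; // assumed normalized so the probabilities sum to 1
};

// Draw a token by scanning the cumulative probability; no ordering of the
// candidates is required.
static int sample_dist(const std::vector<token_prob> & cand, std::mt19937 & rng) {
    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
    const float r = dist(rng);

    float cum = 0.0f;
    for (const auto & c : cand) {
        cum += c.p;
        if (r < cum) {
            return c.id;
        }
    }
    return cand.back().id; // guard against floating-point rounding
}

int main() {
    std::mt19937 rng(42);
    const std::vector<token_prob> cand = {{7, 0.2f}, {3, 0.5f}, {9, 0.3f}}; // deliberately unsorted
    printf("sampled token: %d\n", sample_dist(cand, rng));
    return 0;
}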

* sampling : avoid heap allocations for sort buffers

ggml-ci
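
(The sketch below only illustrates the general technique of keeping a persistent scratch buffer so the partial sort does not allocate on every sampling step; sampler_state and partial_sort_reuse are invented for the example, and a later commit in this PR reverts the change for preserving sort buffers, so treat this as background rather than the final code.)

#include <algorithm>
#include <vector>

struct token_prob {
    int   id;
    float p;
};

struct sampler_state {
    std::vector<token_prob> sort_buf; // grows to vocabulary size once, then gets reused
};

// Copy candidates into the reusable buffer and partially sort the top k;
// assign() reuses the existing capacity, so steady-state calls do not allocate.
static void partial_sort_reuse(sampler_state & st, const std::vector<token_prob> & cand, size_t k) {
    st.sort_buf.assign(cand.begin(), cand.end());
    k = std::min(k, st.sort_buf.size());
    std::partial_sort(st.sort_buf.begin(), st.sort_buf.begin() + k, st.sort_buf.end(),
                      [](const token_prob & a, const token_prob & b) { return a.p > b.p; });
}

int main() {
    sampler_state st;
    partial_sort_reuse(st, {{0, 0.1f}, {1, 0.6f}, {2, 0.3f}}, 2);
    return 0;
}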

* common : add option to sort sampling candidates by probability

ggml-ci
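
(Sketch of what such an option can look like from the caller's side: the server diff below passes a second argument, common_sampler_get_candidates(slot.smpl, true), i.e. the caller explicitly asks for candidates sorted by probability. The types and the parameter name do_sort below are simplified stand-ins, not the actual common/sampling declarations.)

#include <algorithm>
#include <cstddef>

// Simplified stand-ins for the candidate types (not the real llama.cpp structs).
struct token_data {
    int   id;
    float logit;
    float p;
};

struct token_data_array {
    token_data * data;
    size_t       size;
    bool         sorted;
};

// Hypothetical getter: sort the candidates by probability only when the caller
// requests it and they are not already sorted, so callers that do not need an
// ordering avoid the extra work.
const token_data_array * get_candidates(token_data_array & cur_p, bool do_sort) {
    if (do_sort && !cur_p.sorted) {
        std::sort(cur_p.data, cur_p.data + cur_p.size,
                  [](const token_data & a, const token_data & b) { return a.p > b.p; });
        cur_p.sorted = true;
    }
    return &cur_p;
}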

* sampling : revert the change for preserving sort buffers

* sampling : use std::copy instead of memcpy

* sampling : clarify purpose of partial sort helpers

ggml-ci

* cont : remove wrong comment [no ci]

* common : update comment

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Author:     Georgi Gerganov
Date:       2025-08-31 20:41:02 +03:00
Committed:  GitHub
Parent:     0d161f021a
Commit:     e92d53b29e

9 changed files with 227 additions and 171 deletions


@@ -2550,11 +2550,12 @@ struct server_context {
         return slot.has_next_token; // continue
     }
 
-    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
+    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
         size_t n_probs = slot.params.sampling.n_probs;
         size_t n_vocab = llama_vocab_n_tokens(vocab);
 
         if (post_sampling) {
-            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+            const auto * cur_p = common_sampler_get_candidates(slot.smpl, true);
             const size_t max_probs = cur_p->size;
 
             // set probability for sampled token
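
The two changes visible in this hunk line up with the commit message above: populate_token_probs() is marked const, and the post-sampling path now passes true as the new second argument of common_sampler_get_candidates(), explicitly requesting candidates sorted by probability — presumably because, after this PR, the sampler no longer guarantees a sorted order by default (the dist sampler skips the sort).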