mirror of https://github.com/ggml-org/llama.cpp.git
llama : improve llama_batch API + simplify parallel example
@@ -134,7 +134,7 @@ int main(int argc, char ** argv) {
 
     while (true) {
         // sample from the target model
-        const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+        llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
 
         // remember which tokens were sampled - used for repetition penalties during sampling
         last_tokens.erase(last_tokens.begin());
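The visible change in this hunk only drops the const qualifier from the sampled token, so the example is free to modify id after sampling. The commit title also refers to the llama_batch API; as context, here is a minimal sketch of the batch lifecycle that API supports. It is not code from this commit: it is based on the llama_batch_init / llama_decode / llama_batch_free functions and llama_batch field names found in current llama.h, so treat the exact layout and signatures as assumptions to verify against your version of the header.

    // Sketch: decode a prompt through a single sequence using the llama_batch API.
    // Assumes the current llama.h layout (token, pos, n_seq_id, seq_id, logits);
    // older revisions of the struct differ.
    #include "llama.h"

    static void decode_prompt(llama_context * ctx, const llama_token * tokens, int n_tokens) {
        // allocate a batch large enough for the whole prompt, with room for 1 seq id per token
        llama_batch batch = llama_batch_init(n_tokens, /*embd =*/ 0, /*n_seq_max =*/ 1);

        for (int i = 0; i < n_tokens; ++i) {
            batch.token   [batch.n_tokens]    = tokens[i];
            batch.pos     [batch.n_tokens]    = i;
            batch.n_seq_id[batch.n_tokens]    = 1;
            batch.seq_id  [batch.n_tokens][0] = 0;                  // everything goes to sequence 0
            batch.logits  [batch.n_tokens]    = (i == n_tokens - 1); // request logits for the last token only
            batch.n_tokens++;
        }

        if (llama_decode(ctx, batch) != 0) {
            // decode failed - handle the error in real code
        }

        llama_batch_free(batch);
    }

The n_seq_max argument reserves space for the per-token seq_id arrays; the parallel example this commit simplifies relies on exactly that mechanism, tagging each client's tokens with a different sequence id so several generations can share one decode call.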