parallel : fix cases where the input prompts can overflow the batch

2025-10-31 08:51:55 +00:00 · 2023-09-20 19:09:25 +03:00
parent ee1d670cc6
commit ded9b43cad
1 changed files with 3 additions and 1 deletions
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -127,7 +127,9 @@ int main(int argc, char ** argv) {

    llama_seq_id g_seq_id = 0;

-    llama_batch batch = llama_batch_init(params.n_batch, 0);
+    // the max batch size is as large as the context to handle cases where we get very long input prompts from multiple
+    // users. regardless of its size, the main loop will chunk the batch into at most params.n_batch tokens at a time
+    llama_batch batch = llama_batch_init(params.n_ctx, 0);

    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;