Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-01 09:01:57 +00:00)
parallel : fix cases where the input prompts can overflow the batch
@@ -127,7 +127,9 @@ int main(int argc, char ** argv) {
 
     llama_seq_id g_seq_id = 0;
 
-    llama_batch batch = llama_batch_init(params.n_batch, 0);
+    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
+    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
+    llama_batch batch = llama_batch_init(params.n_ctx, 0);
 
     int32_t n_total_prompt = 0;
     int32_t n_total_gen = 0;
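
The added comment describes the idea behind the fix: the batch buffer is now allocated with room for n_ctx tokens, so very long prompts from multiple clients cannot overflow it, while the main loop still decodes at most params.n_batch tokens per call. Below is a minimal, self-contained C++ sketch of that chunking loop; the sizes and the printf stand in for the real llama_decode call and are illustrative assumptions, not the code from examples/parallel.

    // sketch: process a large queued batch in chunks of at most n_batch tokens
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t n_ctx    = 4096; // assumed context size; the batch buffer is sized to this
        const int32_t n_batch  = 512;  // assumed per-decode limit (params.n_batch)
        const int32_t n_tokens = 1500; // tokens currently queued in the n_ctx-sized batch

        for (int32_t i = 0; i < n_tokens; i += n_batch) {
            const int32_t n_cur = std::min(n_batch, n_tokens - i);
            // the real code would decode a view of batch[i .. i + n_cur) here
            printf("decode chunk: offset = %d, size = %d\n", i, n_cur);
        }
        return 0;
    }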