mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	parallel : increase the variability of the prompt lengths (#13927)
ggml-ci
This commit is contained in:
@@ -4,7 +4,7 @@ Simplified simulation of serving incoming requests in parallel
 
 ## Example
 
-Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question.
+Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of up to 10 junk questions (`--junk 10`) followed by the actual question.
 
 ```bash
 llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384
```
@@ -315,7 +315,10 @@ int main(int argc, char ** argv) {
                     } else {
                         client.prompt += k_system;
                     }
-                    for (int i = 0; i < n_junk; ++i) {
+                    const int n_junk_cur = rand() % n_junk;
+
+                    for (int i = 0; i < n_junk_cur; ++i) {
                         const int r = rand() % k_questions.size();
                         client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
                     }
@@ -340,7 +343,7 @@ int main(int argc, char ** argv) {
                     client.n_decoded = 0;
                     client.i_batch   = batch.n_tokens - 1;
 
-                    LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                    LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur);
 
                     g_seq_id += 1;
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov