	llama : add new llama_decode() API that works with llama_batch
@@ -891,7 +891,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
     int n_processed = 0;
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
-        llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0), n_threads);
         n_processed += n_tokens;
     }
 }
@@ -899,7 +899,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
     llama_token token = llama_token_bos(ctx);
     for (int i = 0; i < n_gen; i++) {
-        llama_eval(ctx, &token, 1, n_past + i, n_threads);
+        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0), n_threads);
     }
 }
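The hunks above show the migration pattern introduced by this commit: each llama_eval() call becomes a llama_decode() call on a llama_batch built with llama_batch_get_one(). A minimal sketch of that pattern, assuming the llama.h declarations at this point in history (llama_decode still takes an explicit n_threads argument); the helper name decode_tokens is hypothetical and not part of the library:

#include "llama.h"

// Minimal sketch of the new call pattern shown in the diff above.
// decode_tokens is a hypothetical helper, not a library function.
static int decode_tokens(llama_context * ctx, llama_token * tokens, int n_tokens,
                         int n_past, int n_threads) {
    // llama_batch_get_one() wraps a contiguous token array in a llama_batch:
    // a single sequence (seq_id 0) whose first token is placed at position n_past.
    // llama_decode() then evaluates that batch, replacing the old llama_eval() call.
    return llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads);
}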