	context : round n_tokens to next multiple of n_seqs when reserving (#14140)
This fixes RWKV inference, which otherwise failed when the worst-case ubatch.n_seq_tokens rounded down to 0.
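A minimal standalone sketch (not part of the patch) of the rounding change: the old expression rounds n_tokens down to a multiple of n_seqs, which collapses to 0 when n_tokens < n_seqs; the new expression rounds up, so the reserved ubatch always has at least one token per sequence. Example values (n_tokens = 3, n_seqs = 4) are chosen for illustration only.

    // illustration of the old round-down vs. the new round-up behaviour
    #include <cstdint>
    #include <cstdio>

    static uint32_t round_down(uint32_t n_tokens, uint32_t n_seqs) {
        return (n_tokens / n_seqs) * n_seqs;                  // old: truncates toward 0
    }

    static uint32_t round_up(uint32_t n_tokens, uint32_t n_seqs) {
        return ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // new: next multiple of n_seqs
    }

    int main() {
        const uint32_t n_tokens = 3, n_seqs = 4;
        // prints "down: 0, up: 4"; with round-down, n_seq_tokens = n_tokens / n_seqs would be 0
        printf("down: %u, up: %u\n",
               (unsigned) round_down(n_tokens, n_seqs),
               (unsigned) round_up(n_tokens, n_seqs));
        return 0;
    }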
@@ -1332,7 +1332,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
 
     if (n_tokens % n_seqs != 0) {
-        n_tokens = (n_tokens / n_seqs) * n_seqs;
+        n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
         n_outputs = std::min(n_outputs, n_tokens);
 
         LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
compilade