mirror of https://github.com/ggml-org/llama.cpp.git
	llama : fix BERT inference without KV cache
@@ -3105,6 +3105,10 @@ static bool llama_cache_init(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if (!has_kv && !has_rs) {
+                // no buffer was needed, so this is fine
+                return true;
+            }
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
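The rationale behind the hunk: encoder-only models such as BERT create neither KV-cache nor recurrent-state tensors, so the ggml context passed to ggml_backend_alloc_ctx_tensors_from_buft has nothing to allocate and the call can return a null buffer. Previously that null buffer was unconditionally treated as an allocation failure, which broke BERT inference; the added check turns it into a successful no-op when neither cache type is present. Below is a minimal standalone sketch of that pattern, not the actual llama.cpp code: fake_buffer, alloc_cache_buffer and cache_init are hypothetical stand-ins for the real ggml/llama types and functions.

#include <cstdio>

// Hypothetical stand-in for a ggml backend buffer.
struct fake_buffer { int size; };

// Stand-in for ggml_backend_alloc_ctx_tensors_from_buft: returns nullptr when
// the context holds no tensors, i.e. there is nothing to allocate.
static fake_buffer * alloc_cache_buffer(int n_cache_tensors) {
    if (n_cache_tensors == 0) {
        return nullptr;
    }
    static fake_buffer buf;
    buf.size = n_cache_tensors * 1024;
    return &buf;
}

static bool cache_init(bool has_kv, bool has_rs) {
    const int n_cache_tensors = (has_kv ? 1 : 0) + (has_rs ? 1 : 0);

    fake_buffer * buf = alloc_cache_buffer(n_cache_tensors);
    if (!buf) {
        if (!has_kv && !has_rs) {
            // no buffer was needed, so this is fine (the BERT case)
            return true;
        }
        // a null buffer is only an error when cache tensors were requested
        fprintf(stderr, "%s: failed to allocate buffer for kv cache\n", __func__);
        return false;
    }
    return true;
}

int main() {
    // Encoder-only (BERT-like): no KV cache, no recurrent state -> still succeeds.
    printf("BERT-like init: %s\n", cache_init(false, false) ? "ok" : "failed");
    // Decoder with a KV cache: a buffer is allocated as before.
    printf("decoder init:   %s\n", cache_init(true,  false) ? "ok" : "failed");
    return 0;
}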
Francis Couture-Harpin