llama : fix BERT inference without KV cache
@@ -3105,6 +3105,10 @@ static bool llama_cache_init(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
+            if (!has_kv && !has_rs) {
+                // no buffer was needed, so this is fine
+                return true;
+            }
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
             return false;
         }
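The added early-out covers models that create neither a self-attention KV cache nor a recurrent-state cache (a BERT-style encoder being the case named in the commit title): when no cache tensors were placed in the context, the backend allocation presumably comes back empty, and that must not be reported as a failure. The sketch below is a minimal, self-contained C++ illustration of that error-handling pattern under those assumptions; buffer, alloc_cache_buffer, and cache_init are hypothetical stand-ins, not llama.cpp's real API, and only the shape of the if (!buf) check mirrors the patch.

#include <cstdio>

// Hypothetical stand-in for a backend buffer (not a ggml/llama.cpp type).
struct buffer {
    size_t size;
};

// Returns nullptr when there is nothing to allocate, mimicking the assumed
// behavior of the real allocator when the context holds no cache tensors.
static buffer * alloc_cache_buffer(size_t n_tensors) {
    if (n_tensors == 0) {
        return nullptr; // nothing to allocate
    }
    return new buffer{n_tensors * 1024};
}

// Illustrates the patched check: a null buffer is only an error when a
// KV cache or a recurrent-state cache was actually requested.
static bool cache_init(bool has_kv, bool has_rs, size_t n_tensors) {
    buffer * buf = alloc_cache_buffer(n_tensors);
    if (!buf) {
        if (!has_kv && !has_rs) {
            // no buffer was needed (e.g. a BERT-style encoder), so this is fine
            return true;
        }
        std::fprintf(stderr, "%s: failed to allocate buffer for kv cache\n", __func__);
        return false;
    }
    // ... initialize the buffer here ...
    delete buf;
    return true;
}

int main() {
    // Encoder-only model: no KV cache, no recurrent state -> still succeeds.
    std::printf("no cache: %s\n", cache_init(false, false, 0) ? "ok" : "fail");
    // Decoder with a KV cache whose allocation came back empty -> error.
    std::printf("kv cache: %s\n", cache_init(true, false, 0) ? "ok" : "fail");
    return 0;
}

Running this prints "ok" for the cache-less configuration and "fail" for the failed KV-cache allocation, which is exactly the distinction the patch introduces before the existing error path.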