Merge branch 'master' into compilade/refactor-kv-cache

2025-11-03 09:22:01 +00:00 · 2024-05-24 19:35:16 -04:00
parent cbc743e600 d041d2ceaa
commit 0fd13e9473
37 changed files with 1970 additions and 5711 deletions
--- a/llama.h
+++ b/llama.h
@@ -809,6 +809,12 @@ extern "C" {
    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple tokens).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
    // Set whether to use causal attention or not
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);