llama : use equal-sequence-length sub-batches for recurrent models

* ggml : simplify SSM-related operators

* llama : make recurrent state slot allocation contiguous

* llama : adapt internal uses of batches to llama_ubatch
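
A rough sketch of the splitting idea behind "equal-sequence-length sub-batches" follows. The toy_token/toy_ubatch types and the split_equal helper below are illustrative stand-ins, not the actual llama_ubatch structures or splitting code in llama.cpp: the batch is grouped by sequence, and each sub-batch takes the same number of tokens from every sequence it still covers, so a recurrent model can step all covered states forward in lockstep.

// Illustrative only: simplified stand-ins, not the real llama_batch / llama_ubatch.
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

struct toy_token { int32_t id; int32_t seq_id; };

// One sub-batch: every sequence it covers contributes exactly n_seq_tokens tokens.
struct toy_ubatch {
    int32_t n_seq_tokens = 0;
    std::vector<std::vector<int32_t>> seq_tokens; // one row of token ids per sequence
};

// Greedy split: group tokens by sequence, then repeatedly peel off
// min(remaining length) tokens from every non-empty sequence.
static std::vector<toy_ubatch> split_equal(const std::vector<toy_token> & batch) {
    std::map<int32_t, std::vector<int32_t>> per_seq;
    for (const auto & t : batch) {
        per_seq[t.seq_id].push_back(t.id);
    }
    std::vector<toy_ubatch> ubatches;
    std::map<int32_t, size_t> offset; // how many tokens of each sequence are consumed
    for (;;) {
        // find the shortest remaining run among non-empty sequences
        size_t step = SIZE_MAX;
        for (const auto & [sid, toks] : per_seq) {
            size_t rem = toks.size() - offset[sid];
            if (rem > 0 && rem < step) { step = rem; }
        }
        if (step == SIZE_MAX) { break; } // all sequences consumed
        toy_ubatch ub;
        ub.n_seq_tokens = (int32_t) step;
        for (auto & [sid, toks] : per_seq) {
            size_t & off = offset[sid];
            if (off < toks.size()) {
                ub.seq_tokens.emplace_back(toks.begin() + off, toks.begin() + off + step);
                off += step;
            }
        }
        ubatches.push_back(std::move(ub));
    }
    return ubatches;
}

int main() {
    // sequence 0 has 4 tokens, sequence 1 has 2: expect one 2-token ubatch covering
    // both sequences, then one 2-token ubatch covering only sequence 0
    std::vector<toy_token> batch = {
        {10, 0}, {11, 0}, {12, 0}, {13, 0},
        {20, 1}, {21, 1},
    };
    for (const auto & ub : split_equal(batch)) {
        std::printf("ubatch: %d token(s) per sequence, %zu sequence(s)\n",
                    (int) ub.n_seq_tokens, ub.seq_tokens.size());
    }
}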
Author: Francis Couture-Harpin
Date: 2024-06-01 11:37:14 -04:00
parent 4e4c41e553
commit 3587a94987
3 changed files with 708 additions and 512 deletions

ggml.h (6 lines changed)

@@ -1793,8 +1793,7 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * s,
             struct ggml_tensor  * x,
-            struct ggml_tensor  * c,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * c);

     GGML_API struct ggml_tensor * ggml_ssm_scan(
             struct ggml_context * ctx,
@@ -1803,8 +1802,7 @@ extern "C" {
             struct ggml_tensor  * dt,
             struct ggml_tensor  * A,
             struct ggml_tensor  * B,
-            struct ggml_tensor  * C,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * C);

     // partition into non-overlapping windows with padding if needed
     // example:
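
For context, a hypothetical call-site sketch of the simplified operators follows. Only the argument lists are taken from the header change above; build_ssm_block, the tensor names, and the direct chaining of the two calls are placeholders (in the real Mamba graph there are activations and projections between them), so treat this as an illustration of the dropped `sq` argument rather than actual llama.cpp code.

#include "ggml.h"

// Hypothetical helper: the parameters are assumed to be pre-built tensors with
// whatever shapes the operators expect; shapes are not shown here.
static struct ggml_tensor * build_ssm_block(
        struct ggml_context * ctx,
        struct ggml_tensor  * conv_state, // rolling convolution state
        struct ggml_tensor  * ssm_state,  // recurrent SSM state
        struct ggml_tensor  * x,          // current tokens' activations
        struct ggml_tensor  * conv_w,     // convolution weights (the `c` argument)
        struct ggml_tensor  * dt,
        struct ggml_tensor  * A,
        struct ggml_tensor  * B,
        struct ggml_tensor  * C) {
    // both calls previously took a trailing per-token sequence-id tensor `sq`;
    // with equal-sequence-length sub-batches the sequence layout is implicit,
    // so that argument is gone from the signatures shown above
    struct ggml_tensor * xc = ggml_ssm_conv(ctx, conv_state, x, conv_w);
    // chained directly only to keep the sketch short
    return ggml_ssm_scan(ctx, ssm_state, xc, dt, A, B, C);
}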