llama : use equal-sequence-length sub-batches for recurrent models

* ggml : simplify SSM-related operators

* llama : make recurrent state slot allocation contiguous

* llama : adapt internal uses of batches to llama_ubatch
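
A rough sketch of the splitting idea behind "equal-sequence-length sub-batches" follows. The toy_token/toy_ubatch types and the split_equal helper below are illustrative stand-ins, not the actual llama_ubatch structures or splitting code in llama.cpp: the batch is grouped by sequence, and each sub-batch takes the same number of tokens from every sequence it still covers, so a recurrent model can step all covered states forward in lockstep.

// Illustrative only: simplified stand-ins, not the real llama_batch / llama_ubatch.
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

struct toy_token { int32_t id; int32_t seq_id; };

// One sub-batch: every sequence it covers contributes exactly n_seq_tokens tokens.
struct toy_ubatch {
    int32_t n_seq_tokens = 0;
    std::vector<std::vector<int32_t>> seq_tokens; // one row of token ids per sequence
};

// Greedy split: group tokens by sequence, then repeatedly peel off
// min(remaining length) tokens from every non-empty sequence.
static std::vector<toy_ubatch> split_equal(const std::vector<toy_token> & batch) {
    std::map<int32_t, std::vector<int32_t>> per_seq;
    for (const auto & t : batch) {
        per_seq[t.seq_id].push_back(t.id);
    }
    std::vector<toy_ubatch> ubatches;
    std::map<int32_t, size_t> offset; // how many tokens of each sequence are consumed
    for (;;) {
        // find the shortest remaining run among non-empty sequences
        size_t step = SIZE_MAX;
        for (const auto & [sid, toks] : per_seq) {
            size_t rem = toks.size() - offset[sid];
            if (rem > 0 && rem < step) { step = rem; }
        }
        if (step == SIZE_MAX) { break; } // all sequences consumed
        toy_ubatch ub;
        ub.n_seq_tokens = (int32_t) step;
        for (auto & [sid, toks] : per_seq) {
            size_t & off = offset[sid];
            if (off < toks.size()) {
                ub.seq_tokens.emplace_back(toks.begin() + off, toks.begin() + off + step);
                off += step;
            }
        }
        ubatches.push_back(std::move(ub));
    }
    return ubatches;
}

int main() {
    // sequence 0 has 4 tokens, sequence 1 has 2: expect one 2-token ubatch covering
    // both sequences, then one 2-token ubatch covering only sequence 0
    std::vector<toy_token> batch = {
        {10, 0}, {11, 0}, {12, 0}, {13, 0},
        {20, 1}, {21, 1},
    };
    for (const auto & ub : split_equal(batch)) {
        std::printf("ubatch: %d token(s) per sequence, %zu sequence(s)\n",
                    (int) ub.n_seq_tokens, ub.seq_tokens.size());
    }
}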
Author: Francis Couture-Harpin
Date: 2024-06-01 11:37:14 -04:00
parent 4e4c41e553
commit 3587a94987
3 changed files with 708 additions and 512 deletions

ggml.h (6 lines changed)

@@ -1793,8 +1793,7 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * s,
             struct ggml_tensor  * x,
-            struct ggml_tensor  * c,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * c);

     GGML_API struct ggml_tensor * ggml_ssm_scan(
             struct ggml_context * ctx,
@@ -1803,8 +1802,7 @@ extern "C" {
             struct ggml_tensor  * dt,
             struct ggml_tensor  * A,
             struct ggml_tensor  * B,
-            struct ggml_tensor  * C,
-            struct ggml_tensor  * sq);
+            struct ggml_tensor  * C);

     // partition into non-overlapping windows with padding if needed
     // example:
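
For context, a hypothetical call-site sketch of the simplified operators follows. Only the argument lists are taken from the header change above; build_ssm_block, the tensor names, and the direct chaining of the two calls are placeholders (in the real Mamba graph there are activations and projections between them), so treat this as an illustration of the dropped `sq` argument rather than actual llama.cpp code.

#include "ggml.h"

// Hypothetical helper: the parameters are assumed to be pre-built tensors with
// whatever shapes the operators expect; shapes are not shown here.
static struct ggml_tensor * build_ssm_block(
        struct ggml_context * ctx,
        struct ggml_tensor  * conv_state, // rolling convolution state
        struct ggml_tensor  * ssm_state,  // recurrent SSM state
        struct ggml_tensor  * x,          // current tokens' activations
        struct ggml_tensor  * conv_w,     // convolution weights (the `c` argument)
        struct ggml_tensor  * dt,
        struct ggml_tensor  * A,
        struct ggml_tensor  * B,
        struct ggml_tensor  * C) {
    // both calls previously took a trailing per-token sequence-id tensor `sq`;
    // with equal-sequence-length sub-batches the sequence layout is implicit,
    // so that argument is gone from the signatures shown above
    struct ggml_tensor * xc = ggml_ssm_conv(ctx, conv_state, x, conv_w);
    // chained directly only to keep the sketch short
    return ggml_ssm_scan(ctx, ssm_state, xc, dt, A, B, C);
}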