graph : remove worst_case from the API

ggml-ci
Georgi Gerganov
2025-02-21 12:10:57 +02:00
parent 2645a7d9a9
commit 548c230dff
7 changed files with 958 additions and 967 deletions
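In short: the bool worst_case parameter is dropped from every llama_graph_i build method and from all of its call sites, and the worst-case graph reservation now happens once, inside llama_context::init(), via a new protected reserve() hook that the derived contexts override. A minimal sketch of the signature change, using build_inp_out_ids from the diff below; the forward declarations are stand-ins so the snippet compiles on its own, not the real headers:

#include <cstdint>

// stand-in forward declarations, only so this sketch is self-contained
struct ggml_context;
struct ggml_tensor;

// before this commit: every builder call had to thread the flag through
struct llama_graph_i_before {
    virtual ggml_tensor * build_inp_out_ids(
            ggml_context * ctx0,
            int32_t        n_tokens,
            bool           worst_case) = 0;
    virtual ~llama_graph_i_before() = default;
};

// after this commit: the flag (and the now-redundant n_tokens) is gone;
// worst-case sizing is decided when the context reserves its graphs
struct llama_graph_i_after {
    virtual ggml_tensor * build_inp_out_ids(
            ggml_context * ctx0) = 0;
    virtual ~llama_graph_i_after() = default;
};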

File diff suppressed because it is too large.


@@ -22,16 +22,25 @@ using llama_loras = std::unordered_map<struct llama_adapter_lora *, float>;
// basic transformer without KV cache
struct llama_context : public llama_graph_i {
public:
llama_context(
const llama_model & model,
const llama_context_params & params);
virtual ~llama_context();
// init scheduler and compute buffers
// init scheduler and compute buffers, reserve worst-case graphs
// call once after the context is constructed
virtual void init();
virtual void synchronize();
protected:
// called by init() to reserve the worst-case graphs
// override in child classes
virtual void reserve();
public:
const llama_model & get_model() const;
const llama_cparams & get_cparams() const;
@@ -93,33 +102,6 @@ struct llama_context : public llama_graph_i {
int32_t il_start,
int32_t il_end);
////
virtual void synchronize();
// zero-out inputs and create ggml_context
virtual ggml_cgraph * graph_init();
// TODO: add encode/decode graphs
virtual llama_graph_result graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
bool worst_case);
// returns the result of ggml_backend_sched_graph_compute_async execution
virtual enum ggml_status graph_compute(
ggml_cgraph * gf,
bool batched);
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
virtual int32_t output_reserve(int32_t n_outputs);
// make the outputs have the same order they had in the user-provided batch
// TODO: maybe remove this
virtual void output_reorder();
// encode a batch of tokens by evaluating the encoder part of the transformer
//
// - lctx: llama context
@@ -145,6 +127,60 @@ struct llama_context : public llama_graph_i {
//
virtual int decode(llama_batch & inp_batch);
protected:
//
// input
//
// when the compute graph is built, it creates the input tensors that it needs
// the contents of the input tensors are set by the input_set() function
virtual void input_set(const llama_ubatch & ubatch);
// base input tensors
ggml_tensor * inp_tokens; // I32 [n_batch]
ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
ggml_tensor * inp_pos; // I32 [n_batch]
ggml_tensor * inp_out_ids; // I32 [n_outputs]
ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
ggml_tensor * inp_cls; // I32 [n_batch]
// KQ mask input tensors
ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch]
ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch]
//
// output
//
// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
virtual int32_t output_reserve(int32_t n_outputs);
// make the outputs have the same order they had in the user-provided batch
// TODO: maybe remove this
virtual void output_reorder();
//
// graph
//
// zero-out inputs and create the ctx_compute for the compute graph
virtual ggml_cgraph * graph_init();
// TODO: add encode/decode graphs
virtual llama_graph_result graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch);
// returns the result of ggml_backend_sched_graph_compute_async execution
virtual enum ggml_status graph_compute(
ggml_cgraph * gf,
bool batched);
ggml_context_ptr ctx_compute;
//
// graph build API (generic)
//
@@ -193,9 +229,7 @@ struct llama_context : public llama_graph_i {
int32_t n_tokens);
virtual ggml_tensor * build_inp_out_ids(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case);
ggml_context * ctx0);
virtual ggml_tensor * build_inp_mean(
ggml_context * ctx0,
@@ -209,8 +243,7 @@ struct llama_context : public llama_graph_i {
ggml_context * ctx0,
int32_t n_tokens,
bool causal,
bool swa,
bool worst_case);
bool swa);
virtual ggml_tensor * build_attn(
ggml_context * ctx0,
@@ -222,15 +255,32 @@ struct llama_context : public llama_graph_i {
ggml_tensor * v_cur,
int32_t n_tokens,
float kq_scale,
int il,
bool worst_case);
int il);
public:
//
// perf
//
virtual llama_perf_context_data perf_get_data() const;
virtual void perf_reset();
protected:
mutable int64_t t_start_us = 0;
mutable int64_t t_load_us = 0;
mutable int64_t t_p_eval_us = 0;
mutable int64_t t_eval_us = 0;
mutable int64_t t_compute_start_us = 0;
mutable int64_t n_queued_tokens = 0;
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
mutable int32_t n_eval = 0; // number of eval calls
public:
//
// state save/load
//
virtual size_t state_get_size();
virtual size_t state_get_data( uint8_t * dst, size_t size);
@@ -265,31 +315,15 @@ struct llama_context : public llama_graph_i {
size_t n_token_count);
protected:
// state save/load
virtual size_t state_get_data(llama_io_write_i & io);
virtual size_t state_set_data(llama_io_read_i & io);
virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id);
virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id);
// input
virtual void input_set(const llama_ubatch & ubatch);
// base input tensors
ggml_tensor * inp_tokens; // I32 [n_batch]
ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
ggml_tensor * inp_pos; // I32 [n_batch]
ggml_tensor * inp_out_ids; // I32 [n_outputs]
ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
ggml_tensor * inp_cls; // I32 [n_batch]
// KQ mask input tensors
ggml_tensor * inp_kq_mask; // F32 [n_tokens, n_batch]
ggml_tensor * inp_kq_mask_cnv; // [n_tokens, n_batch]
//
// members
//
const llama_model & model;
@@ -311,7 +345,9 @@ protected:
ggml_backend_sched_ptr sched;
ggml_context_ptr ctx_compute;
// buffer types used for the compute buffer of each backend
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
@@ -340,19 +376,7 @@ protected:
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
bool need_reserve = false;
bool has_evaluated_once = false;
mutable int64_t t_start_us = 0;
mutable int64_t t_load_us = 0;
mutable int64_t t_p_eval_us = 0;
mutable int64_t t_eval_us = 0;
mutable int64_t t_compute_start_us = 0;
mutable int64_t n_queued_tokens = 0;
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
mutable int32_t n_eval = 0; // number of eval calls
};
// transformer with a self-attention KV cache
@@ -364,18 +388,40 @@ public:
virtual ~llama_context_kv_self();
protected:
virtual void reserve() override;
public:
virtual llama_kv_cache * get_kv_self() override;
virtual const llama_kv_cache * get_kv_self() const override;
virtual void kv_self_update() override;
virtual ggml_cgraph * graph_init() override;
virtual int encode(llama_batch & inp_batch) override;
virtual int decode(llama_batch & inp_batch) override;
// certain implementations could require a padding for the context size
uint32_t get_ctx_padding(const llama_cparams & cparams) const;
protected:
//
// input
//
virtual void input_set(const llama_ubatch & ubatch) override;
ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch]
ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch]
ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch]
ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch]
ggml_tensor * inp_self_k_shift; // I32 [kv_size]
//
// graph
//
virtual ggml_cgraph * graph_init() override;
//
// graph build
//
virtual ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override;
@@ -383,8 +429,7 @@ public:
ggml_context * ctx0,
int32_t n_tokens,
bool causal,
bool swa,
bool worst_case) override;
bool swa) override;
virtual ggml_tensor * build_attn(
ggml_context * ctx0,
@@ -396,8 +441,7 @@ public:
ggml_tensor * v_cur,
int32_t n_tokens,
float kq_scale,
int il,
bool worst_case) override;
int il) override;
virtual void build_kv_self_shift(
ggml_context * ctx0,
@@ -422,31 +466,27 @@ public:
struct ggml_tensor * inp_kq_mask_cross; // F32 [n_outputs_enc, n_batch]
virtual ggml_tensor * build_inp_embd_enc(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case) override;
ggml_context * ctx0) override;
virtual ggml_tensor * build_inp_kq_mask_cross(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case) override;
int32_t n_tokens) override;
//
// state save/load
//
protected:
virtual size_t state_get_data(llama_io_write_i & io) override;
virtual size_t state_set_data(llama_io_read_i & io) override;
virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override;
virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override;
virtual void input_set(const llama_ubatch & ubatch) override;
//
// members
//
llama_kv_cache kv_self;
ggml_tensor * inp_self_kq_mask; // F32 [kv_size, n_batch]
ggml_tensor * inp_self_kq_mask_cnv; // [kv_size, n_batch]
ggml_tensor * inp_self_kq_mask_swa; // F32 [kv_size, n_batch]
ggml_tensor * inp_self_kq_mask_swa_cnv; // [kv_size, n_batch]
ggml_tensor * inp_self_k_shift; // I32 [kv_size]
};
// a recurrent transformer (i.e. RWKV, Mamba)
@@ -458,23 +498,43 @@ public:
virtual ~llama_context_recurrent();
protected:
virtual void reserve() override;
public:
virtual llama_kv_cache * get_kv_self() override;
virtual const llama_kv_cache * get_kv_self() const override;
virtual void kv_self_update() override;
virtual ggml_cgraph * graph_init() override;
virtual int encode(llama_batch & inp_batch) override;
virtual int decode(llama_batch & inp_batch) override;
protected:
//
// input
//
virtual void input_set(const llama_ubatch & ubatch) override;
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
//
// graph
//
virtual ggml_cgraph * graph_init() override;
//
// graph build
//
virtual ggml_tensor * build_inp_s_copy(
ggml_context * ctx0,
bool worst_case) override;
ggml_context * ctx0) override;
virtual ggml_tensor * build_inp_s_mask(
ggml_context * ctx0,
bool worst_case) override;
ggml_context * ctx0) override;
virtual ggml_tensor * build_copy_mask_state(
ggml_context * ctx0,
@@ -482,10 +542,8 @@ public:
ggml_tensor * s,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
int32_t n_tokens,
int32_t n_state,
int32_t n_seqs,
bool worst_case) override;
int32_t n_seqs) override;
virtual ggml_tensor * build_mamba_layer(
ggml_context * ctx0,
@@ -494,8 +552,7 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case) override;
int il) override;
virtual ggml_tensor * build_rwkv_token_shift_load(
ggml_context * ctx0,
@@ -503,15 +560,13 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case) override;
int il) override;
virtual ggml_tensor * build_rwkv_token_shift_store(
ggml_context * ctx0,
ggml_tensor * token_shift,
const llama_ubatch & ubatch,
int il,
bool worst_case) override;
int il) override;
virtual ggml_tensor * build_rwkv6_time_mix(
ggml_context * ctx0,
@@ -521,23 +576,24 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case) override;
int il) override;
//
// state save/load
//
protected:
virtual size_t state_get_data(llama_io_write_i & io) override;
virtual size_t state_set_data(llama_io_read_i & io) override;
virtual size_t state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) override;
virtual size_t state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) override;
virtual void input_set(const llama_ubatch & ubatch) override;
//
// members
//
// TODO: change name to something more meaningful -- does "KV cache" make sense for recurrent models?
llama_kv_cache_recurrent kv_self;
struct ggml_tensor * inp_s_copy; // I32 [kv_size]
struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
};
// For internal test use
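To summarize the header change above: init() now both sets up the scheduler/compute buffers and reserves the worst-case graphs, delegating the reservation to a protected reserve() that llama_context_kv_self and llama_context_recurrent override. A rough, self-contained sketch of that flow, with printf placeholders standing in for the real scheduler and graph reservation work:

#include <cstdio>

struct llama_context_sketch {
    virtual ~llama_context_sketch() = default;

    // call once after the context is constructed:
    // set up the scheduler and compute buffers, then reserve worst-case graphs
    virtual void init() {
        // ... scheduler / compute buffer setup would go here ...
        reserve();
    }

protected:
    // called by init(); child classes override to account for their extra state
    virtual void reserve() {
        std::printf("reserve: base worst-case graph\n");
    }
};

struct llama_context_kv_self_sketch : llama_context_sketch {
protected:
    void reserve() override {
        // a KV-cache context would size its reservation for a full cache here
        std::printf("reserve: worst-case graph including the KV cache\n");
    }
};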


@@ -12,8 +12,7 @@ ggml_tensor * llama_graph_i::build_attn(
ggml_tensor * v_cur,
int32_t n_tokens,
float kq_scale,
int il,
bool worst_case) {
int il) {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(wo);
@@ -24,7 +23,6 @@ ggml_tensor * llama_graph_i::build_attn(
GGML_UNUSED(n_tokens);
GGML_UNUSED(kq_scale);
GGML_UNUSED(il);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
return nullptr;
@@ -57,12 +55,8 @@ ggml_tensor * llama_graph_i::build_inp_self_k_shift(
}
ggml_tensor * llama_graph_i::build_inp_embd_enc(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case) {
ggml_context * ctx0) {
GGML_UNUSED(ctx0);
GGML_UNUSED(n_tokens);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
return nullptr;
@@ -70,21 +64,17 @@ ggml_tensor * llama_graph_i::build_inp_embd_enc(
ggml_tensor * llama_graph_i::build_inp_kq_mask_cross(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case) {
int32_t n_tokens) {
GGML_UNUSED(ctx0);
GGML_UNUSED(n_tokens);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
return nullptr;
}
ggml_tensor * llama_graph_i::build_inp_s_copy (
ggml_context * ctx0,
bool worst_case) {
ggml_context * ctx0) {
GGML_UNUSED(ctx0);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -92,10 +82,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
}
ggml_tensor * llama_graph_i::build_inp_s_mask(
ggml_context * ctx0,
bool worst_case) {
ggml_context * ctx0) {
GGML_UNUSED(ctx0);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -108,19 +96,15 @@ ggml_tensor * llama_graph_i::build_copy_mask_state(
ggml_tensor * s,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
int32_t n_tokens,
int32_t n_state,
int32_t n_seqs,
bool worst_case) {
int32_t n_seqs) {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(s);
GGML_UNUSED(state_copy);
GGML_UNUSED(state_mask);
GGML_UNUSED(n_tokens);
GGML_UNUSED(n_state);
GGML_UNUSED(n_seqs);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -134,8 +118,7 @@ ggml_tensor * llama_graph_i::build_mamba_layer(
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case) {
int il) {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(cur);
@@ -143,7 +126,6 @@ ggml_tensor * llama_graph_i::build_mamba_layer(
GGML_UNUSED(state_mask);
GGML_UNUSED(ubatch);
GGML_UNUSED(il);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -156,15 +138,13 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load(
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case) {
int il) {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(state_copy);
GGML_UNUSED(state_mask);
GGML_UNUSED(ubatch);
GGML_UNUSED(il);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -175,13 +155,11 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store(
ggml_context * ctx0,
ggml_tensor * token_shift,
const llama_ubatch & ubatch,
int il,
bool worst_case) {
int il) {
GGML_UNUSED(ctx0);
GGML_UNUSED(token_shift);
GGML_UNUSED(ubatch);
GGML_UNUSED(il);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -196,8 +174,7 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix(
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case) {
int il) {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(cur);
@@ -206,7 +183,6 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix(
GGML_UNUSED(state_mask);
GGML_UNUSED(ubatch);
GGML_UNUSED(il);
GGML_UNUSED(worst_case);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
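The base-class stubs above all keep the same shape after the parameter removal: mark the remaining arguments as unused, log an error, and return nullptr. A self-contained sketch of that pattern; GGML_UNUSED and LLAMA_LOG_ERROR are the real macro names, but they are redefined here only so the snippet stands alone:

#include <cstdio>

#define GGML_UNUSED(x) (void)(x)
#define LLAMA_LOG_ERROR(...) std::fprintf(stderr, __VA_ARGS__)

struct ggml_context;
struct ggml_tensor;

// fallback for contexts that do not implement recurrent-state inputs:
// report "not implemented" and return nullptr
static ggml_tensor * build_inp_s_copy_stub(ggml_context * ctx0) {
    GGML_UNUSED(ctx0);

    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
    return nullptr;
}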


@@ -69,9 +69,7 @@ public:
int32_t n_tokens) = 0;
virtual ggml_tensor * build_inp_out_ids(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case) = 0;
ggml_context * ctx0) = 0;
virtual ggml_tensor * build_inp_mean(
ggml_context * ctx0,
@@ -85,8 +83,7 @@ public:
ggml_context * ctx0,
int32_t n_tokens,
bool causal,
bool swa,
bool worst_case) = 0;
bool swa) = 0;
virtual ggml_tensor * build_attn(
ggml_context * ctx0,
@@ -98,8 +95,7 @@ public:
ggml_tensor * v_cur,
int32_t n_tokens,
float kq_scale,
int il,
bool worst_case);
int il);
virtual void build_kv_self_shift(
ggml_context * ctx0,
@@ -114,22 +110,17 @@ public:
ggml_context * ctx0);
virtual ggml_tensor * build_inp_embd_enc(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case);
ggml_context * ctx0);
virtual ggml_tensor * build_inp_kq_mask_cross(
ggml_context * ctx0,
int32_t n_tokens,
bool worst_case);
int32_t n_tokens);
virtual ggml_tensor * build_inp_s_copy(
ggml_context * ctx0,
bool worst_case);
ggml_context * ctx0);
virtual ggml_tensor * build_inp_s_mask(
ggml_context * ctx0,
bool worst_case);
ggml_context * ctx0);
virtual ggml_tensor * build_copy_mask_state(
ggml_context * ctx0,
@@ -137,10 +128,8 @@ public:
ggml_tensor * s,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
int32_t n_tokens,
int32_t n_state,
int32_t n_seqs,
bool worst_case);
int32_t n_seqs);
virtual ggml_tensor * build_mamba_layer(
ggml_context * ctx0,
@@ -149,8 +138,7 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case);
int il);
virtual ggml_tensor * build_rwkv_token_shift_load(
ggml_context * ctx0,
@@ -158,15 +146,13 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case);
int il);
virtual ggml_tensor * build_rwkv_token_shift_store(
ggml_context * ctx0,
ggml_tensor * token_shift,
const llama_ubatch & ubatch,
int il,
bool worst_case);
int il);
virtual ggml_tensor * build_rwkv6_time_mix(
ggml_context * ctx0,
@@ -176,6 +162,5 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il,
bool worst_case);
int il);
};


@@ -610,6 +610,7 @@ struct llama_kv_cache_slot_info llama_kv_cache::find_slot(
// sanity check
return llama_kv_cache_slot_info(n >= n_seqs);
}
// otherwise, one cell per token.
if (n_tokens > size) {


@@ -3834,7 +3834,6 @@ struct llm_build_context {
const int32_t n_tokens;
const int32_t n_ctx_orig;
const bool worst_case;
const bool flash_attn;
const enum llama_pooling_type pooling_type;
@@ -3851,8 +3850,7 @@ struct llm_build_context {
llama_graph_i * lgf,
const llama_model & model,
const llama_cparams & cparams,
const llama_ubatch & ubatch,
bool worst_case) :
const llama_ubatch & ubatch) :
model (model),
hparams (model.hparams),
cparams (cparams),
@@ -3879,7 +3877,6 @@ struct llm_build_context {
norm_rms_eps (hparams.f_norm_rms_eps),
n_tokens (ubatch.n_tokens),
n_ctx_orig (cparams.n_ctx_orig_yarn),
worst_case (worst_case),
flash_attn (cparams.flash_attn),
pooling_type (cparams.pooling_type),
rope_type (hparams.rope_type),
@@ -3910,7 +3907,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_out_ids() {
ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case);
ggml_tensor * cur = lgf->build_inp_out_ids(ctx0);
cb(cur, "inp_out_ids", -1);
return cur;
@@ -3949,7 +3946,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_embd_enc() {
ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case);
ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0);
cb(cur, "embd_enc", -1);
return cur;
@@ -3957,7 +3954,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_kq_mask_cross() {
ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens, worst_case);
ggml_tensor * cur = lgf->build_inp_kq_mask_cross(ctx0, n_tokens);
cb(cur, "KQ_mask_cross", -1);
return cur;
@@ -4258,7 +4255,7 @@ struct llm_build_context {
ggml_build_forward_expand(gf, k_cur);
ggml_build_forward_expand(gf, v_cur);
ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il, worst_case);
ggml_tensor * cur = lgf->build_attn(ctx0, gf, wo, wo_b, q_cur, k_cur, v_cur, n_tokens, kq_scale, il);
cb(cur, "kqv_out", il);
return cur;
@@ -4405,7 +4402,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
for (int il = 0; il < n_layer; ++il) {
@@ -4566,7 +4563,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
for (int il = 0; il < n_layer; ++il) {
@@ -4722,7 +4719,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -4838,7 +4835,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -4943,7 +4940,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * attn_norm;
@@ -5066,7 +5063,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -5218,7 +5215,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -5340,7 +5337,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1);
@@ -5441,7 +5438,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -5555,7 +5552,7 @@ struct llm_build_context {
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
lgf->build_attn_inp(ctx0, n_tokens, false, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, false, false);
// iterate layers
for (int il = 0; il < n_layer; ++il) {
@@ -5700,7 +5697,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
inpL = build_norm(inpL,
model.tok_norm,
@@ -5803,7 +5800,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
if (model.pos_embd) {
// inp_pos - contains the positions
@@ -5945,7 +5942,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
@@ -6096,7 +6093,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -6210,7 +6207,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -6323,7 +6320,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -6441,7 +6438,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -6588,7 +6585,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
attn_norm_output = build_norm(inpL,
@@ -6711,7 +6708,7 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, true);
for (int il = 0; il < n_layer; ++il) {
auto * residual = inpL;
@@ -6855,7 +6852,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
@@ -6961,7 +6958,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1);
@@ -7067,7 +7064,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
@@ -7178,7 +7175,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -7297,7 +7294,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -7425,7 +7422,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -7626,7 +7623,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
// norm
@@ -7734,7 +7731,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, true);
for (int il = 0; il < n_layer; ++il) {
// norm
@@ -7864,7 +7861,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -7977,8 +7974,8 @@ struct llm_build_context {
// {n_embd, n_tokens}
inpL = build_inp_embd(model.tok_embd);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
for (int il = 0; il < n_layer; ++il) {
// norm
@@ -7988,7 +7985,7 @@ struct llm_build_context {
cb(cur, "attn_norm", il);
//cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case);
cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il);
if (il == n_layer - 1) {
// skip computing output for unused tokens
@@ -8039,7 +8036,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
@@ -8187,7 +8184,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, true);
// sliding window switch pattern
const int32_t sliding_window_pattern = 4;
@@ -8322,7 +8319,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -8442,7 +8439,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -8566,7 +8563,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -8687,7 +8684,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
const int64_t n_head = hparams.n_head(il);
@@ -8815,7 +8812,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
@@ -8959,7 +8956,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -9089,7 +9086,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -9252,7 +9249,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -9470,7 +9467,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -9951,7 +9948,7 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
cur = build_norm(inpL,
@@ -10045,7 +10042,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -10175,7 +10172,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -10296,7 +10293,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -10414,8 +10411,8 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10425,7 +10422,7 @@ struct llm_build_context {
const llama_layer * layer = &model.layers[il];
struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load(
ctx0, gf, state_copy, state_mask, ubatch, il, worst_case
ctx0, gf, state_copy, state_mask, ubatch, il
);
struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
@@ -10441,7 +10438,7 @@ struct llm_build_context {
1
);
cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case);
cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
@@ -10464,7 +10461,7 @@ struct llm_build_context {
ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
1
);
ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case));
ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il));
if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
cur = ggml_scale(ctx0, cur, 0.5F);
@@ -10506,8 +10503,8 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10519,7 +10516,7 @@ struct llm_build_context {
const llama_layer * layer = &model.layers[il];
struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load(
ctx0, gf, state_copy, state_mask, ubatch, il, worst_case
ctx0, gf, state_copy, state_mask, ubatch, il
);
struct ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
@@ -10532,10 +10529,10 @@ struct llm_build_context {
1
);
cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case);
cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il);
token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case));
ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il));
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
cb(ffn_inp, "ffn_inp", il);
@@ -10601,7 +10598,7 @@ struct llm_build_context {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);
lgf->build_attn_inp(ctx0, n_tokens, true, false);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@@ -10912,9 +10909,8 @@ llama_graph_result llama_model::build_graph(
ggml_cgraph * gf,
llama_graph_i * lgf,
const llama_cparams & cparams,
const llama_ubatch & ubatch,
bool worst_case) const {
struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case);
const llama_ubatch & ubatch) const {
struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch);
switch (arch) {
case LLM_ARCH_LLAMA:
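At the model level, llm_build_context and llama_model::build_graph lose the flag as well, so the per-architecture builders no longer carry or pass worst_case. A minimal sketch of the constructor change, with placeholder types and only the fields needed here; the real struct is initialized from many more hyperparameters:

#include <cstdint>

// placeholder declarations, only what the sketch needs
struct ggml_context;
struct llama_graph_i;
struct llama_model;
struct llama_cparams;
struct llama_ubatch { int32_t n_tokens; };

struct llm_build_context_sketch {
    const int32_t n_tokens;
    // const bool worst_case;   // removed by this commit

    llm_build_context_sketch(
            ggml_context        * /*ctx*/,
            llama_graph_i       * /*lgf*/,
            const llama_model   & /*model*/,
            const llama_cparams & /*cparams*/,
            const llama_ubatch  & ubatch) :
        n_tokens(ubatch.n_tokens) {}
};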


@@ -374,8 +374,7 @@ struct llama_model {
ggml_cgraph * gf,
llama_graph_i * lgf,
const llama_cparams & cparams,
const llama_ubatch & ubatch,
bool worst_case) const;
const llama_ubatch & ubatch) const;
private:
struct impl;