mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	llama : reuse compute graphs (#14482)
* llama : reuse compute graphs ggml-ci
* llama-bench : add graph reuse parameter ggml-ci
* cont : remove the parameter and the sched resets ggml-ci
* graph : rename update() to can_reuse() ggml-ci
* params : remove is_same() ggml-ci
* graph : set res->params in llm_graph_context constructor ggml-ci
* graph : avoid set_max_nodes in llm_graph_result ggml-ci
* kv-cache : reuse llama_context's graph result instance ggml-ci
* context : reset the previous graph result upon memory updates ggml-ci
* batch : llama_ubatch now carries its data instead of pointing to balloc ggml-ci
* merge : fix build ggml-ci
* graph : fix can_reuse() checks when flash-attention is disabled
* graph : move llm_graph_result impl in source file + debug env ggml-ci
This commit is contained in:
		@@ -8,12 +8,17 @@
 | 
			
		||||
#include <vector>
 | 
			
		||||
#include <set>
 | 
			
		||||
#include <bitset>
 | 
			
		||||
#include <memory>
 | 
			
		||||
#include <unordered_map>
 | 
			
		||||
 | 
			
		||||
// keep this struct lightweight
 | 
			
		||||
// it points to data in `llama_batch_allocr`
 | 
			
		||||
struct llama_ubatch {
 | 
			
		||||
    bool equal_seqs;
 | 
			
		||||
    bool equal_seqs() const {
 | 
			
		||||
        return b_equal_seqs != 0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    uint32_t b_equal_seqs; // note: this is a boolean, but we use a uint32_t for alignment
 | 
			
		||||
                           //       otherwise address sanitizer complains
 | 
			
		||||
    // TODO: whole_seqs for embeddings?
 | 
			
		||||
 | 
			
		||||
    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
 | 
			
		||||
@@ -34,6 +39,20 @@ struct llama_ubatch {
 | 
			
		||||
    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
 | 
			
		||||
    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
 | 
			
		||||
    int8_t       *  output;     // [n_tokens]         | i   | -
 | 
			
		||||
 | 
			
		||||
    struct data_t {
 | 
			
		||||
        std::vector<llama_token>    token;
 | 
			
		||||
        std::vector<float>          embd;
 | 
			
		||||
        std::vector<llama_pos>      pos;
 | 
			
		||||
        std::vector<int32_t>        n_seq_id;
 | 
			
		||||
        std::vector<llama_seq_id *> seq_id;
 | 
			
		||||
        std::vector<llama_seq_id>   seq_id_unq;
 | 
			
		||||
        std::vector<int32_t>        seq_idx;
 | 
			
		||||
        std::vector<int8_t>         output;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    // the llama_ubatch pointers above point to this data if set; otherwise they point to non-owning data
 | 
			
		||||
    std::shared_ptr<data_t> data;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
// a helper for sanitizing, fulfilling and splitting a batch
 | 
			
		||||
@@ -137,20 +156,5 @@ private:
 | 
			
		||||
    // used[i] indicates if token i has already been used in a previous ubatch
 | 
			
		||||
    std::vector<bool> used;
 | 
			
		||||
 | 
			
		||||
    // llama_ubatch points to this data:
 | 
			
		||||
    struct ubatch {
 | 
			
		||||
        std::vector<llama_token>    token;
 | 
			
		||||
        std::vector<float>          embd;
 | 
			
		||||
        std::vector<llama_pos>      pos;
 | 
			
		||||
        std::vector<int32_t>        n_seq_id;
 | 
			
		||||
        std::vector<llama_seq_id *> seq_id;
 | 
			
		||||
        std::vector<llama_seq_id>   seq_id_unq;
 | 
			
		||||
        std::vector<int32_t>        seq_idx;
 | 
			
		||||
        std::vector<int8_t>         output;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    // current splitting state:
 | 
			
		||||
    std::vector<ubatch> ubatches;
 | 
			
		||||
 | 
			
		||||
    int debug;
 | 
			
		||||
};
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user