mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00)
	llama : add high-throughput mode (#14363)
* kv-cache : prepare K/V buffers for separation ggml-ci
* batched-bench : fix oob write ggml-ci
* llama : add "virtual sequences" ggml-ci
* llama : use "stream" vs "virtual sequence" ggml-ci
* graph : fix stream splitting when KV cache is not used ggml-ci
* kv-cache : add multi-stream save/load support ggml-ci
* llama : add "--attn-streams" flag ggml-ci
* kv-cache : fix handling when find_slot fails ggml-ci
* kv-cache : restore find_slot impl ggml-ci
* kv-cache : add comments
* kv-cache : add bounds checks for sequence id ggml-ci
* cont : add n_seq_max to batch allocr ggml-ci
* kv-cache : perform stream copies lazily after llama_synchronize ggml-ci
* kv-cache : avoid throwing exceptions across the C boundary ggml-ci
* CUDA: 4D FlashAttention support (#14628)
* CUDA: fix WMMA FA kernel
* llama : rename attn_streams -> kv_unified ggml-ci
* common : rename kv_split -> kv_unified ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
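The changelog above introduces per-sequence KV "streams" and the `kv_unified` switch for the new high-throughput mode. A minimal sketch of how a client might opt into it through the C API follows; treating `kv_unified` and `n_seq_max` as `llama_context_params` fields is an assumption drawn from the commit message, not a verbatim excerpt from the PR.

    // Hedged sketch: requesting the multi-stream (non-unified) KV-cache path.
    // kv_unified/n_seq_max as context parameters are assumptions based on the
    // commit message above, not a documented interface.
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == nullptr) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.n_seq_max  = 8;     // serve up to 8 sequences in parallel
        cparams.kv_unified = false; // one KV stream per sequence (high-throughput mode)

        llama_context * ctx = llama_init_from_model(model, cparams);
        // ... build and decode batches that span multiple sequence ids ...

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }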
@@ -48,6 +48,7 @@ public:
             const llama_vocab & vocab,
             const llama_memory_i * memory,
             uint32_t n_embd,
+            uint32_t n_seq_max,
             bool output_all);
 
     const llama_batch & get_batch() const;
@@ -100,6 +101,7 @@ private:
     const uint32_t n_pos_per_embd;
 
     uint32_t n_embd;
+    uint32_t n_seq_max;
     uint32_t n_outputs;
 
     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
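The two hunks above thread `n_seq_max` into the batch allocator, which pairs with the "kv-cache : add bounds checks for sequence id" step in the changelog. A self-contained illustration of that kind of check follows; `validate_seq_ids` and its signature are hypothetical, with only `llama_seq_id` being an `int32_t` mirroring llama.h.

    // Hedged sketch of a sequence-id bounds check against n_seq_max.
    // validate_seq_ids is illustrative and not copied from the PR.
    #include <cstdint>
    #include <cstdio>

    using llama_seq_id = int32_t; // matches the typedef in llama.h

    // Returns false when any sequence id falls outside [0, n_seq_max).
    static bool validate_seq_ids(const llama_seq_id * seq_ids, int n_tokens, uint32_t n_seq_max) {
        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id sid = seq_ids[i];
            if (sid < 0 || (uint32_t) sid >= n_seq_max) {
                std::fprintf(stderr, "invalid seq_id %d at token %d (n_seq_max = %u)\n", sid, i, n_seq_max);
                return false; // reject the batch instead of indexing out of range
            }
        }
        return true;
    }

    int main() {
        const llama_seq_id ids[] = { 0, 1, 7, 8 };
        // With n_seq_max = 8, the last id (8) is out of range, so the batch is rejected.
        std::printf("batch %s\n", validate_seq_ids(ids, 4, 8) ? "accepted" : "rejected");
        return 0;
    }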