#pragma once

#include "llama.h"
#include "llama-arch.h"
#include "llama-graph.h"
#include "llama-hparams.h"
#include "llama-memory.h"
#include "llama-vocab.h"

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct llama_cparams;
struct llama_ubatch;
struct llama_model_loader;

// available models
enum llm_type {
    LLM_TYPE_UNKNOWN,
    LLM_TYPE_14M,
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_160M,
    LLM_TYPE_190M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_0_5B,
    LLM_TYPE_0_6B,
    LLM_TYPE_1B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
    LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_8B,
    LLM_TYPE_2_9B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
    LLM_TYPE_6_9B,
    LLM_TYPE_7B,
    LLM_TYPE_8B,
    LLM_TYPE_9B,
    LLM_TYPE_11B,
    LLM_TYPE_12B,
    LLM_TYPE_13B,
    LLM_TYPE_14B,
    LLM_TYPE_15B,
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_27B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
    LLM_TYPE_35B,
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
    LLM_TYPE_405B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
    LLM_TYPE_LARGE,
    LLM_TYPE_XL,
    LLM_TYPE_A1_7B,
    LLM_TYPE_A2_7B,
    LLM_TYPE_8x7B,
    LLM_TYPE_8x22B,
    LLM_TYPE_16x12B,
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_17B_16E, // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
    LLM_TYPE_30B_A3B,
    LLM_TYPE_235B_A22B,
};

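// human-readable name for a llama_rope_scaling_type value (defined in llama.h)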
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);

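// PosNet layer tensors (residual conv + attention blocks; used e.g. by audio codec models such as WavTokenizer)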
struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1   = nullptr;
    struct ggml_tensor * norm1_b = nullptr;

    struct ggml_tensor * conv1   = nullptr;
    struct ggml_tensor * conv1_b = nullptr;

    struct ggml_tensor * norm2   = nullptr;
    struct ggml_tensor * norm2_b = nullptr;

    struct ggml_tensor * conv2   = nullptr;
    struct ggml_tensor * conv2_b = nullptr;

    // attention
    struct ggml_tensor * attn_norm   = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;

    struct ggml_tensor * attn_q   = nullptr;
    struct ggml_tensor * attn_q_b = nullptr;

    struct ggml_tensor * attn_k   = nullptr;
    struct ggml_tensor * attn_k_b = nullptr;

    struct ggml_tensor * attn_v   = nullptr;
    struct ggml_tensor * attn_v_b = nullptr;

    struct ggml_tensor * attn_o   = nullptr;
    struct ggml_tensor * attn_o_b = nullptr;

    // normalize
    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;
};

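// ConvNeXt block tensors (depthwise conv, norm, pointwise projections and layer scale)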
struct llama_layer_convnext {
    struct ggml_tensor * dw   = nullptr;
    struct ggml_tensor * dw_b = nullptr;

    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;

    struct ggml_tensor * pw1   = nullptr;
    struct ggml_tensor * pw1_b = nullptr;

    struct ggml_tensor * pw2   = nullptr;
    struct ggml_tensor * pw2_b = nullptr;

    struct ggml_tensor * gamma = nullptr;
};

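// per-layer weights; tensors that a given architecture does not use remain nullptr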
struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm       = nullptr;
    struct ggml_tensor * attn_norm_b     = nullptr;
    struct ggml_tensor * attn_norm_2     = nullptr;
    struct ggml_tensor * attn_norm_2_b   = nullptr;
    struct ggml_tensor * attn_q_norm     = nullptr;
    struct ggml_tensor * attn_q_norm_b   = nullptr;
    struct ggml_tensor * attn_k_norm     = nullptr;
    struct ggml_tensor * attn_k_norm_b   = nullptr;
    struct ggml_tensor * attn_out_norm   = nullptr;
    struct ggml_tensor * attn_out_norm_b = nullptr;
    struct ggml_tensor * attn_q_a_norm   = nullptr;
    struct ggml_tensor * attn_kv_a_norm  = nullptr;
    struct ggml_tensor * attn_sub_norm   = nullptr;
    struct ggml_tensor * attn_post_norm  = nullptr;
    struct ggml_tensor * ffn_sub_norm    = nullptr;
    struct ggml_tensor * attn_norm_cross = nullptr;
    struct ggml_tensor * attn_norm_enc   = nullptr;

    // attention
    struct ggml_tensor * wq        = nullptr;
    struct ggml_tensor * wk        = nullptr;
    struct ggml_tensor * wv        = nullptr;
    struct ggml_tensor * wo        = nullptr;
    struct ggml_tensor * wqkv      = nullptr;
    struct ggml_tensor * wq_a      = nullptr;
    struct ggml_tensor * wq_b      = nullptr;
    struct ggml_tensor * wkv_a_mqa = nullptr;
    struct ggml_tensor * wkv_b     = nullptr;
    struct ggml_tensor * wk_b      = nullptr;
    struct ggml_tensor * wv_b      = nullptr;
    struct ggml_tensor * wq_cross  = nullptr;
    struct ggml_tensor * wk_cross  = nullptr;
    struct ggml_tensor * wv_cross  = nullptr;
    struct ggml_tensor * wo_cross  = nullptr;
    struct ggml_tensor * wq_enc    = nullptr;
    struct ggml_tensor * wk_enc    = nullptr;
    struct ggml_tensor * wv_enc    = nullptr;
    struct ggml_tensor * wo_enc    = nullptr;

    // attention bias
    struct ggml_tensor * bq   = nullptr;
    struct ggml_tensor * bk   = nullptr;
    struct ggml_tensor * bv   = nullptr;
    struct ggml_tensor * bo   = nullptr;
    struct ggml_tensor * bqkv = nullptr;

    // relative position bias
    struct ggml_tensor * attn_rel_b       = nullptr;
    struct ggml_tensor * attn_rel_b_enc   = nullptr;
    struct ggml_tensor * attn_rel_b_cross = nullptr;

    // normalization
    struct ggml_tensor * ffn_norm         = nullptr;
    struct ggml_tensor * ffn_norm_b       = nullptr;
    struct ggml_tensor * ffn_post_norm    = nullptr;
    struct ggml_tensor * layer_out_norm   = nullptr;
    struct ggml_tensor * layer_out_norm_b = nullptr;
    struct ggml_tensor * ffn_norm_exps    = nullptr;
    struct ggml_tensor * ffn_norm_enc     = nullptr;

    // ff
    struct ggml_tensor * ffn_gate     = nullptr; // w1
    struct ggml_tensor * ffn_down     = nullptr; // w2
    struct ggml_tensor * ffn_up       = nullptr; // w3
    struct ggml_tensor * ffn_gate_enc = nullptr;
    struct ggml_tensor * ffn_down_enc = nullptr;
    struct ggml_tensor * ffn_up_enc   = nullptr;

    // ff MoE
    struct ggml_tensor * ffn_gate_inp  = nullptr;
    struct ggml_tensor * ffn_gate_exps = nullptr;
    struct ggml_tensor * ffn_down_exps = nullptr;
    struct ggml_tensor * ffn_up_exps   = nullptr;

    // ff shared expert (shexp)
    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
    struct ggml_tensor * ffn_gate_shexp     = nullptr;
    struct ggml_tensor * ffn_down_shexp     = nullptr;
    struct ggml_tensor * ffn_up_shexp       = nullptr;

    // ff bias
    struct ggml_tensor * ffn_gate_b = nullptr;
    struct ggml_tensor * ffn_down_b = nullptr; // b2
    struct ggml_tensor * ffn_up_b   = nullptr; // b3
    struct ggml_tensor * ffn_act    = nullptr;
    struct ggml_tensor * ffn_exp_probs_b = nullptr;

    // mamba proj
    struct ggml_tensor * ssm_in  = nullptr;
    struct ggml_tensor * ssm_x   = nullptr;
    struct ggml_tensor * ssm_dt  = nullptr;
    struct ggml_tensor * ssm_out = nullptr;

    // mamba
    struct ggml_tensor * ssm_conv1d = nullptr;
    struct ggml_tensor * ssm_a      = nullptr;
    struct ggml_tensor * ssm_d      = nullptr;

    // mamba bias
    struct ggml_tensor * ssm_conv1d_b = nullptr;
    struct ggml_tensor * ssm_dt_b     = nullptr;

    // rwkv
    struct ggml_tensor * time_mix_w1         = nullptr;
    struct ggml_tensor * time_mix_w2         = nullptr;
    struct ggml_tensor * time_mix_lerp_x     = nullptr;
    struct ggml_tensor * time_mix_lerp_w     = nullptr;
    struct ggml_tensor * time_mix_lerp_k     = nullptr;
    struct ggml_tensor * time_mix_lerp_v     = nullptr;
    struct ggml_tensor * time_mix_lerp_r     = nullptr;
    struct ggml_tensor * time_mix_lerp_g     = nullptr;
    struct ggml_tensor * time_mix_lerp_fused = nullptr;

    struct ggml_tensor * time_mix_first        = nullptr;
    struct ggml_tensor * time_mix_decay        = nullptr;
    struct ggml_tensor * time_mix_decay_w1     = nullptr;
    struct ggml_tensor * time_mix_decay_w2     = nullptr;
    struct ggml_tensor * time_mix_key          = nullptr;
    struct ggml_tensor * time_mix_key_b        = nullptr;
    struct ggml_tensor * time_mix_value        = nullptr;
    struct ggml_tensor * time_mix_value_b      = nullptr;
    struct ggml_tensor * time_mix_receptance   = nullptr;
    struct ggml_tensor * time_mix_receptance_b = nullptr;
    struct ggml_tensor * time_mix_gate         = nullptr;

    // rwkv7
    struct ggml_tensor * time_mix_w0         = nullptr;
    struct ggml_tensor * time_mix_a0         = nullptr;
    struct ggml_tensor * time_mix_a1         = nullptr;
    struct ggml_tensor * time_mix_a2         = nullptr;
    struct ggml_tensor * time_mix_v0         = nullptr;
    struct ggml_tensor * time_mix_v1         = nullptr;
    struct ggml_tensor * time_mix_v2         = nullptr;
    struct ggml_tensor * time_mix_g1         = nullptr;
    struct ggml_tensor * time_mix_g2         = nullptr;
    struct ggml_tensor * time_mix_k_k        = nullptr;
    struct ggml_tensor * time_mix_k_a        = nullptr;
    struct ggml_tensor * time_mix_r_k        = nullptr;

    struct ggml_tensor * time_mix_ln     = nullptr;
    struct ggml_tensor * time_mix_ln_b   = nullptr;
    struct ggml_tensor * time_mix_output = nullptr;

    struct ggml_tensor * channel_mix_lerp_k = nullptr;
    struct ggml_tensor * channel_mix_lerp_r = nullptr;

    struct ggml_tensor * channel_mix_key        = nullptr;
    struct ggml_tensor * channel_mix_receptance = nullptr;
    struct ggml_tensor * channel_mix_value      = nullptr;

    // long rope factors
    struct ggml_tensor * rope_long  = nullptr;
    struct ggml_tensor * rope_short = nullptr;
    struct ggml_tensor * rope_freqs = nullptr;

    // bitnet scale
    struct ggml_tensor * wq_scale       = nullptr;
    struct ggml_tensor * wk_scale       = nullptr;
    struct ggml_tensor * wv_scale       = nullptr;
    struct ggml_tensor * wo_scale       = nullptr;
    struct ggml_tensor * ffn_gate_scale = nullptr;
    struct ggml_tensor * ffn_up_scale   = nullptr;
    struct ggml_tensor * ffn_down_scale = nullptr;

    struct llama_layer_posnet posnet;

    struct llama_layer_convnext convnext;
};

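// note: rough sketch of how the loading methods below are typically driven;
//       the exact sequence lives in the model-loading code in llama.cpp and may differ:
//
//   llama_model model(params);
//   llama_model_loader ml(/* fname, ... */);
//   model.load_arch   (ml);
//   model.load_hparams(ml);
//   model.load_vocab  (ml);
//   model.load_stats  (ml);
//   if (!model.load_tensors(ml)) {
//       // loading was cancelled by the progress_callback
//   }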
struct llama_model {
    llm_type type = LLM_TYPE_UNKNOWN;
    llm_arch arch = LLM_ARCH_UNKNOWN;

    std::string name = "n/a";

    llama_hparams hparams = {};
    llama_vocab   vocab;

    struct ggml_tensor * tok_embd   = nullptr;
    struct ggml_tensor * type_embd  = nullptr;
    struct ggml_tensor * pos_embd   = nullptr;
    struct ggml_tensor * tok_norm   = nullptr;
    struct ggml_tensor * tok_norm_b = nullptr;

    struct ggml_tensor * output_norm     = nullptr;
    struct ggml_tensor * output_norm_b   = nullptr;
    struct ggml_tensor * output          = nullptr;
    struct ggml_tensor * output_b        = nullptr;
    struct ggml_tensor * output_norm_enc = nullptr;

    // classifier
    struct ggml_tensor * cls       = nullptr;
    struct ggml_tensor * cls_b     = nullptr;
    struct ggml_tensor * cls_out   = nullptr;
    struct ggml_tensor * cls_out_b = nullptr;

    struct ggml_tensor * conv1d   = nullptr;
    struct ggml_tensor * conv1d_b = nullptr;

    std::vector<llama_layer> layers;

    llama_model_params params;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    // list of devices used in this model
    std::vector<ggml_backend_dev_t> devices;

    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

    explicit llama_model(const struct llama_model_params & params);
    ~llama_model();

    void load_stats  (llama_model_loader & ml);
    void load_arch   (llama_model_loader & ml);
    void load_hparams(llama_model_loader & ml);
    void load_vocab  (llama_model_loader & ml);
    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback

    std::string arch_name() const;
    std::string type_name() const;

    std::string desc() const;

    size_t size() const;
    size_t n_tensors() const;
    size_t n_devices() const;

    // total number of parameters in the model
    uint64_t n_elements() const;

    void print_info() const;

    ggml_backend_dev_t dev_layer(int il) const;
    ggml_backend_dev_t dev_output() const;

    ggml_backend_buffer_type_t select_buft(int il) const;

    bool has_tensor_overrides() const;

    const struct ggml_tensor * get_tensor(const char * name) const;

    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;

    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

    // note: can mutate `cparams`
    // TODO: move this to new llm_arch_model_i interface
    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;

    // TODO: move this to new llm_arch_model_i interface
    llm_graph_result_ptr build_graph(
            const llm_graph_params & params,
                       ggml_cgraph * gf,
                    llm_graph_type   type) const;

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

const char * llm_type_name(llm_type type);

// For internal test use
// TODO: remove
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);