This commit adds support for the EmbeddingGemma 300m. This model supports sliding window attention (SWA), and a new swa_type is introduced to support symmetric SWA masking. This commit also extracts the masking logic into the function llama_is_masked_swa in llama-impl.h, so that it can be shared by both llm_graph_input_attn_no_cache::set_input and llama_kv_cache::set_input_kq_mask. With this commit the EmbeddingGemma 300m model can be converted to GGUF and used with llama.cpp. Once the model has been uploaded to HuggingFace it can be used like this:

```console
./build/bin/llama-cli -hf ggml-org/embeddinggemma-300m-GGUF:Q8_0
```
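
For context, standard SWA masks keys that fall too far behind the query, while symmetric SWA also masks keys too far ahead, which suits a bidirectional embedding model. Below is a minimal sketch of such a masking predicate; the type and parameter names and the exact window arithmetic are illustrative assumptions, not the actual llama_is_masked_swa implementation:

```c++
#include <cstdint>
#include <cstdlib>

// Hypothetical mirror of the SWA variants described above.
enum swa_type {
    SWA_TYPE_NONE,
    SWA_TYPE_STANDARD,  // causal: the window only reaches back from the query
    SWA_TYPE_SYMMETRIC, // bidirectional: the window extends to both sides
};

// Returns true when the key at position p0 lies outside the window of the
// query at position p1 and must be masked out of the KQ mask.
static bool is_masked_swa(swa_type type, int32_t n_swa, int32_t p0, int32_t p1) {
    switch (type) {
        case SWA_TYPE_STANDARD:
            return p1 - p0 >= n_swa;            // key trails the query too far
        case SWA_TYPE_SYMMETRIC:
            return std::abs(p1 - p0) >= n_swa;  // too far in either direction
        case SWA_TYPE_NONE:
        default:
            return false;                       // no window restriction
    }
}
```

Factoring the predicate out this way is what allows llm_graph_input_attn_no_cache::set_input and llama_kv_cache::set_input_kq_mask to share one masking rule instead of duplicating it.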
#pragma once

#include "ggml.h" // ggml_op

#include <string>

//
// gguf constants (sync with gguf.py)
//

enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_LLAMA4,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GROK,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_NOMIC_BERT_MOE,
    LLM_ARCH_NEO_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_JINA_BERT_V3,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_QWEN2,
    LLM_ARCH_QWEN2MOE,
    LLM_ARCH_QWEN2VL,
    LLM_ARCH_QWEN3,
    LLM_ARCH_QWEN3MOE,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_PLAMO2,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
    LLM_ARCH_MINICPM,
    LLM_ARCH_MINICPM3,
    LLM_ARCH_GEMMA,
    LLM_ARCH_GEMMA2,
    LLM_ARCH_GEMMA3,
    LLM_ARCH_GEMMA3N,
    LLM_ARCH_GEMMA_EMBEDDING,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_MAMBA2,
    LLM_ARCH_JAMBA,
    LLM_ARCH_FALCON_H1,
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OLMO2,
    LLM_ARCH_OLMOE,
    LLM_ARCH_OPENELM,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
    LLM_ARCH_GLM4,
    LLM_ARCH_GLM4_MOE,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_NEMOTRON_H,
    LLM_ARCH_EXAONE,
    LLM_ARCH_EXAONE4,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_RWKV7,
    LLM_ARCH_ARWKV7,
    LLM_ARCH_GRANITE,
    LLM_ARCH_GRANITE_MOE,
    LLM_ARCH_GRANITE_HYBRID,
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_PLM,
    LLM_ARCH_BAILINGMOE,
    LLM_ARCH_DOTS1,
    LLM_ARCH_ARCEE,
    LLM_ARCH_ERNIE4_5,
    LLM_ARCH_ERNIE4_5_MOE,
    LLM_ARCH_HUNYUAN_MOE,
    LLM_ARCH_HUNYUAN_DENSE,
    LLM_ARCH_SMOLLM3,
    LLM_ARCH_OPENAI_MOE,
    LLM_ARCH_LFM2,
    LLM_ARCH_DREAM,
    LLM_ARCH_SMALLTHINKER,
    LLM_ARCH_LLADA,
    LLM_ARCH_SEED_OSS,
    LLM_ARCH_UNKNOWN,
};

enum llm_kv {
    LLM_KV_GENERAL_TYPE,
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_FILE_TYPE,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_VERSION,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,

    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_FEATURES_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,
    LLM_KV_EXPERT_SHARED_COUNT,
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_EXPERT_WEIGHTS_NORM,
    LLM_KV_EXPERT_GATING_FUNC,
    LLM_KV_MOE_EVERY_N_LAYERS,
    LLM_KV_NEXTN_PREDICT_LAYERS,
    LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
    LLM_KV_DECODER_START_TOKEN_ID,
    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
    LLM_KV_SWIN_NORM,
    LLM_KV_RESCALE_EVERY_N_LAYERS,
    LLM_KV_TIME_MIX_EXTRA_DIM,
    LLM_KV_TIME_DECAY_EXTRA_DIM,
    LLM_KV_RESIDUAL_SCALE,
    LLM_KV_EMBEDDING_SCALE,
    LLM_KV_TOKEN_SHIFT_COUNT,
    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_KEY_LENGTH,
    LLM_KV_ATTENTION_VALUE_LENGTH,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
    LLM_KV_ATTENTION_GROUPNORM_EPS,
    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
    LLM_KV_ATTENTION_CAUSAL,
    LLM_KV_ATTENTION_Q_LORA_RANK,
    LLM_KV_ATTENTION_KV_LORA_RANK,
    LLM_KV_ATTENTION_DECAY_LORA_RANK,
    LLM_KV_ATTENTION_ICLR_LORA_RANK,
    LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
    LLM_KV_ATTENTION_GATE_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,
    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

    LLM_KV_SPLIT_NO,
    LLM_KV_SPLIT_COUNT,
    LLM_KV_SPLIT_TENSORS_COUNT,

    LLM_KV_SSM_INNER_SIZE,
    LLM_KV_SSM_CONV_KERNEL,
    LLM_KV_SSM_STATE_SIZE,
    LLM_KV_SSM_TIME_STEP_RANK,
    LLM_KV_SSM_GROUP_COUNT,
    LLM_KV_SSM_DT_B_C_RMS,

    LLM_KV_WKV_HEAD_SIZE,

    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_PRE,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_EOT_ID,
    LLM_KV_TOKENIZER_EOM_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_CLS_ID,
    LLM_KV_TOKENIZER_MASK_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
    LLM_KV_TOKENIZER_ADD_SEP,
    LLM_KV_TOKENIZER_ADD_PREFIX,
    LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
    LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
    LLM_KV_TOKENIZER_FIM_PRE_ID,
    LLM_KV_TOKENIZER_FIM_SUF_ID,
    LLM_KV_TOKENIZER_FIM_MID_ID,
    LLM_KV_TOKENIZER_FIM_PAD_ID,
    LLM_KV_TOKENIZER_FIM_REP_ID,
    LLM_KV_TOKENIZER_FIM_SEP_ID,

    LLM_KV_ADAPTER_TYPE,
    LLM_KV_ADAPTER_LORA_ALPHA,
    LLM_KV_ADAPTER_LORA_TASK_NAME,
    LLM_KV_ADAPTER_LORA_PROMPT_PREFIX,

    LLM_KV_POSNET_EMBEDDING_LENGTH,
    LLM_KV_POSNET_BLOCK_COUNT,

    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
    LLM_KV_CONVNEXT_BLOCK_COUNT,

    LLM_KV_CLASSIFIER_OUTPUT_LABELS,

    LLM_KV_SHORTCONV_L_CACHE,

    // deprecated:
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
    LLM_KV_TOKENIZER_MIDDLE_ID,
};

enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ROPE_FACTORS_LONG,
    LLM_TENSOR_ROPE_FACTORS_SHORT,
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K,
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_OUT_NORM,
    LLM_TENSOR_ATTN_POST_NORM,
    LLM_TENSOR_ATTN_ROT_EMBD,
    LLM_TENSOR_ATTN_SINKS,
    LLM_TENSOR_FFN_GATE_INP,
    LLM_TENSOR_FFN_GATE_INP_SHEXP,
    LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_FFN_POST_NORM,
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_ACT,
    LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
    LLM_TENSOR_FFN_GATE_EXP,
    LLM_TENSOR_FFN_UP_EXP,
    LLM_TENSOR_FFN_NORM_EXPS,
    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
    LLM_TENSOR_FFN_GATE_EXPS,
    LLM_TENSOR_FFN_UP_EXPS,
    LLM_TENSOR_FFN_DOWN_SHEXP,
    LLM_TENSOR_FFN_GATE_SHEXP,
    LLM_TENSOR_FFN_UP_SHEXP,
    LLM_TENSOR_FFN_EXP_PROBS_B,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
    LLM_TENSOR_LAYER_OUT_NORM,
    LLM_TENSOR_POST_ATTN_NORM,
    LLM_TENSOR_POST_MLP_NORM,
    LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
    LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n
    LLM_TENSOR_PER_LAYER_INP_GATE,   // gemma3n
    LLM_TENSOR_PER_LAYER_PROJ,       // gemma3n
    LLM_TENSOR_PER_LAYER_PROJ_NORM,  // gemma3n
    LLM_TENSOR_PER_LAYER_POST_NORM,  // gemma3n
    LLM_TENSOR_ALTUP_PROJ,           // gemma3n
    LLM_TENSOR_ALTUP_UNEMBD_PROJ,    // gemma3n
    LLM_TENSOR_ALTUP_CORRECT_COEF,   // gemma3n
    LLM_TENSOR_ALTUP_CORRECT_SCALE,  // gemma3n
    LLM_TENSOR_ALTUP_PREDICT_COEF,   // gemma3n
    LLM_TENSOR_ALTUP_ROUTER,         // gemma3n
    LLM_TENSOR_ALTUP_ROUTER_NORM,    // gemma3n
    LLM_TENSOR_LAUREL_L,             // gemma3n
    LLM_TENSOR_LAUREL_R,             // gemma3n
    LLM_TENSOR_LAUREL_POST_NORM,     // gemma3n
    LLM_TENSOR_SSM_IN,
    LLM_TENSOR_SSM_CONV1D,
    LLM_TENSOR_SSM_X,
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_DT_NORM,
    LLM_TENSOR_SSM_A,
    LLM_TENSOR_SSM_B_NORM,
    LLM_TENSOR_SSM_C_NORM,
    LLM_TENSOR_SSM_D,
    LLM_TENSOR_SSM_NORM,
    LLM_TENSOR_SSM_OUT,
    LLM_TENSOR_TIME_MIX_W0,
    LLM_TENSOR_TIME_MIX_W1,
    LLM_TENSOR_TIME_MIX_W2,
    LLM_TENSOR_TIME_MIX_A0,
    LLM_TENSOR_TIME_MIX_A1,
    LLM_TENSOR_TIME_MIX_A2,
    LLM_TENSOR_TIME_MIX_V0,
    LLM_TENSOR_TIME_MIX_V1,
    LLM_TENSOR_TIME_MIX_V2,
    LLM_TENSOR_TIME_MIX_G1,
    LLM_TENSOR_TIME_MIX_G2,
    LLM_TENSOR_TIME_MIX_K_K,
    LLM_TENSOR_TIME_MIX_K_A,
    LLM_TENSOR_TIME_MIX_R_K,
    LLM_TENSOR_TIME_MIX_LERP_X,
    LLM_TENSOR_TIME_MIX_LERP_W,
    LLM_TENSOR_TIME_MIX_LERP_K,
    LLM_TENSOR_TIME_MIX_LERP_V,
    LLM_TENSOR_TIME_MIX_LERP_R,
    LLM_TENSOR_TIME_MIX_LERP_G,
    LLM_TENSOR_TIME_MIX_LERP_FUSED,
    LLM_TENSOR_TIME_MIX_FIRST,
    LLM_TENSOR_TIME_MIX_DECAY,
    LLM_TENSOR_TIME_MIX_DECAY_W1,
    LLM_TENSOR_TIME_MIX_DECAY_W2,
    LLM_TENSOR_TIME_MIX_KEY,
    LLM_TENSOR_TIME_MIX_VALUE,
    LLM_TENSOR_TIME_MIX_RECEPTANCE,
    LLM_TENSOR_TIME_MIX_GATE,
    LLM_TENSOR_TIME_MIX_LN,
    LLM_TENSOR_TIME_MIX_OUTPUT,
    LLM_TENSOR_CHANNEL_MIX_LERP_K,
    LLM_TENSOR_CHANNEL_MIX_LERP_R,
    LLM_TENSOR_CHANNEL_MIX_KEY,
    LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
    LLM_TENSOR_CHANNEL_MIX_VALUE,
    LLM_TENSOR_ATTN_Q_A,
    LLM_TENSOR_ATTN_Q_B,
    LLM_TENSOR_ATTN_KV_A_MQA,
    LLM_TENSOR_ATTN_KV_B,
    LLM_TENSOR_ATTN_K_B,
    LLM_TENSOR_ATTN_V_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
    LLM_TENSOR_ATTN_SUB_NORM,
    LLM_TENSOR_FFN_SUB_NORM,
    LLM_TENSOR_DEC_ATTN_NORM,
    LLM_TENSOR_DEC_ATTN_Q,
    LLM_TENSOR_DEC_ATTN_K,
    LLM_TENSOR_DEC_ATTN_V,
    LLM_TENSOR_DEC_ATTN_OUT,
    LLM_TENSOR_DEC_ATTN_REL_B,
    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
    LLM_TENSOR_DEC_CROSS_ATTN_Q,
    LLM_TENSOR_DEC_CROSS_ATTN_K,
    LLM_TENSOR_DEC_CROSS_ATTN_V,
    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
    LLM_TENSOR_DEC_FFN_NORM,
    LLM_TENSOR_DEC_FFN_GATE,
    LLM_TENSOR_DEC_FFN_DOWN,
    LLM_TENSOR_DEC_FFN_UP,
    LLM_TENSOR_DEC_OUTPUT_NORM,
    LLM_TENSOR_ENC_ATTN_NORM,
    LLM_TENSOR_ENC_ATTN_Q,
    LLM_TENSOR_ENC_ATTN_K,
    LLM_TENSOR_ENC_ATTN_V,
    LLM_TENSOR_ENC_ATTN_OUT,
    LLM_TENSOR_ENC_ATTN_REL_B,
    LLM_TENSOR_ENC_FFN_NORM,
    LLM_TENSOR_ENC_FFN_GATE,
    LLM_TENSOR_ENC_FFN_DOWN,
    LLM_TENSOR_ENC_FFN_UP,
    LLM_TENSOR_ENC_OUTPUT_NORM,
    LLM_TENSOR_CLS,
    LLM_TENSOR_CLS_OUT,
    LLM_TENSOR_CONV1D,
    LLM_TENSOR_CONVNEXT_DW,
    LLM_TENSOR_CONVNEXT_NORM,
    LLM_TENSOR_CONVNEXT_PW1,
    LLM_TENSOR_CONVNEXT_PW2,
    LLM_TENSOR_CONVNEXT_GAMMA,
    LLM_TENSOR_POS_NET_CONV1,
    LLM_TENSOR_POS_NET_CONV2,
    LLM_TENSOR_POS_NET_NORM,
    LLM_TENSOR_POS_NET_NORM1,
    LLM_TENSOR_POS_NET_NORM2,
    LLM_TENSOR_POS_NET_ATTN_NORM,
    LLM_TENSOR_POS_NET_ATTN_Q,
    LLM_TENSOR_POS_NET_ATTN_K,
    LLM_TENSOR_POS_NET_ATTN_V,
    LLM_TENSOR_POS_NET_ATTN_OUT,
    LLM_TENSOR_SHORTCONV_CONV,
    LLM_TENSOR_SHORTCONV_INPROJ,
    LLM_TENSOR_SHORTCONV_OUTPROJ,
    LLM_TENSOR_NEXTN_EH_PROJ,
    LLM_TENSOR_NEXTN_EMBED_TOKENS,
    LLM_TENSOR_NEXTN_ENORM,
    LLM_TENSOR_NEXTN_HNORM,
    LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
    LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};

enum llm_tensor_layer {
    LLM_TENSOR_LAYER_INPUT,
    LLM_TENSOR_LAYER_REPEATING,
    LLM_TENSOR_LAYER_OUTPUT,
};

struct LLM_KV {
    LLM_KV(llm_arch arch, const char * suffix = nullptr);

    llm_arch arch;
    const char * suffix;

    std::string operator()(llm_kv kv) const;
};
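
// e.g. LLM_KV(LLM_ARCH_LLAMA)(LLM_KV_CONTEXT_LENGTH) is expected to yield
// the gguf key "llama.context_length" (keys are kept in sync with gguf.py)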

// helper to handle gguf constants
// usage:
//
//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
//
//   std::string name = tn(LLM_TENSOR_OUTPUT);                 -> "output"
//   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");     -> "token_embd.bias"
//   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
//
struct LLM_TN_IMPL {
    const llm_arch arch;
    const llm_tensor tensor;
    const char * const suffix;
    const int bid;
    const int xid;

    std::string str() const;

    operator std::string() const {
        return str();
    }

    friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
        return str == tn.str();
    }

    friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
        return str != tn.str();
    }
};

struct LLM_TN {
    LLM_TN(llm_arch arch) : arch(arch) {}

    llm_arch arch;

    LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
        return { arch, tensor, suffix, bid, xid };
    }

    LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
        return { arch, tensor, nullptr, bid, xid };
    }
};

struct llm_tensor_info {
    llm_tensor_layer layer;
    ggml_op op;
};

const char * llm_arch_name(llm_arch arch);

llm_arch llm_arch_from_string(const std::string & name);
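// e.g. llm_arch_from_string("llama") is expected to return LLM_ARCH_LLAMA,
// and llm_arch_name(LLM_ARCH_LLAMA) to map it back to "llama"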

const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);

bool llm_arch_is_recurrent(const llm_arch & arch);
bool llm_arch_is_hybrid   (const llm_arch & arch);
bool llm_arch_is_diffusion(const llm_arch & arch);