model: add support for EmbeddingGemma SentenceTransformers Dense modules (#16367)

* model: EmbeddingGemma sentence-transformers dense linear projections support

* model: add support for EmbeddingGemma SentenceTransformers dense linear projections

Add support for the Dense modules used in EmbeddingGemma models.
EmbeddingGemma is a SentenceTransformers model with additional modules beyond the base Transformer backbone.

See: https://developers.googleblog.com/en/gemma-explained-embeddinggemma-architecture-and-recipe/
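
For context, a minimal sketch of how these extra modules show up in the SentenceTransformers module stack; the model id and the exact module order are assumptions based on the published EmbeddingGemma recipe, not on this patch:

# Sketch only: list the SentenceTransformers modules of an EmbeddingGemma checkpoint.
# Assumes the sentence-transformers package and access to the (illustrative) model id below.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("google/embeddinggemma-300m")
for idx, module in enumerate(model):
    print(idx, type(module).__name__)
# Roughly expected: Transformer, Pooling, Dense (2_Dense), Dense (3_Dense), Normalize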

* model: add support for EmbeddingGemma SentenceTransformers dense linear projections

- converting the model with the dense layers is optional
- introduced dense config params

* Update convert_hf_to_gguf.py

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* fixed formatting issues

* Update src/llama-graph.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* - removed pooling_type_opt, always allow overriding pooling_type
- added asserts checking dense feature dims

* fix python lint

* fix ubuntu gcc build warning

* - fixed thread-safety test
- moved asserts to load_hparams

* - tidied up code
- simplified the graph context to expect both dense weights

* minor : add TODO

---------

Co-authored-by: Daniel Bevenius <daniel.bevenius@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Author: Saba Fallah
Date: 2025-10-09 08:39:18 +02:00
Committed by: GitHub
Parent: 12bbc3fa50
Commit: e08db42595
12 changed files with 170 additions and 7 deletions

gguf-py/gguf/constants.py

@@ -128,6 +128,8 @@ class Keys:
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
@@ -433,6 +435,8 @@ class MODEL_TENSOR(IntEnum):
TOKEN_TYPES = auto()
POS_EMBD = auto()
OUTPUT = auto()
DENSE_2_OUT = auto() # embeddinggemma 2_Dense
DENSE_3_OUT = auto() # embeddinggemma 3_Dense
OUTPUT_NORM = auto()
ROPE_FREQS = auto()
ROPE_FACTORS_LONG = auto()
@@ -777,6 +781,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 3_Dense
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
@@ -1759,6 +1765,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.GEMMA_EMBEDDING: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.DENSE_2_OUT,
MODEL_TENSOR.DENSE_3_OUT,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
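
The new key names above are only templates; a small sketch of how they would expand at write time (the "gemma-embedding" arch string is an assumption based on MODEL_ARCH.GEMMA_EMBEDDING, not taken from this diff):

# Sketch only: expand the new dense metadata key templates for the EmbeddingGemma arch.
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"

for dense in ("dense_2", "dense_3"):
    print(DENSE_FEAT_IN_SIZE.format(arch="gemma-embedding", dense=dense))
    print(DENSE_FEAT_OUT_SIZE.format(arch="gemma-embedding", dense=dense))
# -> gemma-embedding.dense_2_feat_in, gemma-embedding.dense_2_feat_out,
#    gemma-embedding.dense_3_feat_in, gemma-embedding.dense_3_feat_out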

gguf-py/gguf/gguf_writer.py

@@ -730,6 +730,10 @@ class GGUFWriter:
def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)
def add_dense_features_dims(self, dense: str, in_f: int, out_f: int) -> None:
self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f)
self.add_uint32(Keys.LLM.DENSE_FEAT_OUT_SIZE.format(arch=self.arch, dense=dense), out_f)
def add_logit_scale(self, value: float) -> None:
self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
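
A hedged sketch of how a converter might call the new writer method; the `writer` instance and the 768 -> 3072 -> 768 projection sizes are assumptions taken from the EmbeddingGemma recipe linked above, not from this diff:

# Sketch only: record the 2_Dense / 3_Dense projection shapes in the GGUF metadata.
# `writer` is assumed to be an existing GGUFWriter; the sizes are illustrative.
writer.add_dense_features_dims("dense_2", in_f=768, out_f=3072)   # 2_Dense: model dim -> 3072
writer.add_dense_features_dims("dense_3", in_f=3072, out_f=768)   # 3_Dense: 3072 -> embedding dim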

gguf-py/gguf/tensor_mapping.py

@@ -76,7 +76,12 @@ class TensorNameMap:
"lm_head", # llama4
"model.transformer.ff_out", # llada
),
MODEL_TENSOR.DENSE_2_OUT: (
"dense_2_out", # embeddinggemma
),
MODEL_TENSOR.DENSE_3_OUT: (
"dense_3_out", # embeddinggemma
),
# Output norm
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox