llama : add support for EmbeddingGemma 300m (#15798)
This commit adds support for the EmbeddingGemma 300m model. This model supports sliding window attention (SWA), and a new swa_type is introduced to support symmetric SWA masking.

This commit also extracts the code from the function llama_is_masked_swa in llama-impl.h, so that the logic can be shared by both llm_graph_input_attn_no_cache::set_input and llama_kv_cache::set_input_kq_mask.

With this commit the EmbeddingGemma 300m model can be converted to GGUF and used with llama.cpp. Once the model has been uploaded to HuggingFace, it can be used like this:

```console
./build/bin/llama-cli -hf ggml-org/embeddinggemma-300m-GGUF:Q8_0
```
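The symmetric masking mentioned above lets a token attend both backwards and forwards within the window, which is what a non-causal embedding model needs. As a rough illustration, here is a minimal sketch of such a predicate, following the shape of the switch statement that the diff below removes; the LLAMA_SWA_TYPE_SYMMETRIC name, its enum value, and the half-window arithmetic are assumptions for illustration, not the verbatim upstream code:

```cpp
#include <cassert>
#include <cstdint>

typedef int32_t llama_pos;

// Sketch only: the SYMMETRIC member and its value are assumed for illustration.
enum llama_swa_type {
    LLAMA_SWA_TYPE_NONE      = 0,
    LLAMA_SWA_TYPE_STANDARD  = 1,
    LLAMA_SWA_TYPE_CHUNKED   = 2,
    LLAMA_SWA_TYPE_SYMMETRIC = 3,
};

// Returns true if the key at position p0 is masked for the query at
// position p1 under a symmetric window of total width n_swa.
static bool is_masked_swa_symmetric(uint32_t n_swa, llama_pos p0, llama_pos p1) {
    assert(p0 >= 0 && p1 >= 0);

    // Half of the window extends behind the query and half ahead of it,
    // so positions outside [p1 - half, p1 + half] are masked.
    const int32_t half = (int32_t) n_swa / 2;
    const int32_t diff = p1 - p0;

    return diff < -half || diff > half;
}
```

Compare this with LLAMA_SWA_TYPE_STANDARD in the diff below, which masks only in the backward direction (p1 - p0 >= n_swa) and leaves the forward direction to the causal mask.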
```diff
@@ -27,11 +27,10 @@ llama_kv_cache::llama_kv_cache(
         uint32_t n_seq_max,
         uint32_t n_pad,
         uint32_t n_swa,
-        llama_swa_type swa_type,
         const layer_filter_cb & filter,
         const layer_reuse_cb  & reuse) :
     model(model), hparams(model.hparams), v_trans(v_trans),
-    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
+    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
 
@@ -1393,29 +1392,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
 }
 
 bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
-    assert(p0 >= 0 && p1 >= 0);
-
-    switch (swa_type) {
-        case LLAMA_SWA_TYPE_NONE:
-            {
-            } break;
-        case LLAMA_SWA_TYPE_STANDARD:
-            {
-                if (p1 - p0 >= (int32_t) n_swa) {
-                    return true;
-                }
-            } break;
-        case LLAMA_SWA_TYPE_CHUNKED:
-            {
-                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
-
-                if (p0 < pos_chunk_start) {
-                    return true;
-                }
-            } break;
-    }
-
-    return false;
+    return hparams.is_masked_swa(p0, p1);
 }
 
 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
```
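To see the extracted predicate in action, the following self-contained toy program reuses the masking rules from the removed switch (p0 is the key position, p1 the query position) and prints which keys each query may attend to. The causal inner loop is an illustrative assumption about how the mask builders iterate, not upstream code:

```cpp
#include <cstdint>
#include <cstdio>

typedef int32_t llama_pos;

enum llama_swa_type {
    LLAMA_SWA_TYPE_NONE     = 0,
    LLAMA_SWA_TYPE_STANDARD = 1,
    LLAMA_SWA_TYPE_CHUNKED  = 2,
};

// Same rules as the switch removed above: returns true if the key at
// position p0 is masked for the query at position p1.
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
    switch (swa_type) {
        case LLAMA_SWA_TYPE_NONE:
            break;
        case LLAMA_SWA_TYPE_STANDARD:
            // Mask keys that fall out of the backward window.
            if (p1 - p0 >= (int32_t) n_swa) {
                return true;
            }
            break;
        case LLAMA_SWA_TYPE_CHUNKED:
            // Mask keys that precede the chunk containing the query.
            if (p0 < (p1 / (llama_pos) n_swa) * (llama_pos) n_swa) {
                return true;
            }
            break;
    }
    return false;
}

int main() {
    const uint32_t n_swa = 4;

    // For each query position, list the visible (unmasked) key positions,
    // assuming causal iteration over keys for demonstration purposes.
    for (llama_pos p1 = 0; p1 < 8; ++p1) {
        printf("query %2d attends to keys:", p1);
        for (llama_pos p0 = 0; p0 <= p1; ++p0) {
            if (!is_masked_swa(n_swa, LLAMA_SWA_TYPE_STANDARD, p0, p1)) {
                printf(" %d", p0);
            }
        }
        printf("\n");
    }
    return 0;
}
```

With n_swa = 4, query 6 sees keys 3 through 6: the window slides with the query. After this refactor, both llm_graph_input_attn_no_cache::set_input and llama_kv_cache::set_input_kq_mask obtain this decision from the single hparams helper instead of duplicating the switch.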