llama : Add Gemma 3 support (+ experimental vision capability) (#12343)

* llama : Add Gemma 3 text-only support
* fix python coding style
* fix compile on ubuntu
* python: fix style
* fix ubuntu compile
* fix build on ubuntu (again)
* fix ubuntu build, finally
* clip : Experimental support for Gemma 3 vision (#12344)
* clip : Experimental support for Gemma 3 vision
* fix build
* PRId64
src/llama-model.cpp

@@ -9,6 +9,7 @@
#include <algorithm>
#include <cassert>
#include <cstring>
#include <cmath>
#include <functional>
#include <map>
#include <sstream>
@@ -864,6 +865,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA3:
            {
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_1B; break;
                    case 34: type = LLM_TYPE_4B; break;
                    case 48: type = LLM_TYPE_12B; break;
                    case 62: type = LLM_TYPE_27B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                hparams.f_attention_scale = type == LLM_TYPE_27B
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
            } break;
        case LLM_ARCH_STARCODER2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
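
For the hparams hunk above, the one Gemma 3-specific wrinkle is the attention scale: the 27B variant divides queries by sqrt(n_embd / n_head), while the smaller variants use the usual 1/sqrt(head_dim). A self-contained sketch of that arithmetic follows; the concrete dimensions (n_embd = 5376, n_head = 32, n_embd_head_k = 128) are assumed 27B-like values for illustration, not something read from a GGUF file.

// Standalone sketch of the attention-scale selection in the hunk above.
// The dimensions are illustrative assumptions, not loaded from a model.
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_embd        = 5376; // assumed hidden size (27B-like)
    const uint32_t n_head        = 32;   // assumed attention head count
    const uint32_t n_embd_head_k = 128;  // assumed per-head key dimension
    const bool     is_27b        = true;

    // 27B path: 1/sqrt(n_embd / n_head) = 1/sqrt(168)
    // default path: 1/sqrt(n_embd_head_k) = 1/sqrt(128)
    const float f_attention_scale = is_27b
        ? 1.0f / std::sqrt(float(n_embd / n_head))
        : 1.0f / std::sqrt(float(n_embd_head_k));

    std::printf("f_attention_scale = %.4f\n", f_attention_scale);
    return 0;
}

With these numbers the 27B path yields roughly 0.077, versus roughly 0.088 for the per-head-dim default.
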
@@ -2454,6 +2472,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                    }
                } break;
            case LLM_ARCH_GEMMA3:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);

                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
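
The create_tensor/tn calls above resolve to GGUF tensor names at load time. As a rough illustration of what a Gemma 3 text-model GGUF is expected to contain, the sketch below prints the per-layer names using llama.cpp's usual "blk.<i>.<suffix>.weight" convention; the layer count and the suffix list here are illustrative assumptions rather than something this hunk spells out.

// Illustrative only: prints the GGUF tensor names a Gemma 3 text layer is
// expected to carry, assuming the standard "blk.<i>.<suffix>.weight" naming.
// Shapes are omitted; see the create_tensor calls in the hunk above.
#include <cstdio>

int main() {
    const int n_layer = 26; // assumed: Gemma 3 1B depth (see the hparams hunk)
    const char * suffixes[] = {
        "attn_norm", "attn_q", "attn_k", "attn_v", "attn_output",
        "attn_post_norm", "attn_k_norm", "attn_q_norm",
        "ffn_norm", "ffn_gate", "ffn_up", "ffn_down", "ffn_post_norm",
    };

    std::printf("token_embd.weight\noutput_norm.weight\n");
    for (int i = 0; i < n_layer; ++i) {
        for (const char * s : suffixes) {
            std::printf("blk.%d.%s.weight\n", i, s);
        }
    }
    return 0;
}
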
@@ -3650,6 +3697,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
        LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
        LLAMA_LOG_INFO("%s: f_attn_scale     = %.1e\n",   __func__, hparams.f_attention_scale);
        LLAMA_LOG_INFO("%s: n_ff             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
@@ -3923,6 +3971,7 @@ enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
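
The last hunk adds GEMMA3 to the switch group that, in this function, reports NEOX-style RoPE. A minimal sketch of checking that through the public API follows; it assumes current API names (llama_model_load_from_file, llama_model_free; older trees use llama_load_model_from_file / llama_free_model) and a placeholder model path.

// Sketch: query the RoPE type of a loaded model via llama.cpp's public API.
// "models/gemma-3.gguf" is a placeholder path, not a file shipped by the repo.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("models/gemma-3.gguf", mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // For the Gemma family this is expected to be LLAMA_ROPE_TYPE_NEOX.
    const enum llama_rope_type rt = llama_model_rope_type(model);
    std::printf("rope type = %d\n", (int) rt);

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
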
Author: Xuan-Son Nguyen