mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	
							
								
								
									
										145
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										145
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -4315,6 +4315,7 @@ static struct ggml_tensor * llm_build_kqv( | ||||
|           const llama_model & model, | ||||
|         const llama_hparams & hparams, | ||||
|        const llama_kv_cache & kv, | ||||
|          struct ggml_cgraph * graph, | ||||
|          struct ggml_tensor * wo, | ||||
|          struct ggml_tensor * wo_b, | ||||
|          struct ggml_tensor * q_cur, | ||||
| @@ -4393,6 +4394,8 @@ static struct ggml_tensor * llm_build_kqv( | ||||
|     struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); | ||||
|     cb(cur, "kqv_merged_cont", il); | ||||
|  | ||||
|     ggml_build_forward_expand(graph, cur); | ||||
|  | ||||
|     cur = ggml_mul_mat(ctx, wo, cur); | ||||
|     if (wo_b) { | ||||
|         cb(cur, "kqv_wo", il); | ||||
| @@ -4405,6 +4408,44 @@ static struct ggml_tensor * llm_build_kqv( | ||||
|     return cur; | ||||
| } | ||||
|  | ||||
| /* Convenience wrapper that performs the KV-cache store and the attention (KQV) build for one layer. | ||||
|  * | ||||
|  * Steps, in order: | ||||
|  *   1. k_cur, v_cur and q_cur are each passed to ggml_build_forward_expand(graph, ...). | ||||
|  *   2. llm_build_kv_store() is called with k_cur/v_cur, n_ctx, n_tokens and kv_head. | ||||
|  *   3. llm_build_kqv() is called with wo/wo_b, q_cur, kq_mask, n_ctx, n_tokens, n_kv, | ||||
|  *      max_alibi_bias and kq_scale; its result is reported to `cb` under the name "kqv_out". | ||||
|  * | ||||
|  * ctx, model, hparams, kv, graph, cb and il are forwarded unchanged to the helpers above. | ||||
|  * wo_b may be NULL (several callers pass NULL; llm_build_kqv checks it before use). | ||||
|  * | ||||
|  * Returns the tensor produced by llm_build_kqv(). | ||||
|  * | ||||
|  * NOTE(review): the call sites visible in this change also invoke cb(cur, "kqv_out", il) on the | ||||
|  * returned tensor, so the callback appears to run twice for the same tensor - confirm this is intended. | ||||
|  */ static struct ggml_tensor * llm_build_kv( | ||||
|         struct ggml_context * ctx, | ||||
|           const llama_model & model, | ||||
|         const llama_hparams & hparams, | ||||
|        const llama_kv_cache & kv, | ||||
|          struct ggml_cgraph * graph, | ||||
|          struct ggml_tensor * wo, | ||||
|          struct ggml_tensor * wo_b, | ||||
|          struct ggml_tensor * k_cur, | ||||
|          struct ggml_tensor * v_cur, | ||||
|          struct ggml_tensor * q_cur, | ||||
|          struct ggml_tensor * kq_mask, | ||||
|                     int64_t   n_ctx, | ||||
|                     int32_t   n_tokens, | ||||
|                     int32_t   kv_head, | ||||
|                     int32_t   n_kv, | ||||
|                     float     max_alibi_bias, | ||||
|                     float     kq_scale, | ||||
|          const llm_build_cb & cb, | ||||
|                     int       il) { | ||||
|  | ||||
|     // these nodes are added to the graph together so that they are not reordered | ||||
|     // by doing so, the number of splits in the graph is reduced | ||||
|     ggml_build_forward_expand(graph, k_cur); | ||||
|     ggml_build_forward_expand(graph, v_cur); | ||||
|     ggml_build_forward_expand(graph, q_cur); | ||||
|  | ||||
|     llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|     struct ggml_tensor * cur; | ||||
|     cur  = llm_build_kqv(ctx, model, hparams, kv, graph, | ||||
|             wo, wo_b, | ||||
|             q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il); | ||||
|     cb(cur, "kqv_out", il); | ||||
|  | ||||
|     return cur; | ||||
| } | ||||
|  | ||||
| struct llm_build_context { | ||||
|     const llama_model    & model; | ||||
|     const llama_hparams  & hparams; | ||||
| @@ -4562,12 +4603,6 @@ struct llm_build_context { | ||||
|                     cb(Vcur, "Vcur", il); | ||||
|                 } | ||||
|  | ||||
|                 // these nodes are added to the graph together so that they are not reordered | ||||
|                 // by doing so, the number of splits in the graph is reduced | ||||
|                 ggml_build_forward_expand(gf, Qcur); | ||||
|                 ggml_build_forward_expand(gf, Kcur); | ||||
|                 ggml_build_forward_expand(gf, Vcur); | ||||
|  | ||||
|                 Qcur = ggml_rope_custom( | ||||
|                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, | ||||
|                     hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, | ||||
| @@ -4582,11 +4617,9 @@ struct llm_build_context { | ||||
|                 ); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -4763,14 +4796,13 @@ struct llm_build_context { | ||||
|                 cb(Qcur, "Qcur", il); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 // apply ALiBi for 13B model | ||||
|                 const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f; | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -4892,11 +4924,9 @@ struct llm_build_context { | ||||
|                 ); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -4993,11 +5023,9 @@ struct llm_build_context { | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5200,12 +5228,9 @@ struct llm_build_context { | ||||
|                         ); | ||||
|                 cb(Vcur, "Vcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 // TODO: not tested, could be broken | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5292,11 +5317,9 @@ struct llm_build_context { | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 cb(Qcur, "Qcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5390,11 +5413,9 @@ struct llm_build_context { | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5485,11 +5506,9 @@ struct llm_build_context { | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5597,11 +5616,9 @@ struct llm_build_context { | ||||
|                 ); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5714,11 +5731,9 @@ struct llm_build_context { | ||||
|                 ); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5837,11 +5852,9 @@ struct llm_build_context { | ||||
|                 ); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -5966,11 +5979,9 @@ struct llm_build_context { | ||||
|                 ); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -6071,11 +6082,9 @@ struct llm_build_context { | ||||
|                         ext_factor, attn_factor, beta_fast, beta_slow); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, NULL, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|             struct ggml_tensor * sa_out = cur; | ||||
| @@ -6172,11 +6181,9 @@ struct llm_build_context { | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -6283,11 +6290,9 @@ struct llm_build_context { | ||||
|                 ); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); | ||||
|  | ||||
|                 cur = llm_build_kqv(ctx0, model, hparams, kv_self, | ||||
|                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, | ||||
|                         model.layers[il].wo, model.layers[il].bo, | ||||
|                         Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                         Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); | ||||
|                 cb(cur, "kqv_out", il); | ||||
|             } | ||||
|  | ||||
| @@ -6355,6 +6360,14 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|             ggml_set_name(cur, name); | ||||
|         } | ||||
|  | ||||
|  | ||||
|         if (!lctx.cparams.offload_kqv) { | ||||
|             if (strcmp(name, "kqv_merged_cont") == 0) { | ||||
|                 // all nodes between the KV store and the attention output are run on the CPU | ||||
|                 ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // | ||||
|         // allocate input tensors and set input data | ||||
|         // | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 slaren
					slaren