	llama : make starcoder graph build more consistent with others
Author: Georgi Gerganov

llama.cpp (98 changed lines)

--- a/llama.cpp
+++ b/llama.cpp
@@ -3446,7 +3446,9 @@ static struct ggml_cgraph * llm_build_starcoder(
     const int64_t n_layer     = hparams.n_layer;
     const int64_t n_ctx       = hparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
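The two new locals hook StarCoder up to the same grouped-query-attention (GQA) bookkeeping the other graph builders use. A minimal sketch of the relation between the quantities, assuming the hparams semantics used elsewhere in llama.cpp (the helper name below is hypothetical, for illustration only):

    // Assumed semantics, matching how the values are used in the hunks below:
    //   n_embd_head = n_embd / n_head          -- width of one attention head
    //   n_embd_gqa  = n_embd_head * n_head_kv  -- per-token width of cached K (or V)
    // StarCoder uses multi-query attention (n_head_kv == 1), so n_embd_gqa collapses
    // to n_embd_head and each layer's KV cache shrinks by a factor of n_head.
    static int64_t n_embd_gqa_of(int64_t n_embd, int64_t n_head, int64_t n_head_kv) {
        const int64_t n_embd_head = n_embd / n_head;
        return n_embd_head * n_head_kv;
    }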
@@ -3508,28 +3510,44 @@ static struct ggml_cgraph * llm_build_starcoder(
         position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
     }
 
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
 
     for (int il = 0; il < n_layer; ++il) {
         {
             // Norm
             cur = ggml_norm(ctx0, inpL, norm_eps);
             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
-
         }
 
         {
             // Self Attention
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
 
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
 
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
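Two changes in this hunk are worth spelling out. First, the attention scale becomes a single graph-allocated tensor (KQ_scale) filled once before the layer loop, replacing the per-layer ggml_new_f32 scalar. Second, V is now written into the cache transposed, so each embedding component occupies a contiguous row of n_ctx positions. A hypothetical offset calculator for one cached V element, matching the strides of the ggml_view_2d above (a sketch, not a function in llama.cpp):

    #include <stddef.h>
    #include <stdint.h>

    // Element offset of component e (0..n_embd_gqa-1) at cache position t
    // (0..n_ctx-1) in layer il of kv_self.v, under the transposed layout:
    // layer slab -> one row per embedding component -> one column per position.
    static size_t v_cache_offset(int il, int64_t e, int64_t t,
                                 int64_t n_ctx, int64_t n_embd_gqa) {
        return (size_t)(il*n_ctx*n_embd_gqa   // start of this layer's slab
                      + e*n_ctx               // row for embedding component e
                      + t);                   // column for cache position t
    }

Storing V transposed is what lets the later ggml_view_3d expose it as [n_past + N, n_embd_head, n_head_kv] without a permute-and-copy per layer.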
@@ -3541,56 +3559,62 @@ static struct ggml_cgraph * llm_build_starcoder(
                             Qcur,
                             ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
                         0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3); //TODO: need to be tiled
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
 
             // K * Q
-            // [n_past + N, N, 12]
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
 
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
 
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
 
+            // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
         }
 
         // Projection
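K keeps its token-major layout, so the new ggml_view_3d on kv_self.k merely reinterprets the first n_past + N cache entries of layer il as [n_embd_head, n_past + N, n_head_kv], replacing the old reshape-plus-permute (and its "TODO: need to be tiled"). The analogous element offset, again an illustrative sketch rather than real API:

    // Element offset of component e of KV head h at cache position t in layer il,
    // matching the ggml_view_3d strides on kv_self.k above (illustration only).
    static size_t k_cache_offset(int il, int64_t h, int64_t e, int64_t t,
                                 int64_t n_ctx, int64_t n_embd_head, int64_t n_embd_gqa) {
        return (size_t)(il*n_ctx*n_embd_gqa   // layer slab
                      + t*n_embd_gqa          // token-major: one row per position
                      + h*n_embd_head         // KV head within the row
                      + e);                   // component within the head
    }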
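As a sanity check on the hoisted scale: since n_embd_head == n_embd/n_head, the precomputed value is exactly the 1/sqrt(d_k) factor of scaled dot-product attention. A standalone check using hypothetical StarCoder-15B-like dimensions (n_embd = 6144, n_head = 48; values assumed for illustration):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int n_embd = 6144, n_head = 48;            // illustrative dimensions
        const float scale = 1.0f/sqrtf((float) n_embd/n_head);
        printf("n_embd_head = %d, 1/sqrt(n_embd_head) = %.6f\n",
               n_embd/n_head, scale);                    // prints 128, ~0.088388
        return 0;
    }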