llama : make starcoder graph build more consistent with others
Author: Georgi Gerganov

llama.cpp: 98 changed lines
@@ -3446,7 +3446,9 @@ static struct ggml_cgraph * llm_build_starcoder(
     const int64_t n_layer     = hparams.n_layer;
     const int64_t n_ctx       = hparams.n_ctx;
     const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -3508,28 +3510,44 @@ static struct ggml_cgraph * llm_build_starcoder(
         position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
     }
 
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
 
     for (int il = 0; il < n_layer; ++il) {
         {
             // Norm
             cur = ggml_norm(ctx0, inpL, norm_eps);
             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
-
         }
 
         {
             // Self Attention
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
 
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
 
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -3541,56 +3559,62 @@ static struct ggml_cgraph * llm_build_starcoder(
                             Qcur,
                             ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
                         0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3); //TODO: need to be tiled
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
 
             // K * Q
-            // [n_past + N, N, 12]
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
 
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
 
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
         }
 
         // Projection
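Note on the KQ_scale pattern introduced above: the attention scale is no longer created with ggml_new_f32 inside the layer loop, but is allocated once through the graph allocator and written only when the allocator is not in measure mode, matching the other graph builders. The snippet below is not part of the commit; it is a minimal sketch of that pattern assuming the ggml/ggml-alloc API of this period, and the helper name demo_kq_scale is invented for illustration.

    #include <math.h>
    #include "ggml.h"
    #include "ggml-alloc.h"

    // Build the 1/sqrt(n_embd_head) scale tensor the same way the new graph code does:
    // reserve it through the allocator, and only write its data outside the measure pass,
    // when a real buffer is available.
    static struct ggml_tensor * demo_kq_scale(struct ggml_context * ctx0, struct ggml_allocr * alloc, int n_embd, int n_head) {
        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
        ggml_allocr_alloc(alloc, KQ_scale);
        if (!ggml_allocr_is_measure(alloc)) {
            ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
        }
        ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
        return KQ_scale;
    }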