starcoder : add GPU offloading (#3827)

Author: Georgi Gerganov

* starcoder : do not GPU split 1D bias tensors

* starcoder : offload layers to GPU

ggml-ci
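This follows the offloading pattern already used by the other architectures in llama.cpp: during graph build, every intermediate tensor is passed through an "offload function" that is either a no-op (CPU path) or marks the tensor for GPU placement. Below is a minimal sketch of that pattern, assuming the symbols that appear in the diff (offload_func_t, llama_nop, ggml_cuda_assign_buffers_no_alloc); the helper pick_offload_func is hypothetical, added here only to illustrate the selection logic, which the real code inlines per layer:

    #include "ggml.h"
    #ifdef GGML_USE_CUBLAS
    #include "ggml-cuda.h" // declares ggml_cuda_assign_buffers_no_alloc
    #endif

    // Signature of an offload function: it receives a freshly created
    // graph tensor and may tag its output for GPU placement.
    typedef void (*offload_func_t)(struct ggml_tensor * tensor);

    // No-op used when the tensor should stay on the CPU.
    static void llama_nop(struct ggml_tensor * tensor) {
        (void) tensor;
    }

    // Hypothetical helper: layers with index >= i_gpu_start are built
    // with the CUDA offload function, everything below stays on the CPU.
    static offload_func_t pick_offload_func(int il, int i_gpu_start) {
    #ifdef GGML_USE_CUBLAS
        if (il >= i_gpu_start) {
            return ggml_cuda_assign_buffers_no_alloc;
        }
    #endif
        return llama_nop;
    }

In the diff, i_gpu_start = n_layer - n_gpu_layers, so a larger n_gpu_layers moves the start of the offloaded range toward layer 0.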
1 changed file: llama.cpp (106 changed lines)
@@ -2695,8 +2695,8 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_STARCODER:
                 {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
-                    model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},             GGML_BACKEND_CPU);
+                    model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"),   {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
 
                     // output
                     {
@@ -2747,19 +2747,19 @@ static void llm_load_tensors(
                         layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, backend);
 
                         layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend);
 
                         layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},   backend_split);
-                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},           backend_split);
+                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},           backend);
 
                         layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                         layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
 
                         layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                        layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);
+                        layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);
 
                         layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
-                        layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend_split);
+                        layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend);
 
                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
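(Note on the bias changes above: backend_split row-splits a tensor across the available GPUs, which is only meaningful for 2D weight matrices; as the commit title says, the 1D bias vectors are no longer split and are placed on the regular per-layer backend instead.)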
@@ -4616,6 +4616,8 @@ static struct ggml_cgraph * llm_build_starcoder(
 
     const float norm_eps = hparams.f_norm_eps;
 
+    const int n_gpu_layers = model.n_gpu_layers;
+
     const int32_t n_tokens = batch.n_tokens;
     const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
@@ -4660,6 +4662,27 @@ static struct ggml_cgraph * llm_build_starcoder(
         }
     }
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
     {
         // Compute position embeddings.
         struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
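(Reading the thresholds above: the repeating layers are offloaded first, so n_gpu_layers values beyond n_layer enable the extra groups one at a time: n_layer + 1 also offloads the non-repeating tensors (offload_func_nr), n_layer + 2 the V-related tensors, and n_layer + 3 the KQ-related tensors. For a 40-layer StarCoder model, n_gpu_layers = 43 would offload everything this graph can offload.)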
@@ -4685,6 +4708,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_set_name(KQ_mask, "KQ_mask");
+    offload_func_kq(KQ_mask);
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
@@ -4708,44 +4732,67 @@ static struct ggml_cgraph * llm_build_starcoder(
     ggml_set_name(inpL, "inpL");
 
     for (int il = 0; il < n_layer; ++il) {
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
         {
             // Norm
             cur = ggml_norm(ctx0, inpL, norm_eps);
+            offload_func(cur);
+
             cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+            offload_func(cur);
         }
 
         {
             // Self Attention
-            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+            offload_func_kq(cur);
+
+            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+            offload_func_kq(cur);
 
-            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
-            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+            struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+            struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+            struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-            struct ggml_tensor * Qcur = tmpq;
+            ggml_set_name(tmpq, "tmpq");
+            ggml_set_name(tmpk, "tmpk");
+            ggml_set_name(tmpv, "tmpv");
+
+            offload_func_kq(tmpq);
+            offload_func_kq(tmpk);
+            offload_func_v (tmpv);
+
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
             struct ggml_tensor * Kcur = tmpk;
 
             {
-                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
+                offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
                         (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
             }
 
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
-                        0, 2, 1, 3);
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
             ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
@@ -4754,23 +4801,28 @@ static struct ggml_cgraph * llm_build_starcoder(
                         ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_head,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
             ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
             // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            offload_func_kq(KQ_masked);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
@@ -4783,22 +4835,25 @@ static struct ggml_cgraph * llm_build_starcoder(
             ggml_set_name(V, "V");
 
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
             ggml_set_name(KQV, "KQV");
 
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
             ggml_set_name(KQV_merged, "KQV_merged");
 
-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
             ggml_set_name(cur, "KQV_merged_contiguous");
         }
 
         // Projection
         cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+        offload_func(cur);
 
         // Add the input
         cur = ggml_add(ctx0, cur, inpL);
+        offload_func(cur);
 
         struct ggml_tensor * inpFF = cur;
 
@@ -4807,27 +4862,36 @@ static struct ggml_cgraph * llm_build_starcoder(
             // Norm
             {
                 cur = ggml_norm(ctx0, inpFF, norm_eps);
+                offload_func_nr(cur);
+
                 cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+                offload_func_nr(cur);
             }
 
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+            offload_func(cur);
+
             // GELU activation
             cur = ggml_gelu(ctx0, cur);
+            offload_func(cur);
+
             // Projection
             cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+            offload_func(cur);
         }
 
         inpL = ggml_add(ctx0, cur, inpFF);
     }
 
     // Output Norm
     {
         cur = ggml_norm(ctx0, inpL, norm_eps);
+        offload_func_nr(cur);
 
         cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+        ggml_set_name(cur, "result_norm");
     }
-    ggml_set_name(cur, "result_norm");
 
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");
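Usage note (illustrative invocation; the model path is a placeholder): with this change, StarCoder models respect the same -ngl / --n-gpu-layers flag as the other architectures, e.g.

    ./main -m models/starcoder-15b.Q4_0.gguf -p "def fib(n):" -ngl 43

which sets the model.n_gpu_layers value used in llm_build_starcoder above.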