Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	gptneox-main.cpp : n_layer --> n_block

1 changed file: gptneox-main.cpp (135 changed lines)

@@ -24,13 +24,13 @@ struct gpt_neox_hparams {
     uint32_t n_ctx    = 0;
     uint32_t n_embd   = 0;
     uint32_t n_head   = 0;
-    uint32_t n_layer  = 0;
+    uint32_t n_block  = 0;
     uint32_t n_rot    = 0; // rotary_pct * (n_embd / n_head)
     bool par_res = true;
     float norm_eps = 1e-5;
 };
 
-struct gpt_neox_layer {
+struct gpt_neox_block {
     // pre normalization
     struct ggml_tensor * ln_1_g;
     struct ggml_tensor * ln_1_b;
@@ -65,7 +65,7 @@ struct gpt_neox_model {
 
     struct ggml_tensor * lmh_g; // language model head
 
-    std::vector<gpt_neox_layer> layers;
+    std::vector<gpt_neox_block> blocks;
 
     // key + value memory
     struct ggml_tensor * memory_k;
@@ -415,7 +415,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
                   if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
 
         if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.layer_count");
-                  if (keyidx != -1) { hparams.n_layer = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+                  if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
 
         if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.rope.dimension_count");
                   if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
@@ -434,7 +434,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
         printf("%s: n_ctx    = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd   = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head   = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer  = %d\n", __func__, hparams.n_layer);
+        printf("%s: n_block  = %d\n", __func__, hparams.n_block);
         printf("%s: n_rot    = %d\n", __func__, hparams.n_rot);
         printf("%s: par_res  = %d\n", __func__, hparams.par_res);
         printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps);
@@ -545,9 +545,9 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
 
     // prepare memory for the weights
     {
-        const int n_layer = model.hparams.n_layer;
+        const int n_block = model.hparams.n_block;
 
-        model.layers.resize(n_layer);
+        model.blocks.resize(n_block);
 
         model.wte    = ggml_get_tensor(ctx, "transformer.token_embd.weight");
         model.ln_f_g = ggml_get_tensor(ctx, "transformer.output_norm.weight");
@@ -560,47 +560,47 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
         model.tensors["transformer.output_norm.bias"]   = model.ln_f_b;
         model.tensors["transformer.output.weight"] = model.lmh_g;
 
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
+        for (int i = 0; i < n_block; ++i) {
+            auto & block = model.blocks[i];
 
             std::string blocknamestart = "transformer.blocks." + std::to_string(i) + ".";
 
-            layer.ln_1_g          = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" );
-            layer.ln_1_b          = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" );
+            block.ln_1_g          = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" );
+            block.ln_1_b          = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" );
 
-            layer.c_attn_attn_w   = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" );
-            layer.c_attn_attn_b   = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" );
+            block.c_attn_attn_w   = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" );
+            block.c_attn_attn_b   = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" );
 
-            layer.c_attn_proj_w   = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" );
-            layer.c_attn_proj_b   = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" );
+            block.c_attn_proj_w   = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" );
+            block.c_attn_proj_b   = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" );
 
-            layer.ln_2_g          = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" );
-            layer.ln_2_b          = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias");
+            block.ln_2_g          = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" );
+            block.ln_2_b          = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias");
 
-            layer.c_mlp_fc_w      = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" );
-            layer.c_mlp_fc_b      = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" );
+            block.c_mlp_fc_w      = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" );
+            block.c_mlp_fc_b      = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" );
 
-            layer.c_mlp_proj_w    = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" );
-            layer.c_mlp_proj_b    = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" );
+            block.c_mlp_proj_w    = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" );
+            block.c_mlp_proj_b    = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" );
 
             // map by name
-            model.tensors[blocknamestart + "attn_norm.weight"] = layer.ln_1_g;
-            model.tensors[blocknamestart + "attn_norm.bias"]   = layer.ln_1_b;
+            model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g;
+            model.tensors[blocknamestart + "attn_norm.bias"]   = block.ln_1_b;
 
-            model.tensors[blocknamestart + "attn_qkv.weight"] = layer.c_attn_attn_w;
-            model.tensors[blocknamestart + "attn_qkv.bias"]   = layer.c_attn_attn_b;
+            model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w;
+            model.tensors[blocknamestart + "attn_qkv.bias"]   = block.c_attn_attn_b;
 
-            model.tensors[blocknamestart + "attn_output.weight"] = layer.c_attn_proj_w;
-            model.tensors[blocknamestart + "attn_output.bias"]   = layer.c_attn_proj_b;
+            model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w;
+            model.tensors[blocknamestart + "attn_output.bias"]   = block.c_attn_proj_b;
 
-            model.tensors[blocknamestart + "ffn_norm.weight"] = layer.ln_2_g;
-            model.tensors[blocknamestart + "ffn_norm.bias"]   = layer.ln_2_b;
+            model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g;
+            model.tensors[blocknamestart + "ffn_norm.bias"]   = block.ln_2_b;
 
-            model.tensors[blocknamestart + "ffn_up.weight"] = layer.c_mlp_fc_w;
-            model.tensors[blocknamestart + "ffn_up.bias"]   = layer.c_mlp_fc_b;
+            model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w;
+            model.tensors[blocknamestart + "ffn_up.bias"]   = block.c_mlp_fc_b;
 
-            model.tensors[blocknamestart + "ffn_down.weight"] = layer.c_mlp_proj_w;
-            model.tensors[blocknamestart + "ffn_down.bias"]   = layer.c_mlp_proj_b;
+            model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w;
+            model.tensors[blocknamestart + "ffn_down.bias"]   = block.c_mlp_proj_b;
         }
     }
 
@@ -610,10 +610,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
         const auto & hparams = model.hparams;
 
         const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
+        const int n_block = hparams.n_block;
         const int n_ctx   = hparams.n_ctx;
 
-        const int64_t n_mem      = n_layer*n_ctx;
+        const int64_t n_mem      = n_block*n_ctx;
         const int64_t n_elements = n_embd*n_mem;
 
         // create the ggml context
@@ -647,37 +647,23 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
 
 // feed-forward network
 ggml_tensor * gpt_neox_ff(
-        const gpt_neox_layer &layer,
+        const gpt_neox_block &block,
         ggml_context * ctx0,
         ggml_tensor * inp) {
     ggml_tensor * cur = ggml_norm(ctx0, inp);
 
-    cur = ggml_add(ctx0,
-        ggml_mul(ctx0,
-            ggml_repeat(ctx0, layer.ln_2_g, cur),
-            cur),
-        ggml_repeat(ctx0, layer.ln_2_b, cur));
-
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_fc_w,
-            cur);
-
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
-            cur);
+    cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
+    cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
+    cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur);
 
     // GELU activation
     cur = ggml_gelu(ctx0, cur);
 
     // projection
     // cur = proj_w*cur + proj_b
-    cur = ggml_mul_mat(ctx0,
-            layer.c_mlp_proj_w,
-            cur);
+    cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur);
 
-    cur = ggml_add(ctx0,
-            ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
-            cur);
+    cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur);
     return cur;
 }
 
@@ -701,7 +687,7 @@ bool gpt_neox_eval(
     const auto & hparams = model.hparams;
 
     const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
+    const int n_block = hparams.n_block;
     const int n_ctx   = hparams.n_ctx;
     const int n_head  = hparams.n_head;
     const int n_vocab = hparams.n_vocab;
@@ -747,7 +733,7 @@ bool gpt_neox_eval(
     // wte
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
 
-    for (int il = 0; il < n_layer; ++il) {
+    for (int il = 0; il < n_block; ++il) {
         struct ggml_tensor * cur;
 
         ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
@@ -758,22 +744,15 @@ bool gpt_neox_eval(
                 cur = ggml_norm(ctx0, inpL);
 
                 cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+                        ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
+                        ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur));
             }
 
             // compute QKV
             {
-
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_attn_w,
-                        cur);
-
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                        cur);
+                cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_attn_w, cur);
+                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur);
             }
 
             struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
@@ -798,10 +777,7 @@ bool gpt_neox_eval(
             }
 
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
 
             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
@@ -842,17 +818,12 @@ bool gpt_neox_eval(
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
 
             // projection
             {
-                cur = ggml_mul_mat(ctx0,
-                        model.layers[il].c_attn_proj_w,
-                        cur);
-
-                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
+                cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur);
+                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur);
             }
         }
 
@@ -861,7 +832,7 @@ bool gpt_neox_eval(
         if (hparams.par_res == 0) {
            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
 
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpFF);
+            cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
 
             // input for next layer
             inpL = ggml_add(ctx0, cur, inpFF);
@@ -870,7 +841,7 @@ bool gpt_neox_eval(
 
             // this is independent of the self-attention result, so it could be done in parallel to the self-attention
             // note here we pass inpL instead of cur
-            cur = gpt_neox_ff(model.layers[il], ctx0, inpL);
+            cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
 
             // layer input + FF
             cur  = ggml_add(ctx0, cur, inpFF);
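
Note on the loader changes: every hyperparameter read in gpt_neox_model_load follows the same gguf lookup pattern, and the commit only renames the C++ destination field; the GGUF key itself is still "gptneox.layer_count". A minimal sketch of that pattern, assuming a valid gguf_context for an already-opened model file (read_block_count is a hypothetical helper, not a function from the file, and the loader's `ok` flag handling is simplified):

// Sketch of the gguf lookup pattern used in the diff above.
static uint32_t read_block_count(struct gguf_context * ggufctx, bool & ok) {
    const int keyidx = gguf_find_key(ggufctx, "gptneox.layer_count");
    if (keyidx == -1) {
        ok = false;  // missing key: the loader treats this as a fatal error
        return 0;
    }
    return gguf_get_val_u32(ggufctx, keyidx);
}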
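
The KV cache sizing in the @@ -610 hunk is n_mem = n_block*n_ctx slots of n_embd elements each, allocated once for memory_k and once for memory_v. A worked example, assuming the GPT-NeoX-20B shape (n_block = 44, n_embd = 6144) at the full n_ctx = 2048; the numbers are illustrative, not from the diff:

// n_mem      = n_block * n_ctx = 44 * 2048     =      90,112
// n_elements = n_embd  * n_mem = 6144 * 90,112 = 553,648,128
// memory_k and memory_v each hold n_elements values, so at 2 bytes per
// element (assuming F16 storage) the cache costs about
// 2 * 553,648,128 * 2 B ≈ 2.1 GiB.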
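
The last two hunks touch both arms of the par_res branch in gpt_neox_eval. A sketch of the two residual layouts using the names from the code above; attn_out stands for the attention output held in `cur`, and the final line of the parallel arm is inferred from the surrounding code rather than shown in the diff:

// Sequential residual (par_res == 0): the FF block consumes attention + input.
struct ggml_tensor * inpFF = ggml_add(ctx0, attn_out, inpL); // attn_out + x
cur  = gpt_neox_ff(model.blocks[il], ctx0, inpFF);           // ff(attn_out + x)
inpL = ggml_add(ctx0, cur, inpFF);                           // input for next block

// Parallel residual (par_res != 0): the FF block sees only the block input,
// so it could run in parallel with self-attention.
cur  = gpt_neox_ff(model.blocks[il], ctx0, inpL);            // ff(x)
cur  = ggml_add(ctx0, cur, attn_out);                        // ff(x) + attn_out
inpL = ggml_add(ctx0, cur, inpL);                            // ff(x) + attn_out + x (inferred)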

Author: klosax