Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
			
		
		
		
	llama : remove ggml_cont where possible (#14568)
This commit is contained in:
		| @@ -5670,12 +5670,10 @@ struct llm_build_falcon : public llm_graph_context { | ||||
|                 cur = build_lora_mm(model.layers[il].wqkv, cur); | ||||
|                 cb(cur, "wqkv", il); | ||||
|  | ||||
|                 ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 // using mode = 2 for neox mode | ||||
| @@ -5952,12 +5950,10 @@ struct llm_build_dbrx : public llm_graph_context { | ||||
|                 cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); | ||||
|                 cb(cur, "wqkv_clamped", il); | ||||
|  | ||||
|                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 Qcur = ggml_rope_ext( | ||||
| @@ -6468,12 +6464,10 @@ struct llm_build_neo_bert : public llm_graph_context { | ||||
|                 cur = build_lora_mm(model.layers[il].wqkv, cur); | ||||
|                 cb(cur, "wqkv", il); | ||||
|  | ||||
|                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                 Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                 Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 // RoPE | ||||
| @@ -6703,8 +6697,8 @@ struct llm_build_mpt : public llm_graph_context { | ||||
|                     cb(cur, "wqkv_clamped", il); | ||||
|                 } | ||||
|  | ||||
|                 ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|  | ||||
|                 cb(Qcur, "Qcur", il); | ||||
| @@ -6724,6 +6718,12 @@ struct llm_build_mpt : public llm_graph_context { | ||||
|                             model.layers[il].attn_k_norm_b, | ||||
|                             LLM_NORM, il); | ||||
|                     cb(Kcur, "Kcur", il); | ||||
|                 } else { | ||||
|                     Qcur = ggml_cont(ctx0, Qcur); | ||||
|                     cb(Qcur, "Qcur", il); | ||||
|  | ||||
|                     Kcur = ggml_cont(ctx0, Kcur); | ||||
|                     cb(Kcur, "Kcur", il); | ||||
|                 } | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
| @@ -6978,12 +6978,10 @@ struct llm_build_qwen : public llm_graph_context { | ||||
|                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv); | ||||
|                 cb(cur, "bqkv", il); | ||||
|  | ||||
|                 ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,   n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd))); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 // using mode = 2 for neox mode | ||||
| @@ -7748,21 +7746,21 @@ struct llm_build_phi2 : public llm_graph_context { | ||||
|                     cur = ggml_add(ctx0, cur, model.layers[il].bqkv); | ||||
|                     cb(cur, "bqkv", il); | ||||
|  | ||||
|                     Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                     Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                     Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                     Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|                 } else { | ||||
|                     Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); | ||||
|                     Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); | ||||
|                     Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); | ||||
|                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 } | ||||
|  | ||||
|                 cb(Qcur, "Qcur", il); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|                 cb(Vcur, "Vcur", il); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 Qcur = ggml_rope_ext( | ||||
| @@ -7886,21 +7884,21 @@ struct llm_build_phi3 : public llm_graph_context { | ||||
|                     cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output); | ||||
|                     cb(cur, "wqkv", il); | ||||
|  | ||||
|                     Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); | ||||
|                     Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); | ||||
|                     Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head * sizeof(float), cur->nb[1], 0 * sizeof(float) * (n_embd)); | ||||
|                     Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), cur->nb[1], 1 * sizeof(float) * (n_embd)); | ||||
|                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); | ||||
|                 } else { | ||||
|                     Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq); | ||||
|                     Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk); | ||||
|                     Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv); | ||||
|                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 } | ||||
|  | ||||
|                 cb(Qcur, "Qcur", il); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|                 cb(Vcur, "Vcur", il); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 Qcur = ggml_rope_ext( | ||||
| @@ -8256,12 +8254,10 @@ struct llm_build_codeshell : public llm_graph_context { | ||||
|                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv); | ||||
|                 cb(cur, "bqkv", il); | ||||
|  | ||||
|                 ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 Qcur = ggml_rope_ext( | ||||
| @@ -8677,8 +8673,6 @@ struct llm_build_minicpm3 : public llm_graph_context { | ||||
|                         ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); | ||||
|                 cb(k_pe, "k_pe", il); | ||||
|  | ||||
|                 // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont | ||||
|                 kv_compressed = ggml_cont(ctx0, kv_compressed); | ||||
|                 kv_compressed = build_norm(kv_compressed, | ||||
|                         model.layers[il].attn_kv_a_norm, NULL, | ||||
|                         LLM_NORM_RMS, il); | ||||
| @@ -8710,7 +8704,6 @@ struct llm_build_minicpm3 : public llm_graph_context { | ||||
|                         0); | ||||
|                 cb(v_states, "v_states", il); | ||||
|  | ||||
|                 q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this | ||||
|                 q_pe = ggml_rope_ext( | ||||
|                         ctx0, q_pe, inp_pos, rope_factors, | ||||
|                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, | ||||
| @@ -8719,7 +8712,6 @@ struct llm_build_minicpm3 : public llm_graph_context { | ||||
|                 cb(q_pe, "q_pe", il); | ||||
|  | ||||
|                 // shared RoPE key | ||||
|                 k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this | ||||
|                 k_pe = ggml_rope_ext( | ||||
|                         ctx0, k_pe, inp_pos, rope_factors, | ||||
|                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, | ||||
| @@ -10784,10 +10776,10 @@ struct llm_build_openelm : public llm_graph_context { | ||||
|  | ||||
|                 cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); | ||||
|  | ||||
|                 ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, cur->nb[1], cur->nb[2], 0)); | ||||
|                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, cur->nb[1], cur->nb[2], 0); | ||||
|                 cb(Qcur, "Qcur", il); | ||||
|  | ||||
|                 ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); | ||||
|                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head); | ||||
|                 cb(Kcur, "Kcur", il); | ||||
|  | ||||
|                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); | ||||
| @@ -10909,12 +10901,10 @@ struct llm_build_gptneox : public llm_graph_context { | ||||
|                 cur = ggml_add(ctx0, cur, model.layers[il].bqkv); | ||||
|                 cb(cur, "bqkv", il); | ||||
|  | ||||
|                 ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                 ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                 ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 Qcur = ggml_rope_ext( | ||||
| @@ -12159,6 +12149,8 @@ struct llm_build_chatglm : public llm_graph_context { | ||||
|                     if (model.layers[il].bv) { | ||||
|                         Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); | ||||
|                     } | ||||
|                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 } else { | ||||
|                     cur = build_lora_mm(model.layers[il].wqkv, cur); | ||||
|                     cb(cur, "wqkv", il); | ||||
| @@ -12166,13 +12158,11 @@ struct llm_build_chatglm : public llm_graph_context { | ||||
|                         cur = ggml_add(ctx0, cur, model.layers[il].bqkv); | ||||
|                         cb(cur, "bqkv", il); | ||||
|                     } | ||||
|                     Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                     Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                     Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                     Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|                 } | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor); | ||||
| @@ -12293,6 +12283,8 @@ struct llm_build_glm4 : public llm_graph_context { | ||||
|                     if (model.layers[il].bv) { | ||||
|                         Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); | ||||
|                     } | ||||
|                     Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 } else { | ||||
|                     cur = build_lora_mm(model.layers[il].wqkv, cur); | ||||
|                     cb(cur, "wqkv", il); | ||||
| @@ -12300,13 +12292,11 @@ struct llm_build_glm4 : public llm_graph_context { | ||||
|                         cur = ggml_add(ctx0, cur, model.layers[il].bqkv); | ||||
|                         cb(cur, "bqkv", il); | ||||
|                     } | ||||
|                     Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); | ||||
|                     Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); | ||||
|                     Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); | ||||
|                     Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); | ||||
|                     Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); | ||||
|                 } | ||||
|  | ||||
|                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens); | ||||
|                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); | ||||
|                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); | ||||
|  | ||||
|                 Qcur = ggml_rope_ext( | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Sigbjørn Skjæret
					Sigbjørn Skjæret