Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	llama : add phi3 128K model support (#7225)
* add phi3 128k support in convert-hf-to-gguf
* add phi3 128k support in cuda
* address build warnings on llama.cpp
* adjust index value in cuda long rope freq factors
* add long rope support in ggml cpu backend
* make freq factors only depend on ctx size
* remove unused rope scaling type 'su' from gguf converter
* fix lint warnings on convert-hf-to-gguf.py
* set to the short freq factor when context size is smaller than trained context size
* add one line of comments
* metal : support rope freq_factors
* ggml : update ggml_rope_ext API to support freq. factors
* backends : add dev messages to support rope freq. factors
* minor : style
* tests : update to use new rope API
* backends : fix pragma semicolons
* minor : cleanup
* llama : move rope factors from KV header to tensors
* llama : remove tmp assert
* cuda : fix compile warning
* convert : read/write n_head_kv
* llama : fix uninitialized tensors

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
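The core API change is that ggml_rope_ext takes one extra tensor argument compared to the old ggml_rope_custom: an optional vector of per-dimension frequency factors (pass NULL to keep the previous behavior). Below is a minimal sketch of a call site that supplies a factors tensor, assuming the argument order visible in the diff that follows; the variable names are illustrative, not taken from the commit:

    // Sketch only. freq_factors holds n_rot/2 values, one scale per
    // rotated dimension pair; in llama.cpp it is loaded from GGUF metadata.
    struct ggml_tensor * freq_factors =
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_rot/2);

    // Same arguments as ggml_rope_custom, plus the factors tensor in
    // fourth position; passing NULL here reproduces the old behavior.
    cur = ggml_rope_ext(
        ctx, cur, inp_pos, freq_factors,
        n_rot, /*mode*/ 0, /*n_ctx*/ 0, n_orig_ctx,
        freq_base, freq_scale, ext_factor, attn_factor,
        beta_fast, beta_slow);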
@@ -1763,14 +1763,14 @@ struct test_llama : public test_llm {
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);

-                Qcur = ggml_rope_custom(
-                    ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head,    hp.n_tokens), inp_pos, nullptr,
                     hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );

-                Kcur = ggml_rope_custom(
-                    ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
                     hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -1889,13 +1889,13 @@ struct test_falcon : public test_llm {
                 Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);

                 // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );

-                Kcur = ggml_rope_custom(
-                    ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
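One bullet in the commit message notes that the loader falls back to the short frequency factors when the configured context size does not exceed the model's trained context size. A rough sketch of that selection, using hypothetical names for the per-layer rope factor tensors the commit moves out of the KV header into regular model tensors:

    #include <stdint.h>

    struct ggml_tensor;  // opaque; declared in ggml.h in the real code

    // Hypothetical container; llama.cpp keeps equivalent state internally.
    struct rope_state {
        uint32_t             n_ctx_orig_train;  // trained context size
        struct ggml_tensor * rope_short;        // short-context freq factors
        struct ggml_tensor * rope_long;         // long-context (128K) freq factors
    };

    // Use the long factors only when the requested context exceeds the
    // trained context; otherwise stay on the short factors.
    static struct ggml_tensor * rope_factors(const struct rope_state * s,
                                             uint32_t n_ctx) {
        return (n_ctx > s->n_ctx_orig_train) ? s->rope_long : s->rope_short;
    }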
Author: liuwei-git