	llm : add Falcon support (#2717)
* llama : refactor GGUF constants into static maps
* llama : check if model architecture is known
* llama : refactor llama_model_load_internal()
* gguf : add KV constant maps
* llm : read arch-specific KVs
* convert : add dummy scores + types
* falcon : load tensor data (CPU only)
* llama : fix loading progress bar
* llama : add arch member to llama_model
* falcon : CPU inference working
* falcon : support non-40B models
* falcon : minor
* llama : minor updates ggml-ci
* convert-falcon-hf-to-gguf.py : fix special token mapping
* llama.cpp : llama default UNK token = id 0
* llama.cpp : fix bpe tokenizer
* llama.cpp : fix the fix of bpe tokenizer
* ggml : pass eps to ggml_norm
* metal : implement RoPE (mode = 2) + avoid ggml_repeat
* ggml : ggml_repeat always creates new tensor
* falcon : copy-paste self-attention from LLaMA
* metal : print extra compute pipeline info
* falcon : minor changes (still chasing the Metal problem)
* llama.cpp : fix linefeed token
* metal : fix GELU kernel numerical stability by using precise::tanh
* metal : temporary workaround for the concurrency optimization bug
* falcon : add CUDA offloading (#2739)
* llama : better model naming and size reporting
* llama : prep new tokenizer support
* llama : advanced BPE tokenizer based on ggllm.cpp implementation
* llama : remove obsolete comment ggml-ci
* common : remove obsolete BPE API + disable test-tokenizer-1
* llama : revert BPE special-case in llama_byte_to_token()
* cuda : add TODOs for RoPE NeoX implementation
* llama : default special tokens based on vocab type
* perplexity : add log for start of tokenization

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: slaren <slarengh@gmail.com>
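Among the ggml-level changes, the most visible API break is that ggml_norm() and ggml_norm_inplace() now take the normalization epsilon as an explicit argument instead of the kernel hard-coding 1e-5f (see the ggml.c diff below). A minimal caller-side sketch of the new signature, assuming the ggml API as of this commit; the context setup, graph build, and compute calls are illustrative glue and not part of the diff:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // small scratch context; the size is chosen arbitrarily for this sketch
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // an 8-element f32 tensor to normalize
        struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8), 1.0f);

        // eps is now passed explicitly instead of being fixed to 1e-5f inside ggml
        const float eps = 1e-5f;
        struct ggml_tensor * y = ggml_norm(ctx, x, eps);

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/1);

        printf("y[0] = %f\n", ggml_get_f32_1d(y, 0));
        ggml_free(ctx);
        return 0;
    }

Callers in llama.cpp are expected to pass the model's own layer-norm epsilon here rather than relying on a default baked into ggml.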
ggml.c (30 changed lines)
@@ -3554,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 
-static const float GELU_COEF_A    = 0.044715f;
-static const float GELU_QUICK_COEF    = -1.702f;
-static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
 
 inline static float ggml_gelu_f32(float x) {
     return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@@ -5555,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
         is_node = true;
     }
 
-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
     struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
 
     result->op   = GGML_OP_REPEAT;
@@ -5789,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
 static struct ggml_tensor * ggml_norm_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
+        float eps,
         bool inplace) {
     bool is_node = false;
 
@@ -5799,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    // TODO: maybe store epsilon here?
+    ggml_set_op_params(result, &eps, sizeof(eps));
 
     result->op   = GGML_OP_NORM;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5810,14 +5807,16 @@ static struct ggml_tensor * ggml_norm_impl(
 
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_norm_impl(ctx, a, false);
+        struct ggml_tensor  * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, false);
 }
 
 struct ggml_tensor * ggml_norm_inplace(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_norm_impl(ctx, a, true);
+        struct ggml_tensor  * a,
+        float eps) {
+    return ggml_norm_impl(ctx, a, eps, true);
 }
 
 // ggml_rms_norm
@@ -10619,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(
 
     GGML_TENSOR_UNARY_OP_LOCALS;
 
-    const float eps = 1e-5f; // TODO: make this a parameter
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
 
     // TODO: optimize
    for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
                         dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                     }
                 } else {
-                    // TODO: this is probably wrong, but I can't figure it out ..
+                    // TODO: this might be wrong for ne0 != n_dims - need double check
                     // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
                     for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                         for (int64_t ic = 0; ic < n_dims; ic += 2) {
@@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
                         dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                     }
                 } else {
-                    // TODO: this is probably wrong, but I can't figure it out ..
+                    // TODO: this might be wrong for ne0 != n_dims - need double check
                     // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
                     for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                         for (int64_t ic = 0; ic < n_dims; ic += 2) {
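The epsilon travels with the op itself: ggml_norm_impl() copies it into the tensor's op_params via ggml_set_op_params(), and ggml_compute_forward_norm_f32() memcpy's it back out at compute time, replacing the old hard-coded const float eps = 1e-5f. The standalone sketch below mirrors that store/read round-trip with a plain byte buffer; the struct and helper names are illustrative stand-ins rather than ggml's own:

    #include <stdio.h>
    #include <string.h>

    // illustrative stand-in for the fixed-size op_params buffer on a ggml tensor
    struct fake_op {
        char op_params[32];
    };

    // mirrors the role of ggml_set_op_params(): raw-copy a parameter blob into the op
    static void set_op_params(struct fake_op * op, const void * params, size_t size) {
        memcpy(op->op_params, params, size);
    }

    int main(void) {
        struct fake_op norm_op = {0};

        // graph-build time: stash eps alongside the op (ggml_norm_impl in the diff)
        const float eps_in = 1e-5f;
        set_op_params(&norm_op, &eps_in, sizeof(eps_in));

        // compute time: read it back (ggml_compute_forward_norm_f32 in the diff)
        float eps_out;
        memcpy(&eps_out, norm_op.op_params, sizeof(float));

        printf("eps = %g\n", eps_out);
        return 0;
    }

Keeping the value on the op is what lets it survive from graph construction to the worker threads that later execute GGML_OP_NORM.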