mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-28 08:31:25 +00:00 
			
		
		
		
	llama : dynamic temperature sampling (#4972)
* implemented dynamic temperature sampling from koboldcpp * removed trailing whitespace * removed unused temp parameter in llama_sample_entropy * exposed exponent_val in dynamic temp sampler * added debug check for printf statements * use nullptr in llama_sample_softmax call during llama_sample_entropy this avoids counting the time taken stats twice Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * return earlier if there is only 1 candidate (i.e. max_entropy == 0) * reformat 't' case in llama_sample_queue Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * check for one or zero candidates case in llama_sample_entropy --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
This commit is contained in:
		| @@ -129,6 +129,8 @@ static void sampler_queue( | ||||
|     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); | ||||
|  | ||||
|     const float         temp              = params.temp; | ||||
|     const float         dynatemp_range    = params.dynatemp_range; | ||||
|     const float         dynatemp_exponent = params.dynatemp_exponent; | ||||
|     const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k; | ||||
|     const float         top_p             = params.top_p; | ||||
|     const float         min_p             = params.min_p; | ||||
| @@ -143,7 +145,15 @@ static void sampler_queue( | ||||
|             case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break; | ||||
|             case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break; | ||||
|             case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break; | ||||
|             case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break; | ||||
|             case 't': | ||||
|                 if (dynatemp_range > 0) { | ||||
|                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range); | ||||
|                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range); | ||||
|                     llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); | ||||
|                 } else { | ||||
|                     llama_sample_temp(ctx_main, &cur_p, temp); | ||||
|                 } | ||||
|                 break; | ||||
|             default : break; | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -18,6 +18,8 @@ typedef struct llama_sampling_params { | ||||
|     float       tfs_z                 = 1.00f;    // 1.0 = disabled | ||||
|     float       typical_p             = 1.00f;    // 1.0 = disabled | ||||
|     float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities | ||||
|     float       dynatemp_range        = 0.00f;    // 0.0 = disabled | ||||
|     float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler | ||||
|     int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size) | ||||
|     float       penalty_repeat        = 1.10f;    // 1.0 = disabled | ||||
|     float       penalty_freq          = 0.00f;    // 0.0 = disabled | ||||
|   | ||||
							
								
								
									
										67
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										67
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -8151,6 +8151,73 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c | ||||
|     } | ||||
| } | ||||
|  | ||||
| void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) { | ||||
|     const int64_t t_start_sample_us = ggml_time_us(); | ||||
|  | ||||
|     // no need to do anything if there is only one (or zero) candidates | ||||
|     if(candidates_p->size <= 1) { | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     // Calculate maximum possible entropy | ||||
|     float max_entropy = -logf(1.0f / candidates_p->size); | ||||
|  | ||||
|     llama_sample_softmax(nullptr, candidates_p); | ||||
|  | ||||
|     // Calculate entropy of the softmax probabilities | ||||
|     float entropy = 0.0f; | ||||
|     for (size_t i = 0; i < candidates_p->size; ++i) { | ||||
|         float prob = candidates_p->data[i].p; | ||||
|         if (prob > 0.0f) { // Ensure no log(0) | ||||
|             entropy -= prob * logf(prob); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Normalize the entropy (max_entropy cannot be 0 here because we checked candidates_p->size != 1 above) | ||||
|     float normalized_entropy = entropy / max_entropy; | ||||
|  | ||||
|     // Map the normalized entropy to the desired temperature range using the power function | ||||
|     float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); | ||||
|  | ||||
| #ifdef DEBUG | ||||
|     LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); | ||||
|     LLAMA_LOG_INFO("Entropy: %f\n", entropy); | ||||
|     LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); | ||||
|     LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); | ||||
|     LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); | ||||
|     LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); | ||||
| #endif | ||||
|  | ||||
|     // Apply the dynamically calculated temperature scaling | ||||
|     for (size_t i = 0; i < candidates_p->size; ++i) { | ||||
|         candidates_p->data[i].logit /= dyn_temp; | ||||
|     } | ||||
|  | ||||
|     // Re-compute softmax probabilities after scaling logits with dynamic temperature | ||||
|     double max_l_double = candidates_p->data[0].logit; | ||||
|     double cum_sum_double = 0.0; | ||||
|     for (size_t i = 0; i < candidates_p->size; ++i) { | ||||
|         double p = exp(candidates_p->data[i].logit - max_l_double); | ||||
|         candidates_p->data[i].p = p; // Store the scaled probability | ||||
|         cum_sum_double += p; | ||||
|     } | ||||
|     for (size_t i = 0; i < candidates_p->size; ++i) { | ||||
|         candidates_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities | ||||
|     } | ||||
|  | ||||
| #ifdef DEBUG | ||||
|     // Print the updated top 25 probabilities after temperature scaling | ||||
|     LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); | ||||
|     for (size_t i = 0; i < 25 && i < candidates_p->size; ++i) { | ||||
|         LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, candidates_p->data[i].p * 100.0f); | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     if (ctx) { | ||||
|         ctx->t_sample_us += ggml_time_us() - t_start_sample_us; | ||||
|     } | ||||
| } | ||||
|  | ||||
| void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { | ||||
|     const int64_t t_start_sample_us = ggml_time_us(); | ||||
|  | ||||
|   | ||||
							
								
								
									
										8
									
								
								llama.h
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								llama.h
									
									
									
									
									
								
							| @@ -775,6 +775,14 @@ extern "C" { | ||||
|                            float   p, | ||||
|                           size_t   min_keep); | ||||
|  | ||||
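|     /// @details Dynamic temperature sampling: the logits are scaled by a temperature chosen between min_temp and max_temp | ||||
|     ///          according to the normalized entropy of the candidate distribution raised to exponent_val. Described in the paper https://arxiv.org/abs/2309.02772. | ||||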
|     LLAMA_API void llama_sample_entropy( | ||||
|             struct llama_context * ctx, | ||||
|           llama_token_data_array * candidates_p, | ||||
|                            float   min_temp, | ||||
|                            float   max_temp, | ||||
|                            float   exponent_val); | ||||
|  | ||||
|     LLAMA_API void llama_sample_temp( | ||||
|             struct llama_context * ctx, | ||||
|           llama_token_data_array * candidates, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 l3utterfly
					l3utterfly