	llama : refactor sampling v2 (#9294)
- Add `struct llama_sampler` and `struct llama_sampler_i`
- Add `llama_sampler_` API
- Add `llama_sampler_chain_` API for chaining multiple samplers
- Remove `LLAMA_API_INTERNAL`
- Add `llama_perf_` API and remove old `llama_print_timings` and `llama_reset_timings`
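With the new API, sampling state lives in a `llama_sampler` object that callers build as a chain of samplers, instead of constructing candidate arrays by hand. A minimal sketch of the chain setup the Android example below relies on (the `no_perf` setting is shown only as an assumption about typical use):

```cpp
#include "llama.h"

// build a sampler chain holding a single greedy sampler
llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
sparams.no_perf = true; // skip per-sampler timing collection

llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

// ... use llama_sampler_sample(smpl, ctx, idx) during decoding ...

llama_sampler_free(smpl);
```

The chain takes ownership of samplers added via `llama_sampler_chain_add`, so a single `llama_sampler_free` on the chain releases everything.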
@@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
     LOGi("Using %d threads", n_threads);

     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = 2048;
+
+    ctx_params.n_ctx           = 2048;
     ctx_params.n_threads       = n_threads;
     ctx_params.n_threads_batch = n_threads;

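The removed `ctx_params.seed` reflects that randomness now belongs to the sampler rather than the context. If non-greedy sampling were wanted, the seed would be supplied when building the chain; a hedged one-liner mirroring the removed value:

```cpp
// assumption: after this refactor, seeding happens at sampler construction,
// not on the context
llama_sampler_chain_add(smpl, llama_sampler_init_dist(1234));
```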
@@ -380,11 +380,13 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
+        jlong sampling_pointer,
         jlong batch_pointer,
         jint n_len,
         jobject intvar_ncur
 ) {
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto sampling = reinterpret_cast<llama_sampler *>(sampling_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
     const auto model = llama_get_model(context);

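The `sampling_pointer` threaded through the JNI signature has to be produced somewhere on the native side. A sketch of a matching constructor, assuming the Kotlin binding exposes it as `new_sampler` (the JNI method name is an assumption):

```cpp
extern "C" JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = true;

    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // hand the chain to Kotlin as an opaque jlong
    return reinterpret_cast<jlong>(smpl);
}
```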
@@ -392,20 +394,10 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
     if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

-    auto n_vocab = llama_n_vocab(model);
-    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
     // sample the most likely token
-    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
+    const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1);
+
+    llama_sampler_accept(sampling, new_token_id);

     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
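`llama_sampler_sample` reads the logits at the given batch index internally, which is why the manual `llama_token_data` array disappears, and `llama_sampler_accept` feeds the chosen token back so that stateful samplers (e.g. repetition penalties) can track history. The sampler then needs a matching teardown when the session ends; a hedged sketch, symmetric with the constructor above (the `free_sampler` name is an assumption):

```cpp
extern "C" JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
    // releases the chain and every sampler added to it
    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
}
```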
Georgi Gerganov