	server : allow to specify tokens as strings in logit_bias (#5003)
* server: allow to specify tokens as strings in logit_bias

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
@@ -185,7 +185,7 @@ node index.js
 
     `ignore_eos`: Ignore end of stream token and continue generating (default: false).
 
-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).
 
     `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
 
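To make the string form documented above concrete, here is a small, purely illustrative C++ sketch (not part of this commit) that assembles a request body for the server's `/completion` endpoint mixing an integer token id and a string entry in `logit_bias`. It assumes nlohmann::json, the JSON library the server itself uses; the prompt text and the bias values are made up.

```cpp
// Illustrative only: build a /completion request body that mixes the
// integer and string forms of logit_bias described above.
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    json body;
    body["prompt"]    = "Say hi to the user.";          // made-up prompt
    body["n_predict"] = 32;
    body["logit_bias"] = json::array({
        json::array({15043, 1.0}),              // boost token id 15043 ('Hello')
        json::array({"Hello, World!", -0.5})    // penalize every token of this string
    });
    // Print the JSON that would be POSTed to /completion.
    std::cout << body.dump(2) << std::endl;
    return 0;
}
```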
@@ -626,18 +626,36 @@ struct llama_server_context
             const int n_vocab = llama_n_vocab(model);
             for (const auto &el : *logit_bias)
             {
-                if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+                if (el.is_array() && el.size() == 2)
                 {
-                    llama_token tok = el[0].get<llama_token>();
-                    if (tok >= 0 && tok < n_vocab)
+                    float bias;
+                    if (el[1].is_number())
                     {
-                        if (el[1].is_number())
-                        {
-                            slot->sparams.logit_bias[tok] = el[1].get<float>();
-                        }
-                        else if (el[1].is_boolean() && !el[1].get<bool>())
+                        bias = el[1].get<float>();
+                    }
+                    else if (el[1].is_boolean() && !el[1].get<bool>())
+                    {
+                        bias = -INFINITY;
+                    }
+                    else
+                    {
+                        continue;
+                    }
+
+                    if (el[0].is_number_integer())
+                    {
+                        llama_token tok = el[0].get<llama_token>();
+                        if (tok >= 0 && tok < n_vocab)
                         {
-                            slot->sparams.logit_bias[tok] = -INFINITY;
+                            slot->sparams.logit_bias[tok] = bias;
                         }
                     }
+                    else if (el[0].is_string())
+                    {
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        for (auto tok : toks)
+                        {
+                            slot->sparams.logit_bias[tok] = bias;
+                        }
+                    }
                 }
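As a standalone illustration of the parsing rule introduced by this hunk, the following sketch (again, not from the commit) replays the new logic on a sample `logit_bias` array so the behaviour of the string form is visible: a string entry fans out into one map entry per token. `fake_tokenize` is a made-up stand-in for `llama_tokenize`, which needs a loaded model, `n_vocab` is an arbitrary placeholder, and `bias_map` stands in for `slot->sparams.logit_bias`.

```cpp
// Standalone sketch of the new parsing rule: each [token, bias] pair may
// name the token by id or by string; a string contributes one entry per
// token it splits into, and `false` maps to -INFINITY (never produce).
#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include <nlohmann/json.hpp>

using json = nlohmann::json;
using llama_token = int32_t;

static std::vector<llama_token> fake_tokenize(const std::string & text) {
    // Invented token ids, purely so the fan-out of the string form is visible.
    std::vector<llama_token> toks;
    for (size_t i = 0; i < text.size(); i += 4) {
        toks.push_back(static_cast<llama_token>(1000 + i));
    }
    return toks;
}

int main() {
    const int n_vocab = 32000; // placeholder vocabulary size
    const json logit_bias = json::parse(R"([[15043, 1.0], ["Hello, World!", -0.5], [2, false]])");

    std::unordered_map<llama_token, float> bias_map;
    for (const auto & el : logit_bias) {
        if (!(el.is_array() && el.size() == 2)) {
            continue;
        }

        // Second element: numeric bias, or `false` meaning the token is banned.
        float bias;
        if (el[1].is_number()) {
            bias = el[1].get<float>();
        } else if (el[1].is_boolean() && !el[1].get<bool>()) {
            bias = -INFINITY;
        } else {
            continue;
        }

        // First element: a single token id, or a string that is tokenized.
        if (el[0].is_number_integer()) {
            const llama_token tok = el[0].get<llama_token>();
            if (tok >= 0 && tok < n_vocab) {
                bias_map[tok] = bias;
            }
        } else if (el[0].is_string()) {
            for (const auto tok : fake_tokenize(el[0].get<std::string>())) {
                bias_map[tok] = bias;
            }
        }
    }

    for (const auto & [tok, b] : bias_map) {
        std::cout << tok << " -> " << b << "\n";
    }
    return 0;
}
```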
Alexey Parfenov