	server : allow to specify tokens as strings in logit_bias (#5003)
* server: allow to specify tokens as strings in logit_bias

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
@@ -185,7 +185,7 @@ node index.js

     `ignore_eos`: Ignore end of stream token and continue generating (default: false).

-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).

     `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

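To illustrate the documented forms side by side (not part of the diff; the prompt, the `n_predict` value, and the bias values below are arbitrary placeholders), a completion request body could combine a token-id entry with the new string entry like this:

    {
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,
        "logit_bias": [
            [15043, 1.0],
            ["Hello, World!", -0.5]
        ]
    }

The first entry biases token id 15043 directly; the second tokenizes the string and applies a bias of -0.5 to every token in the result.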
@@ -626,18 +626,36 @@ struct llama_server_context
             const int n_vocab = llama_n_vocab(model);
             for (const auto &el : *logit_bias)
             {
-                if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
-                {
-                    llama_token tok = el[0].get<llama_token>();
-                    if (tok >= 0 && tok < n_vocab)
-                    {
-                        if (el[1].is_number())
-                        {
-                            slot->sparams.logit_bias[tok] = el[1].get<float>();
-                        }
-                        else if (el[1].is_boolean() && !el[1].get<bool>())
-                        {
-                            slot->sparams.logit_bias[tok] = -INFINITY;
-                        }
-                    }
-                }
+                if (el.is_array() && el.size() == 2)
+                {
+                    float bias;
+                    if (el[1].is_number())
+                    {
+                        bias = el[1].get<float>();
+                    }
+                    else if (el[1].is_boolean() && !el[1].get<bool>())
+                    {
+                        bias = -INFINITY;
+                    }
+                    else
+                    {
+                        continue;
+                    }
+
+                    if (el[0].is_number_integer())
+                    {
+                        llama_token tok = el[0].get<llama_token>();
+                        if (tok >= 0 && tok < n_vocab)
+                        {
+                            slot->sparams.logit_bias[tok] = bias;
+                        }
+                    }
+                    else if (el[0].is_string())
+                    {
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                        for (auto tok : toks)
+                        {
+                            slot->sparams.logit_bias[tok] = bias;
+                        }
+                    }
+                }
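As a reading aid for the new parsing logic above (the token ids and strings are illustrative, not taken from the diff): a numeric second element is used as the bias, a literal `false` sets the bias to -INFINITY so the token is never produced, and any other value (e.g. `true`) makes the entry be skipped via `continue`. The first element may be a token id or a string, which is tokenized first and has the bias applied to each resulting token:

    "logit_bias": [
        [15043, 1.5],
        [42, false],
        ["Hello, World!", -0.5],
        ["skipped entry", true]
    ]

Here 15043 takes the numeric branch, the arbitrary id 42 takes the boolean-false branch, the string "Hello, World!" takes the new string branch, and the last entry is ignored because its second element is neither a number nor `false`.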
Author: Alexey Parfenov