server : add_special option for tokenize endpoint (#7059)
```diff
@@ -331,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`.
 
     `content`: Set the text to tokenize.
 
-    Note that a special `BOS` token is never inserted.
+    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
 
 - **POST** `/detokenize`: Convert tokens to text.
 
```
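This documentation change is the whole user-facing surface of the patch: `/tokenize` now accepts an optional `add_special` boolean, defaulting to `false` so existing clients keep their current behavior. A minimal client sketch, assuming a llama.cpp server listening locally on its default port 8080 and a model whose BOS id is 1 (true of the test model used below):

```python
# Sketch only: the base URL and BOS id are assumptions, not part of the patch.
import requests

def tokenize(content: str, add_special: bool = False) -> list[int]:
    resp = requests.post(
        "http://localhost:8080/tokenize",
        json={"content": content, "add_special": add_special},
    )
    resp.raise_for_status()
    return resp.json()["tokens"]

plain   = tokenize("What is the capital of France ?")
special = tokenize("What is the capital of France ?", add_special=True)
print(plain[:3], special[:3])  # `special` should lead with BOS id 1; `plain` should not
```

The server-side change is correspondingly small: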
```diff
@@ -3647,7 +3647,8 @@ int main(int argc, char ** argv) {
 
         std::vector<llama_token> tokens;
         if (body.count("content") != 0) {
-            tokens = ctx_server.tokenize(body["content"], false);
+            const bool add_special = json_value(body, "add_special", false);
+            tokens = ctx_server.tokenize(body["content"], add_special);
         }
         const json data = format_tokenizer_response(tokens);
         return res.set_content(data.dump(), "application/json; charset=utf-8");
```
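`json_value` reads `add_special` from the request body, falling back to `false` when the field is absent, and the flag is forwarded to the tokenizer in place of the previous hard-coded `false`. In Python terms the handler's new logic is roughly the following (hypothetical names, for illustration only):

```python
# Rough analogue of the C++ handler change; `handle_tokenize` and
# `server_tokenize` are hypothetical stand-ins, not real llama.cpp symbols.
def handle_tokenize(body: dict, server_tokenize) -> dict:
    tokens: list[int] = []
    if "content" in body:
        # json_value(body, "add_special", false) behaves like dict.get with a default
        add_special = body.get("add_special", False)
        tokens = server_tokenize(body["content"], add_special)
    return {"tokens": tokens}
```

The behave feature gains a BOS declaration in its background and a new scenario exercising the flag: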
```diff
@@ -7,6 +7,7 @@ Feature: llama.cpp server
     And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And   a model file test-model.gguf
     And   a model alias tinyllama-2
+    And   BOS token is 1
     And   42 as server seed
       # KV Cache corresponds to the total amount of tokens
       # that can be stored across all independent sequences: #4130
```
```diff
@@ -91,7 +92,18 @@ Feature: llama.cpp server
     """
     What is the capital of France ?
     """
-    Then tokens can be detokenize
+    Then tokens can be detokenized
+    And  tokens do not begin with BOS
+
+  Scenario: Tokenize w/ BOS
+    Given adding special tokens
+    When  tokenizing:
+    """
+    What is the capital of Germany?
+    """
+    Then  tokens begin with BOS
+    Given first token is removed
+    Then  tokens can be detokenized
 
   Scenario: Models available
     Given available models
```
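The new `Tokenize w/ BOS` scenario encodes a round trip: tokenize with special tokens, check that BOS leads the sequence, strip it, and detokenize back to the original text (the first token is removed first, presumably because a detokenized BOS would otherwise surface in the output and break the comparison). Outside of behave, the same check looks roughly like this, under the same local-server and BOS-id assumptions as above:

```python
# Round-trip sketch mirroring the scenario; base URL and BOS id are assumptions.
import requests

BASE = "http://localhost:8080"
TEXT = "What is the capital of Germany?"

tokens = requests.post(f"{BASE}/tokenize",
                       json={"content": TEXT, "add_special": True}).json()["tokens"]
assert tokens[0] == 1              # "tokens begin with BOS"
tokens = tokens[1:]                # "first token is removed"

content = requests.post(f"{BASE}/detokenize",
                        json={"tokens": tokens}).json()["content"]
assert content.strip() == TEXT     # "tokens can be detokenized"
```

The step definitions below supply the new vocabulary used by the feature file: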
```diff
@@ -376,6 +376,11 @@ def step_seed(context, seed):
         context.seed.append(seed)
 
 
+@step('BOS token is {bos:d}')
+def step_bos_token(context, bos):
+    context.bos = bos
+
+
 @step('a prefix prompt')
 def step_prompt_prefix(context):
     context.prompt_prefix = context_text(context)
```
```diff
@@ -656,21 +661,29 @@ async def all_embeddings_are_generated(context):
         assert_embeddings(context.tasks_result.pop().pop())
 
 
+@step('adding special tokens')
+def step_tokenize_set_add_special(context):
+    context.tokenize_add_special = True
+
+
 @step('tokenizing')
 @async_run_until_complete
 async def step_tokenize(context):
     context.tokenized_text = context_text(context)
     async with aiohttp.ClientSession() as session:
+        tokenize_args = {
+            "content": context.tokenized_text,
+        }
+        if getattr(context, 'tokenize_add_special', None) is not None:
+            tokenize_args['add_special'] = context.tokenize_add_special
         async with session.post(f'{context.base_url}/tokenize',
-                                json={
-                                    "content": context.tokenized_text,
-                                }) as response:
+                                json=tokenize_args) as response:
             assert response.status == 200
             tokenize_json = await response.json()
             context.tokens = tokenize_json['tokens']
 
 
-@step('tokens can be detokenize')
+@step('tokens can be detokenized')
 @async_run_until_complete
 async def step_detokenize(context):
     assert len(context.tokens) > 0
```
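Note the pattern in `step_tokenize`: the request carries `add_special` only when an earlier `adding special tokens` step set `context.tokenize_add_special`; otherwise the field is omitted entirely, so scenarios without that step exercise the server-side default rather than an explicit `false`. Condensed:

```python
# Illustration of the optional-field pattern above; `context` is a plain
# stand-in for behave's context object.
from types import SimpleNamespace

context = SimpleNamespace()                  # no 'adding special tokens' step ran
tokenize_args = {"content": "some text"}
if getattr(context, "tokenize_add_special", None) is not None:
    tokenize_args["add_special"] = context.tokenize_add_special
assert "add_special" not in tokenize_args    # request omits it; server default applies
```

The remaining hunk adds the three assertion steps the scenarios rely on: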
```diff
@@ -685,6 +698,21 @@ async def step_detokenize(context):
             assert context.tokenized_text == detokenize_json['content'].strip()
 
 
+@step('tokens begin with BOS')
+def step_strings_for_tokenization(context):
+    assert context.tokens[0] == context.bos
+
+
+@step('tokens do not begin with BOS')
+def step_strings_for_tokenization(context):
+    assert context.tokens[0] != context.bos
+
+
+@step('first token is removed')
+def step_strings_for_tokenization(context):
+    context.tokens = context.tokens[1:]
+
+
 @step('an OPTIONS request is sent from {origin}')
 @async_run_until_complete
 async def step_options_request(context, origin):
```
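One small wart: the three new assertion steps all reuse the function name `step_strings_for_tokenization`. behave dispatches on the decorator's step text rather than the function name, so the shadowing is harmless here, but distinct names would read better, e.g.:

```python
from behave import step  # same decorator the step file already uses

@step('first token is removed')
def step_remove_first_token(context):
    # Drop the leading BOS so the remaining tokens round-trip through /detokenize.
    context.tokens = context.tokens[1:]
```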