	server: tests: add truncated prompt tests, better kv cache size (#5933)
* server: tests: add truncated prompt tests, better size

* server, tests : update regex

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
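Note: the new assertions in this commit depend on two pieces of response metadata that the test harness now records, a `truncated` flag and the prompt token count under `timings.prompt_n`. The sketch below is illustrative only (plain dicts stand in for the server's OpenAI-compatible payload, and `summarize` is a made-up helper, not project code); it mirrors how the steps.py changes further down derive those values.

```python
# Illustrative sketch, not project code: derive the two fields the new test
# steps assert on from an OpenAI-compatible chat completion payload (dict form).
def summarize(chat_completion_raw: dict) -> dict:
    choice = chat_completion_raw['choices'][0]
    return {
        'content': choice['message']['content'],
        # The diff flags truncation whenever generation stopped for any reason
        # other than a natural 'stop' (e.g. the token limit was hit).
        'truncated': choice['finish_reason'] != 'stop',
        'timings': {
            'predicted_n': chat_completion_raw['usage']['completion_tokens'],
            'prompt_n': chat_completion_raw['usage']['prompt_tokens'],
        },
    }

# Example payload shape (values made up):
example = {
    'choices': [{'message': {'content': 'Hello'}, 'finish_reason': 'length'}],
    'usage': {'completion_tokens': 8, 'prompt_tokens': 18},
}
assert summarize(example)['truncated'] is True
assert summarize(example)['timings']['prompt_n'] == 18
```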
@@ -6,8 +6,8 @@ Feature: Parallel
     Given a server listening on localhost:8080
     And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And   42 as server seed
-    And   512 as batch size
-    And   64 KV cache size
+    And   128 as batch size
+    And   256 KV cache size
     And   2 slots
     And   continuous batching
     Then  the server is starting
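Note: the KV cache size in these steps is the total token budget shared by all slots. Assuming the server splits that budget evenly across the 2 slots (an assumption of this sketch, not something stated in the diff), each slot gets 128 tokens of context, which is why the batch size is lowered to 128 alongside it. A back-of-the-envelope check:

```python
# Hedged arithmetic only: assumes an even split of the KV cache across slots.
kv_cache_size = 256   # "And   256 KV cache size"
n_slots = 2           # "And   2 slots"
n_batch = 128         # "And   128 as batch size"

ctx_per_slot = kv_cache_size // n_slots
assert ctx_per_slot == 128
# With the new settings, a full batch no longer exceeds one slot's context.
assert n_batch <= ctx_per_slot
```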
@@ -76,6 +76,7 @@ Feature: Parallel
       | disabled  | 128       |
       | enabled   | 64        |
 
+
   Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """
@@ -10,11 +10,10 @@ Feature: llama.cpp server
       # KV Cache corresponds to the total amount of tokens
       # that can be stored across all independent sequences: #4130
       # see --ctx-size and #5568
-    And   32 KV cache size
-    And   512 as batch size
-    And   1 slots
-    And   embeddings extraction
-    And   32 server max tokens to predict
+    And   256 KV cache size
+    And   32 as batch size
+    And   2 slots
+    And   64 server max tokens to predict
     And   prometheus compatible metrics exposed
     Then  the server is starting
     Then  the server is healthy
@@ -23,18 +22,35 @@ Feature: llama.cpp server
     Then the server is ready
     And  all slots are idle
 
+
   Scenario Outline: Completion
     Given a prompt <prompt>
     And   <n_predict> max tokens to predict
     And   a completion request with no api error
     Then  <n_predicted> tokens are predicted matching <re_content>
+    And   the completion is <truncated> truncated
+    And   <n_prompt> prompt tokens are processed
     And   prometheus metrics are exposed
     And   metric llamacpp:tokens_predicted is <n_predicted>
 
     Examples: Prompts
-      | prompt                           | n_predict | re_content                       | n_predicted |
-      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
-      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |
+      | prompt                                                                    | n_predict | re_content                    | n_prompt | n_predicted | truncated |
+      | I believe the meaning of life is                                          | 8         | (read\|going)+                | 18       | 8           | not       |
+      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids)+ | 46       | 64          | not       |
+
+  Scenario: Completion prompt truncated
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And   a completion request with no api error
+    Then  64 tokens are predicted matching fun|Annaks|popcorns
+    And   the completion is  truncated
+    And   109 prompt tokens are processed
+
 
   Scenario Outline: OAI Compatibility
     Given a model <model>
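Note: the new "Completion prompt truncated" scenario checks response metadata rather than exact output text. A rough illustration of what its three assertions amount to, using hypothetical response values in place of the running server and the harness's real helpers:

```python
import re

# Hypothetical /completion response for the truncated-prompt scenario;
# real values come from the server under test.
completion = {
    'content': '... popcorns ...',
    'truncated': True,
    'timings': {'predicted_n': 64, 'prompt_n': 109},
}

# "Then  64 tokens are predicted matching fun|Annaks|popcorns"
assert completion['timings']['predicted_n'] == 64
assert re.search(r'fun|Annaks|popcorns', completion['content'])

# "And   the completion is  truncated"
assert completion['truncated'] is True

# "And   109 prompt tokens are processed"
assert completion['timings']['prompt_n'] == 109
```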
@@ -44,11 +60,14 @@ Feature: llama.cpp server
     And   streaming is <enable_streaming>
     Given an OAI compatible chat completions request with no api error
     Then  <n_predicted> tokens are predicted matching <re_content>
+    And   <n_prompt> prompt tokens are processed
+    And   the completion is <truncated> truncated
 
     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+           | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+ | 32          | enabled          |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_prompt | n_predicted | enable_streaming | truncated |
+      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+          | 77       | 8           | disabled         | not       |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird)+ | -1       | 64          | enabled          |           |
+
 
   Scenario: Tokenize / Detokenize
     When tokenizing:
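Note: in the OAI Compatibility examples above, the codellama70b row sets n_prompt to -1 and leaves truncated blank. As the step definitions further down show, a negative n_prompt acts as a skip-the-check sentinel. A tiny sketch of that convention (`prompt_tokens_ok` is an illustrative helper, not from the repo):

```python
# Sentinel convention used by the "{n_prompt:d} prompt tokens are processed"
# step: a negative expected value skips the comparison entirely.
def prompt_tokens_ok(expected_n_prompt: int, actual_prompt_n: int) -> bool:
    return expected_n_prompt < 0 or expected_n_prompt == actual_prompt_n

assert prompt_tokens_ok(-1, 9999)    # -1 in the table: any count passes
assert prompt_tokens_ok(77, 77)      # llama-2 row: must match exactly
assert not prompt_tokens_ok(77, 78)
```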
@@ -196,12 +196,30 @@ async def step_request_completion(context, api_error):
 
 @step(u'{predicted_n:d} tokens are predicted matching {re_content}')
 def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n, re_content)
 
 
 @step(u'{predicted_n:d} tokens are predicted')
 def step_n_tokens_predicted(context, predicted_n):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
+    context.completion = context.tasks_result.pop()
+    assert_n_tokens_predicted(context.completion, predicted_n)
 
 
+@step(u'the completion is  truncated')
+def step_assert_completion_truncated(context):
+    step_assert_completion_truncated(context, '')
+
+
+@step(u'the completion is {truncated} truncated')
+def step_assert_completion_truncated(context, truncated):
+    truncated = truncated != "not"
+    assert context.completion['truncated'] == truncated, f'{context.completion}'
+
+
+@step(u'{n_prompt:d} prompt tokens are processed')
+def step_impl(context, n_prompt):
+    assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
+
+
 @step(u'a user prompt {user_prompt}')
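Note: the step decorators above rely on behave's default parse-style matchers, so a placeholder like `{predicted_n:d}` arrives as an int and `{truncated}` captures a literal word such as "not"; the separate double-space step ("the completion is  truncated") exists because an empty capture would not match. An illustrative check with the parse library that behave builds on (not project code):

```python
# Illustrative only: behave's default matcher is "parse"-style, so ":d"
# converts the capture to int and a bare "{name}" captures a word.
import parse

result = parse.parse('{predicted_n:d} tokens are predicted matching {re_content}',
                     '64 tokens are predicted matching fun|Annaks|popcorns')
assert result['predicted_n'] == 64                 # already an int thanks to :d
assert result['re_content'] == 'fun|Annaks|popcorns'

word = parse.parse('the completion is {truncated} truncated',
                   'the completion is not truncated')['truncated']
assert word == 'not'
# The step definition then maps it to a boolean: truncated = (word != "not")
```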
@@ -722,7 +740,8 @@ async def oai_chat_completions(user_prompt,
     completion_response = {
         'content': '',
         'timings': {
-            'predicted_n': 0
+            'predicted_n': 0,
+            'prompt_n': 0
         }
     }
     if async_client:
@@ -763,7 +782,8 @@ async def oai_chat_completions(user_prompt,
                         completion_response = {
                             'content': chat_completion_raw['choices'][0]['message'],
                             'timings': {
-                                'predicted_n': chat_completion_raw['usage']['completion_tokens']
+                                'predicted_n': chat_completion_raw['usage']['completion_tokens'],
+                                'prompt_n': chat_completion_raw['usage']['prompt_tokens']
                             }
                         }
                     else:
@@ -792,13 +812,16 @@ async def oai_chat_completions(user_prompt,
                 if 'content' in delta:
                     completion_response['content'] += delta['content']
                     completion_response['timings']['predicted_n'] += 1
+                completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
         else:
             assert len(chat_completion.choices) == 1
             completion_response = {
                 'content': chat_completion.choices[0].message.content,
                 'timings': {
-                    'predicted_n': chat_completion.usage.completion_tokens
-                }
+                    'predicted_n': chat_completion.usage.completion_tokens,
+                    'prompt_n': chat_completion.usage.prompt_tokens
+                    },
+                'truncated': chat_completion.choices[0].finish_reason != 'stop'
             }
     if debug:
         print("OAI response formatted to llama.cpp:", completion_response)
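Note: in the streaming branch there is no usage block to read, so the harness counts one predicted token per content delta and derives `truncated` from the last chunk's finish_reason. A condensed, dict-based sketch of that accumulation (the real code iterates the OpenAI client's chunk objects):

```python
# Condensed sketch of the streaming accumulation above, with plain dicts
# standing in for the OpenAI client's chunk objects.
def accumulate(chunks):
    response = {'content': '', 'timings': {'predicted_n': 0, 'prompt_n': 0}}
    for chunk in chunks:
        choice = chunk['choices'][0]
        delta = choice['delta']
        if 'content' in delta:
            response['content'] += delta['content']
            response['timings']['predicted_n'] += 1
        # Overwritten on every chunk, so the final chunk's finish_reason wins.
        response['truncated'] = choice['finish_reason'] != 'stop'
    return response

chunks = [
    {'choices': [{'delta': {'content': 'Hi'}, 'finish_reason': None}]},
    {'choices': [{'delta': {}, 'finish_reason': 'length'}]},
]
out = accumulate(chunks)
assert out['timings']['predicted_n'] == 1 and out['truncated'] is True
```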