	ci: bench: support sse and fix prompt processing time / server: add tokens usage in stream OAI response (#6495)
* ci: bench: support sse and fix prompt processing time / server: add tokens usage in stream mode
* ci: bench: README.md EOL
* ci: bench: remove total pp and tg as it is not accurate
* ci: bench: fix case when there is no token generated
* ci: bench: change to the 95 percentile for pp and tg as it is closer to what the server exports in metrics
* ci: bench: fix finish reason rate
@@ -2,13 +2,15 @@

 Benchmark is using [k6](https://k6.io/).

-##### Install k6
+##### Install k6 and sse extension

-Follow instruction from: https://k6.io/docs/get-started/installation/
+SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.

-Example for ubuntu:
+Example:
 ```shell
-snap install k6
+go install go.k6.io/xk6/cmd/xk6@latest
+xk6 build master \
+--with github.com/phymbert/xk6-sse
 ```

 #### Download a dataset
@@ -46,7 +48,7 @@ server --host localhost --port 8080 \

 For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
 ```shell
-k6 run script.js --duration 10m --iterations 500 --vus 8
+./k6 run script.js --duration 10m --iterations 500 --vus 8
 ```

 The benchmark values can be overridden with:
@@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
 ```shell
 curl http://localhost:8080/metrics
 ```
+
+### Using the CI python script
+The `bench.py` script does several steps:
+- start the server
+- define good variable for k6
+- run k6 script
+- extract metrics from prometheus
+
+It aims to be used in the CI, but you can run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+              --runner-label local \
+              --name local \
+              --branch `git rev-parse --abbrev-ref HEAD` \
+              --commit `git rev-parse HEAD` \
+              --scenario script.js \
+              --duration 5m \
+              --hf-repo ggml-org/models \
+              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --model-path-prefix models \
+              --parallel 4 \
+              -ngl 33 \
+              --batch-size 2048 \
+              --ubatch-size 256 \
+              --ctx-size 4096 \
+              --n-prompts 200 \
+              --max-prompt-tokens 256 \
+              --max-tokens 256
+```
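Because `bench.py` now invokes `./k6` by default, the k6 binary built with xk6-sse is expected to sit in the directory the benchmark is run from; otherwise it can be pointed to explicitly through the `BENCH_K6_BIN_PATH` environment variable read in `start_benchmark` (see the hunk below). A minimal sketch, assuming the binary was built somewhere else (hypothetical path):

```shell
# hypothetical location of the k6 binary produced by `xk6 build`; adjust as needed
export BENCH_K6_BIN_PATH=$HOME/xk6/k6
# then run the `python bench.py ...` command above unchanged
```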
@@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
                             data['metrics'][metric_name][metric_metric]=value
                             github_env.write(
                                 f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
-                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
                 iterations = data['root_group']['checks']['success completion']['passes']

     except Exception:
@@ -181,16 +180,16 @@ xychart-beta
     bench_results = {
         "i": iterations,
         "req": {
-            "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
+            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
             "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
         },
         "pp": {
-            "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
-            "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
+            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
         },
         "tg": {
-            "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
+            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
         },
@@ -206,7 +205,7 @@ xychart-beta


 def start_benchmark(args):
-    k6_path = 'k6'
+    k6_path = './k6'
     if 'BENCH_K6_BIN_PATH' in os.environ:
         k6_path = os.environ['BENCH_K6_BIN_PATH']
     k6_args = [
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {

 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')

 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,62 @@ export default function () {
             }
         ],
         "model": model,
-        "stream": false,
+        "stream": true,
         "seed": 42,
         "max_tokens": max_tokens
     }

-    const body = JSON.stringify(payload)
+    const params = {method: 'POST', body: JSON.stringify(payload)};

-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
+
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
     })

     check(res, {'success completion': (r) => r.status === 200})

-    if (res.status === 200) {
-        const completions = res.json()
+    const endTime = new Date()

-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
-
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
     }

+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
     sleep(0.3)
 }
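The timings above derive both throughput metrics from the SSE stream itself: prompt processing speed is `prompt_tokens` over the time from the request start to the first streamed event, and generation speed is `completions_tokens` over the time from that first event to the end of the stream (the `* 1.e3` converts millisecond deltas to tokens per second). For example, if the first event arrives 500 ms after the request and the stream ends 4000 ms later with `prompt_tokens = 256` and `completions_tokens = 128`, the script records 256 / 500 * 1.e3 = 512 tokens/s for pp and 128 / 4000 * 1.e3 = 32 tokens/s for tg.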
@@ -567,6 +567,15 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
         {"model",   modelname},
         {"object",  "chat.completion.chunk"}
     };
+    if (!finish_reason.empty()) {
+        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+        int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+        ret.push_back({"usage", json {
+            {"completion_tokens", num_tokens_predicted},
+            {"prompt_tokens",     num_prompt_tokens},
+            {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
+        }});
+    }

     return std::vector<json>({ret});
 }
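With this server change, a streamed chat completion now reports token usage: the chunk carrying a non-empty `finish_reason` also carries a `usage` object with `completion_tokens`, `prompt_tokens` and `total_tokens`, which is exactly what the updated k6 script reads from `chunk.usage`. A rough way to observe it against a locally running server (assuming the default `localhost:8080` host/port used elsewhere in the bench README and the OAI-compatible `/v1` route):

```shell
# stream a short completion; the chunk carrying finish_reason should also
# contain the "usage" object added by this commit
curl -s -N http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "phi-2", "stream": true, "max_tokens": 8,
       "messages": [{"role": "user", "content": "Say hello"}]}'
```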