mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	ci: bench: add more ftype, fix triggers and bot comment (#6466)
* ci: bench: change trigger path to not spawn on each PR
* ci: bench: add more file type for phi-2: q8_0 and f16.
  - do not show the comment by default
* ci: bench: add seed parameter in k6 script
* ci: bench: artefact name perf job
* Add iteration in the commit status, reduce again the autocomment
* ci: bench: add per slot metric in the commit status
* Fix trailing spaces
This commit is contained in:
		
							
								
								
									
										37
									
								
								.github/workflows/bench.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										37
									
								
								.github/workflows/bench.yml
									
									
									
									
										vendored
									
									
								
							@@ -24,10 +24,10 @@ on:
 | 
				
			|||||||
  push:
 | 
					  push:
 | 
				
			||||||
    branches:
 | 
					    branches:
 | 
				
			||||||
      - master
 | 
					      - master
 | 
				
			||||||
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
 | 
					    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 | 
				
			||||||
  pull_request_target:
 | 
					  pull_request_target:
 | 
				
			||||||
    types: [opened, synchronize, reopened]
 | 
					    types: [opened, synchronize, reopened]
 | 
				
			||||||
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
 | 
					    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 | 
				
			||||||
  schedule:
 | 
					  schedule:
 | 
				
			||||||
    -  cron: '04 2 * * *'
 | 
					    -  cron: '04 2 * * *'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -42,6 +42,16 @@ jobs:
 | 
				
			|||||||
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
 | 
					      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
 | 
				
			||||||
      N_USERS: 8
 | 
					      N_USERS: 8
 | 
				
			||||||
      DURATION: 10m
 | 
					      DURATION: 10m
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    strategy:
 | 
				
			||||||
 | 
					      matrix:
 | 
				
			||||||
 | 
					        model: [phi-2]
 | 
				
			||||||
 | 
					        ftype: [q4_0, q8_0, f16]
 | 
				
			||||||
 | 
					        include:
 | 
				
			||||||
 | 
					          - model: phi-2
 | 
				
			||||||
 | 
					            ftype: q4_0
 | 
				
			||||||
 | 
					            pr_comment_enabled: "true"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
 | 
					    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
 | 
				
			||||||
    steps:
 | 
					    steps:
 | 
				
			||||||
      - name: Clone
 | 
					      - name: Clone
 | 
				
			||||||
@@ -116,7 +126,7 @@ jobs:
 | 
				
			|||||||
              --scenario script.js \
 | 
					              --scenario script.js \
 | 
				
			||||||
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
 | 
					              --duration ${{ github.event.inputs.duration || env.DURATION }} \
 | 
				
			||||||
              --hf-repo ggml-org/models	 \
 | 
					              --hf-repo ggml-org/models	 \
 | 
				
			||||||
              --hf-file phi-2/ggml-model-q4_0.gguf \
 | 
					              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
 | 
				
			||||||
              --model-path-prefix /models \
 | 
					              --model-path-prefix /models \
 | 
				
			||||||
              --parallel ${{ env.N_USERS }} \
 | 
					              --parallel ${{ env.N_USERS }} \
 | 
				
			||||||
              -ngl 33 \
 | 
					              -ngl 33 \
 | 
				
			||||||
@@ -134,7 +144,7 @@ jobs:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      - uses: actions/upload-artifact@v4
 | 
					      - uses: actions/upload-artifact@v4
 | 
				
			||||||
        with:
 | 
					        with:
 | 
				
			||||||
          name: benchmark-results
 | 
					          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
 | 
				
			||||||
          compression-level: 9
 | 
					          compression-level: 9
 | 
				
			||||||
          path: |
 | 
					          path: |
 | 
				
			||||||
            examples/server/bench/*.jpg
 | 
					            examples/server/bench/*.jpg
 | 
				
			||||||
@@ -146,7 +156,7 @@ jobs:
 | 
				
			|||||||
        with:
 | 
					        with:
 | 
				
			||||||
          authToken: ${{secrets.GITHUB_TOKEN}}
 | 
					          authToken: ${{secrets.GITHUB_TOKEN}}
 | 
				
			||||||
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
 | 
					          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
 | 
				
			||||||
          context: bench-server-baseline
 | 
					          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
 | 
				
			||||||
          description: |
 | 
					          description: |
 | 
				
			||||||
            ${{ env.BENCH_RESULTS }}
 | 
					            ${{ env.BENCH_RESULTS }}
 | 
				
			||||||
          state: 'success'
 | 
					          state: 'success'
 | 
				
			||||||
@@ -203,11 +213,19 @@ jobs:
 | 
				
			|||||||
      - name: Comment PR
 | 
					      - name: Comment PR
 | 
				
			||||||
        uses: mshick/add-pr-comment@v2
 | 
					        uses: mshick/add-pr-comment@v2
 | 
				
			||||||
        id: comment_pr
 | 
					        id: comment_pr
 | 
				
			||||||
        if: ${{ github.event.pull_request != '' }}
 | 
					        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
 | 
				
			||||||
        with:
 | 
					        with:
 | 
				
			||||||
          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
 | 
					          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
 | 
				
			||||||
          message: |
 | 
					          message: |
 | 
				
			||||||
            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
 | 
					            <p align="center">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            </p>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            <details>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            <summary>Expand details for performance related PR only</summary>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
 | 
					            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
 | 
				
			||||||
            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
 | 
					            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
 | 
				
			||||||
@@ -215,9 +233,6 @@ jobs:
 | 
				
			|||||||
            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
 | 
					            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
 | 
				
			||||||
            - ${{ env.BENCH_GRAPH_XLABEL }}
 | 
					            - ${{ env.BENCH_GRAPH_XLABEL }}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            <details>
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            <summary>Time series</summary>
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            <p align="center">
 | 
					            <p align="center">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,6 +16,7 @@ import matplotlib
 | 
				
			|||||||
import matplotlib.dates
 | 
					import matplotlib.dates
 | 
				
			||||||
import matplotlib.pyplot as plt
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
import requests
 | 
					import requests
 | 
				
			||||||
 | 
					from statistics import mean
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def main(args_in: list[str] | None = None) -> None:
 | 
					def main(args_in: list[str] | None = None) -> None:
 | 
				
			||||||
@@ -109,6 +110,7 @@ def main(args_in: list[str] | None = None) -> None:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    # Prometheus
 | 
					    # Prometheus
 | 
				
			||||||
    end_time = time.time()
 | 
					    end_time = time.time()
 | 
				
			||||||
 | 
					    prometheus_metrics = {}
 | 
				
			||||||
    if is_server_listening("0.0.0.0", 9090):
 | 
					    if is_server_listening("0.0.0.0", 9090):
 | 
				
			||||||
        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
 | 
					        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
 | 
				
			||||||
                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
 | 
					                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
 | 
				
			||||||
@@ -127,6 +129,7 @@ def main(args_in: list[str] | None = None) -> None:
 | 
				
			|||||||
                values = metric_data['data']['result'][0]['values']
 | 
					                values = metric_data['data']['result'][0]['values']
 | 
				
			||||||
                timestamps, metric_values = zip(*values)
 | 
					                timestamps, metric_values = zip(*values)
 | 
				
			||||||
                metric_values = [float(value) for value in metric_values]
 | 
					                metric_values = [float(value) for value in metric_values]
 | 
				
			||||||
 | 
					                prometheus_metrics[metric] = metric_values
 | 
				
			||||||
                timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
 | 
					                timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
 | 
				
			||||||
                plt.figure(figsize=(16, 10), dpi=80)
 | 
					                plt.figure(figsize=(16, 10), dpi=80)
 | 
				
			||||||
                plt.plot(timestamps_dt, metric_values, label=metric)
 | 
					                plt.plot(timestamps_dt, metric_values, label=metric)
 | 
				
			||||||
@@ -176,17 +179,20 @@ xychart-beta
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    # 140 chars max for commit status description
 | 
					    # 140 chars max for commit status description
 | 
				
			||||||
    bench_results = {
 | 
					    bench_results = {
 | 
				
			||||||
 | 
					        "i": iterations,
 | 
				
			||||||
        "req": {
 | 
					        "req": {
 | 
				
			||||||
            "p90": data['metrics']["http_req_duration"]["p(90)"],
 | 
					            "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
 | 
				
			||||||
            "avg": data['metrics']["http_req_duration"]["avg"],
 | 
					            "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        "pp": {
 | 
					        "pp": {
 | 
				
			||||||
            "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
 | 
					            "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
 | 
				
			||||||
            "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
 | 
					            "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
 | 
				
			||||||
 | 
					            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
        "tg": {
 | 
					        "tg": {
 | 
				
			||||||
            "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
 | 
					            "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
 | 
				
			||||||
            "avg": data['metrics']["llamacpp_tokens_second"]["avg"],
 | 
					            "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
 | 
				
			||||||
 | 
					            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    with open("results.github.env", 'a') as github_env:
 | 
					    with open("results.github.env", 'a') as github_env:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -87,6 +87,7 @@ export default function () {
 | 
				
			|||||||
        ],
 | 
					        ],
 | 
				
			||||||
        "model": model,
 | 
					        "model": model,
 | 
				
			||||||
        "stream": false,
 | 
					        "stream": false,
 | 
				
			||||||
 | 
					        "seed": 42,
 | 
				
			||||||
        "max_tokens": max_tokens
 | 
					        "max_tokens": max_tokens
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user