# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggml-org/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

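# Allow only one benchmark per workflow/ref/SHA at a time: a newer run cancels
# any in-flight run it supersedes.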
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME: no way found yet to avoid duplicating the runs-on value
      N_USERS: 8
      DURATION: 10m

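    # Bench phi-2 at several quantization types; only the q4_0 variant enables the
    # PR comment, so a PR gets one comment rather than one per matrix entry.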
    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || github.event_name == 'pull_request_target'
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd tools/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

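      # Start a local Prometheus instance to collect server metrics during the bench
      # (scrape config: tools/server/bench/prometheus.yml); the nc loop waits until it
      # is listening on its default port, 9090, before continuing.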
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=tools/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

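      # The Go toolchain is only needed to build the custom k6 binary below.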
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

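      # k6 has no built-in SSE client, so build a custom k6 binary that bundles the
      # xk6-sse extension, needed for the streaming bench scenario.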
      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd tools/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

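      # CUDA architecture 75 targets the Tesla T4 (compute capability 7.5) of the
      # default runner; only the llama-server target is built.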
      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DGGML_CUDA=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release -j $(nproc) --target llama-server

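      # ShareGPT conversations serve as realistic prompts for the bench.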
      - name: Download the dataset
        id: download_dataset
        run: |
          cd tools/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

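      # bench.py (see tools/server/bench) starts llama-server with the selected model,
      # drives it through the k6 scenario, and writes its summary to results.github.env,
      # which is appended to $GITHUB_ENV for the status and PR-comment steps below.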
      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

          cd tools/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch $HEAD_REF \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove the dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            tools/server/bench/*.jpg
            tools/server/bench/*.json
            tools/server/bench/*.log

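      # Publish the bench summary (BENCH_RESULTS, exported above via results.github.env)
      # as a commit status on the benched SHA.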
      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{ secrets.GITHUB_TOKEN }}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important: the upload service looks unstable (503s)
        id: imgur_step
        with:
          client_id: ${{ secrets.IMGUR_CLIENT_ID }}
          path: |
            tools/server/bench/prompt_tokens_seconds.jpg
            tools/server/bench/predicted_tokens_seconds.jpg
            tools/server/bench/kv_cache_usage_ratio.jpg
            tools/server/bench/requests_processing.jpg

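      # Export the mermaid charts produced by bench.py; multiline values appended to
      # $GITHUB_ENV must use the NAME<<EOF delimiter syntax.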
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd tools/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

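      # The imgur action returns its URLs as a JSON-encoded array string, hence the
      # fromJSON indexing below.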
      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_0=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

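      # Post the bench summary on the PR; message-id makes the action update the
      # existing comment in place on subsequent runs instead of adding a new one.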
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance-related PRs only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>