# TODO: there have been some issues with the workflow, so disabling for now
#       https://github.com/ggml-org/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark

on:
  # Manual trigger: lets a maintainer pick the GPU SKU, commit and bench duration.
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  # Only re-bench when core compute or server sources change.
  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp']
  # Nightly run at 02:04 UTC.
  schedule:
    - cron: '04 2 * * *'

# One bench at a time per ref/SHA; newer runs cancel older in-flight ones.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true
 40
 41jobs:
 42  bench-server-baseline:
 43    runs-on: Standard_NC4as_T4_v3
 44    env:
 45      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
 46      N_USERS: 8
 47      DURATION: 10m
 48
 49    strategy:
 50      matrix:
 51        model: [phi-2]
 52        ftype: [q4_0, q8_0, f16]
 53        include:
 54          - model: phi-2
 55            ftype: q4_0
 56            pr_comment_enabled: "true"
 57
 58    if: |
 59      inputs.gpu-series == 'Standard_NC4as_T4_v3'
 60      || github.event_name == 'pull_request_target'
 61    steps:
 62      - name: Clone
 63        id: checkout
 64        uses: actions/checkout@v4
 65        with:
 66          fetch-depth: 0
 67          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 68
 69      - name: Install python env
 70        id: pipenv
 71        run: |
 72          cd tools/server/bench
 73          python3 -m venv venv
 74          source venv/bin/activate
 75          pip install -r requirements.txt
 76
 77      - name: Prometheus
 78        id: install_prometheus
 79        run: |
 80          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
 81          tar xzf prometheus*.tar.gz --strip-components=1
 82          ./prometheus --config.file=tools/server/bench/prometheus.yml &
 83          while ! nc -z localhost 9090; do
 84            sleep 0.1
 85          done
 86
 87      - name: Set up Go
 88        uses: actions/setup-go@v5
 89        with:
 90          go-version: '1.21'
 91
 92      - name: Install k6 and xk6-sse
 93        id: k6_installation
 94        run: |
 95          cd tools/server/bench
 96          go install go.k6.io/xk6/cmd/xk6@latest
 97          xk6 build master \
 98              --with github.com/phymbert/xk6-sse
 99
100      - name: Build
101        id: cmake_build
102        run: |
103          set -eux
104          cmake -B build \
105              -DGGML_NATIVE=OFF \
106              -DLLAMA_BUILD_SERVER=ON \
107              -DLLAMA_CUBLAS=ON \
108              -DCUDAToolkit_ROOT=/usr/local/cuda \
109              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
110              -DCMAKE_CUDA_ARCHITECTURES=75 \
111              -DLLAMA_FATAL_WARNINGS=OFF \
112              -DLLAMA_ALL_WARNINGS=OFF \
113              -DCMAKE_BUILD_TYPE=Release;
114          cmake --build build --config Release -j $(nproc) --target llama-server
115
116      - name: Download the dataset
117        id: download_dataset
118        run: |
119          cd tools/server/bench
120          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
121
122      - name: Server bench
123        id: server_bench
124        env:
125            HEAD_REF: ${{ github.head_ref || github.ref_name }}
126        run: |
127          set -eux
128
129          cd tools/server/bench
130          source venv/bin/activate
131          python bench.py \
132              --runner-label ${{ env.RUNNER_LABEL }} \
133              --name ${{ github.job }} \
134              --branch $HEAD_REF \
135              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
136              --scenario script.js \
137              --duration ${{ github.event.inputs.duration || env.DURATION }} \
138              --hf-repo ggml-org/models	 \
139              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
140              --model-path-prefix /models \
141              --parallel ${{ env.N_USERS }} \
142              -ngl 33 \
143              --batch-size 2048 \
144              --ubatch-size	256 \
145              --ctx-size 16384 \
146              --n-prompts 1000 \
147              --max-prompt-tokens 1024 \
148              --max-tokens 2048
149
150          cat results.github.env >> $GITHUB_ENV
151
152          # Remove dataset as we do not want it in the artefact
153          rm ShareGPT_V3_unfiltered_cleaned_split.json
154
155      - uses: actions/upload-artifact@v4
156        with:
157          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
158          compression-level: 9
159          path: |
160            tools/server/bench/*.jpg
161            tools/server/bench/*.json
162            tools/server/bench/*.log
163
164      - name: Commit status
165        uses: Sibz/github-status-action@v1
166        with:
167          authToken: ${{secrets.GITHUB_TOKEN}}
168          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
169          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
170          description: |
171            ${{ env.BENCH_RESULTS }}
172          state: 'success'
173
174      - name: Upload benchmark images
175        uses: devicons/public-upload-to-imgur@v2.2.2
176        continue-on-error: true # Important as it looks unstable: 503
177        id: imgur_step
178        with:
179          client_id: ${{secrets.IMGUR_CLIENT_ID}}
180          path: |
181            tools/server/bench/prompt_tokens_seconds.jpg
182            tools/server/bench/predicted_tokens_seconds.jpg
183            tools/server/bench/kv_cache_usage_ratio.jpg
184            tools/server/bench/requests_processing.jpg
185
186      - name: Extract mermaid
187        id: set_mermaid
188        run: |
189          set -eux
190
191          cd tools/server/bench
192          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
193          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
194          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
195          echo "EOF" >> $GITHUB_ENV
196
197          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
198          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
199          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
200          echo "EOF" >> $GITHUB_ENV
201
202          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
203          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
204          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
205          echo "EOF" >> $GITHUB_ENV
206
207          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
208          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
209          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
210          echo "EOF" >> $GITHUB_ENV
211
212      - name: Extract image url
213        id: extract_image_url
214        continue-on-error: true
215        run: |
216          set -eux
217
218          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
219          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
220          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
221          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
222
223      - name: Comment PR
224        uses: mshick/add-pr-comment@v2
225        id: comment_pr
226        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
227        with:
228          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
229          message: |
230            <p align="center">
231
232            ๐Ÿ“ˆ **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** ๐Ÿš€
233
234            </p>
235
236            <details>
237
238            <summary>Expand details for performance related PR only</summary>
239
240            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
241            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
242            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
243            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
244            - ${{ env.BENCH_GRAPH_XLABEL }}
245
246
247            <p align="center">
248
249            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
250
251            <details>
252
253            <summary>More</summary>
254
255            ```mermaid
256            ${{ env.PROMPT_TOKENS_SECONDS }}
257            ```
258
259            </details>
260
261            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
262
263            <details>
264                <summary>More</summary>
265
266            ```mermaid
267            ${{ env.PREDICTED_TOKENS_SECONDS }}
268            ```
269
270            </details>
271
272            </p>
273
274            <details>
275
276            <summary>Details</summary>
277
278            <p align="center">
279
280            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
281
282            <details>
283                <summary>More</summary>
284
285            ```mermaid
286            ${{ env.KV_CACHE_USAGE_RATIO }}
287            ```
288
289            </details>
290
291            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
292
293            <details>
294                <summary>More</summary>
295
296            ```mermaid
297            ${{ env.REQUESTS_PROCESSING }}
298            ```
299
300            </details>
301
302            </p>
303            </details>
304            </details>