name: Server-Metal

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']

env:
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
  LLAMA_LOG_VERBOSITY: 10

# One run per ref at a time; a new push cancels the in-flight run for the same ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  server-metal:
    runs-on: [self-hosted, macOS, ARM64]

    name: server-metal (${{ matrix.wf_name }})
    strategy:
      matrix:
        # Base entry: single GPU, no extra env (extra_args intentionally unset).
        build_type: [Release]
        wf_name: ["GPUx1"]
        include:
          # extra_args are exported as environment variables before running pytest.
          - build_type: Release
            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
            wf_name: "GPUx1, backend-sampling"
          - build_type: Release
            extra_args: "GGML_METAL_DEVICES=2"
            wf_name: "GPUx2"
          - build_type: Release
            extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1"
            wf_name: "GPUx2, backend-sampling"
      fail-fast: false

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          # Manual-dispatch SHA wins; otherwise fall back through PR head / push SHA / ref.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Build
        id: cmake_build
        run: |
          # -DCMAKE_BUILD_TYPE is required for single-config generators (Makefiles/Ninja);
          # --config alone only affects multi-config generators and is otherwise ignored.
          cmake -B build -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server

      - name: Tests
        id: server_integration_tests
        # NOTE(review): no matrix entry defines disabled_on_pr, so this guard is
        # currently always true; kept so entries can opt out of PR runs later.
        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
          # Guard: a bare `export` (empty extra_args on the base matrix entry)
          # would dump the whole environment into the log.
          if [ -n "${{ matrix.extra_args }}" ]; then
            export ${{ matrix.extra_args }}
          fi
          # Honor the workflow_dispatch slow_tests toggle; push events have no
          # inputs, so they keep the fast "not slow" selection.
          if [ "${{ inputs.slow_tests }}" = "true" ]; then
            pytest -v -x
          else
            pytest -v -x -m "not slow"
          fi