diff options
Diffstat (limited to 'llama.cpp/benches/dgx-spark/dgx-spark.md')
| -rw-r--r-- | llama.cpp/benches/dgx-spark/dgx-spark.md | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/llama.cpp/benches/dgx-spark/dgx-spark.md b/llama.cpp/benches/dgx-spark/dgx-spark.md new file mode 100644 index 0000000..fd5c4e2 --- /dev/null +++ b/llama.cpp/benches/dgx-spark/dgx-spark.md | |||
| @@ -0,0 +1,311 @@ | |||
| 1 | ## System info | ||
| 2 | |||
| 3 | ```bash | ||
| 4 | uname --all | ||
| 5 | Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux | ||
| 6 | |||
| 7 | g++ --version | ||
| 8 | g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 | ||
| 9 | |||
| 10 | nvidia-smi | ||
| 11 | Thu Feb 5 13:49:40 2026 | ||
| 12 | +-----------------------------------------------------------------------------------------+ | ||
| 13 | | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | | ||
| 14 | +-----------------------------------------+------------------------+----------------------+ | ||
| 15 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | ||
| 16 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | ||
| 17 | | | | MIG M. | | ||
| 18 | |=========================================+========================+======================| | ||
| 19 | | 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A | | ||
| 20 | | N/A 47C P0 13W / N/A | Not Supported | 0% Default | | ||
| 21 | | | | N/A | | ||
| 22 | +-----------------------------------------+------------------------+----------------------+ | ||
| 23 | ``` | ||
| 24 | |||
| 25 | ## ggml-org/gpt-oss-20b-GGUF | ||
| 26 | |||
| 27 | Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF | ||
| 28 | |||
| 29 | - `llama-batched-bench` | ||
| 30 | |||
| 31 | |||
| 32 | main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | ||
| 33 | |||
| 34 | | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | | ||
| 35 | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| | ||
| 36 | | 512 | 32 | 1 | 544 | 0.270 | 1895.57 | 0.399 | 80.13 | 0.669 | 812.60 | | ||
| 37 | | 512 | 32 | 2 | 1088 | 0.230 | 4451.23 | 0.583 | 109.71 | 0.813 | 1337.56 | | ||
| 38 | | 512 | 32 | 4 | 2176 | 0.437 | 4688.87 | 0.820 | 156.03 | 1.257 | 1730.91 | | ||
| 39 | | 512 | 32 | 8 | 4352 | 0.863 | 4744.23 | 0.942 | 271.79 | 1.805 | 2410.73 | | ||
| 40 | | 512 | 32 | 16 | 8704 | 1.725 | 4748.19 | 1.173 | 436.38 | 2.899 | 3002.85 | | ||
| 41 | | 512 | 32 | 32 | 17408 | 3.437 | 4767.38 | 1.503 | 681.49 | 4.939 | 3524.40 | | ||
| 42 | | 4096 | 32 | 1 | 4128 | 0.907 | 4513.91 | 0.407 | 78.54 | 1.315 | 3139.56 | | ||
| 43 | | 4096 | 32 | 2 | 8256 | 1.796 | 4560.42 | 0.625 | 102.37 | 2.422 | 3409.45 | | ||
| 44 | | 4096 | 32 | 4 | 16512 | 3.596 | 4555.66 | 0.888 | 144.11 | 4.485 | 3681.93 | | ||
| 45 | | 4096 | 32 | 8 | 33024 | 7.184 | 4561.44 | 1.098 | 233.11 | 8.282 | 3987.51 | | ||
| 46 | | 4096 | 32 | 16 | 66048 | 14.369 | 4560.82 | 1.503 | 340.74 | 15.872 | 4161.30 | | ||
| 47 | | 4096 | 32 | 32 | 132096 | 28.760 | 4557.52 | 2.162 | 473.59 | 30.922 | 4271.95 | | ||
| 48 | | 8192 | 32 | 1 | 8224 | 1.859 | 4405.59 | 0.430 | 74.36 | 2.290 | 3591.61 | | ||
| 49 | | 8192 | 32 | 2 | 16448 | 3.698 | 4430.02 | 0.656 | 97.59 | 4.354 | 3777.47 | | ||
| 50 | | 8192 | 32 | 4 | 32896 | 7.403 | 4426.10 | 0.957 | 133.82 | 8.360 | 3934.97 | | ||
| 51 | | 8192 | 32 | 8 | 65792 | 14.802 | 4427.63 | 1.222 | 209.44 | 16.024 | 4105.87 | | ||
| 52 | | 8192 | 32 | 16 | 131584 | 29.596 | 4428.67 | 1.741 | 294.13 | 31.337 | 4199.00 | | ||
| 53 | | 8192 | 32 | 32 | 263168 | 59.169 | 4430.42 | 2.619 | 390.92 | 61.789 | 4259.17 | | ||
| 54 | |||
| 55 | |||
| 56 | - `llama-bench` | ||
| 57 | |||
| 58 | | model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | | ||
| 59 | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | | ||
| 60 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 4505.82 ± 12.90 | | ||
| 61 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 83.43 ± 0.59 | | ||
| 62 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 4158.34 ± 18.84 | | ||
| 63 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 79.22 ± 0.60 | | ||
| 64 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 3993.81 ± 17.55 | | ||
| 65 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 75.22 ± 1.05 | | ||
| 66 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 3449.98 ± 12.13 | | ||
| 67 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.36 ± 0.37 | | ||
| 68 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 2689.42 ± 18.89 | | ||
| 69 | | gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 61.65 ± 0.30 | | ||
| 70 | |||
| 71 | build: 11fb327bf (7941) | ||
| 72 | |||
| 73 | ## ggml-org/gpt-oss-120b-GGUF | ||
| 74 | |||
| 75 | Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF | ||
| 76 | |||
| 77 | - `llama-batched-bench` | ||
| 78 | |||
| 79 | |||
| 80 | main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | ||
| 81 | |||
| 82 | | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | | ||
| 83 | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| | ||
| 84 | | 512 | 32 | 1 | 544 | 0.445 | 1151.80 | 0.560 | 57.14 | 1.005 | 541.53 | | ||
| 85 | | 512 | 32 | 2 | 1088 | 0.472 | 2169.85 | 0.874 | 73.27 | 1.345 | 808.65 | | ||
| 86 | | 512 | 32 | 4 | 2176 | 0.826 | 2480.33 | 1.299 | 98.51 | 2.125 | 1023.94 | | ||
| 87 | | 512 | 32 | 8 | 4352 | 1.644 | 2491.67 | 1.608 | 159.18 | 3.252 | 1338.20 | | ||
| 88 | | 512 | 32 | 16 | 8704 | 3.292 | 2488.35 | 2.117 | 241.85 | 5.409 | 1609.13 | | ||
| 89 | | 512 | 32 | 32 | 17408 | 6.604 | 2481.07 | 2.898 | 353.31 | 9.502 | 1832.04 | | ||
| 90 | | 4096 | 32 | 1 | 4128 | 1.698 | 2412.65 | 0.580 | 55.21 | 2.277 | 1812.66 | | ||
| 91 | | 4096 | 32 | 2 | 8256 | 3.399 | 2409.88 | 0.934 | 68.53 | 4.333 | 1905.27 | | ||
| 92 | | 4096 | 32 | 4 | 16512 | 6.823 | 2401.21 | 1.411 | 90.72 | 8.234 | 2005.30 | | ||
| 93 | | 4096 | 32 | 8 | 33024 | 13.574 | 2413.97 | 1.841 | 139.07 | 15.415 | 2142.31 | | ||
| 94 | | 4096 | 32 | 16 | 66048 | 27.176 | 2411.52 | 2.609 | 196.26 | 29.785 | 2217.49 | | ||
| 95 | | 4096 | 32 | 32 | 132096 | 54.359 | 2411.23 | 3.905 | 262.20 | 58.264 | 2267.19 | | ||
| 96 | | 8192 | 32 | 1 | 8224 | 3.491 | 2346.81 | 0.613 | 52.23 | 4.103 | 2004.21 | | ||
| 97 | | 8192 | 32 | 2 | 16448 | 6.939 | 2361.03 | 0.981 | 65.21 | 7.921 | 2076.56 | | ||
| 98 | | 8192 | 32 | 4 | 32896 | 13.888 | 2359.40 | 1.511 | 84.71 | 15.399 | 2136.21 | | ||
| 99 | | 8192 | 32 | 8 | 65792 | 27.756 | 2361.18 | 2.034 | 125.86 | 29.790 | 2208.56 | | ||
| 100 | | 8192 | 32 | 16 | 131584 | 55.554 | 2359.34 | 3.021 | 169.49 | 58.575 | 2246.41 | | ||
| 101 | | 8192 | 32 | 32 | 263168 | 111.036 | 2360.89 | 4.537 | 225.72 | 115.573 | 2277.08 | | ||
| 102 | |||
| 103 | |||
| 104 | - `llama-bench` | ||
| 105 | |||
| 106 | | model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | | ||
| 107 | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | | ||
| 108 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2443.91 ± 7.47 | | ||
| 109 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 58.72 ± 0.20 | | ||
| 110 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2309.84 ± 3.63 | | ||
| 111 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 55.67 ± 0.35 | | ||
| 112 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2216.68 ± 10.16 | | ||
| 113 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 52.87 ± 0.43 | | ||
| 114 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1956.31 ± 6.39 | | ||
| 115 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 49.45 ± 0.20 | | ||
| 116 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1567.08 ± 11.79 | | ||
| 117 | | gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 42.76 ± 0.14 | | ||
| 118 | |||
| 119 | build: 11fb327bf (7941) | ||
| 120 | |||
| 121 | ## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF | ||
| 122 | |||
| 123 | Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF | ||
| 124 | |||
| 125 | - `llama-batched-bench` | ||
| 126 | |||
| 127 | |||
| 128 | main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | ||
| 129 | |||
| 130 | | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | | ||
| 131 | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| | ||
| 132 | | 512 | 32 | 1 | 544 | 0.393 | 1303.73 | 0.548 | 58.36 | 0.941 | 578.10 | | ||
| 133 | | 512 | 32 | 2 | 1088 | 0.387 | 2648.68 | 0.910 | 70.35 | 1.296 | 839.27 | | ||
| 134 | | 512 | 32 | 4 | 2176 | 0.659 | 3107.63 | 1.302 | 98.33 | 1.961 | 1109.77 | | ||
| 135 | | 512 | 32 | 8 | 4352 | 1.322 | 3099.35 | 1.669 | 153.42 | 2.990 | 1455.43 | | ||
| 136 | | 512 | 32 | 16 | 8704 | 2.639 | 3104.63 | 2.212 | 231.44 | 4.851 | 1794.32 | | ||
| 137 | | 512 | 32 | 32 | 17408 | 5.284 | 3100.80 | 2.955 | 346.53 | 8.239 | 2112.93 | | ||
| 138 | | 4096 | 32 | 1 | 4128 | 1.417 | 2890.36 | 0.598 | 53.51 | 2.015 | 2048.45 | | ||
| 139 | | 4096 | 32 | 2 | 8256 | 2.829 | 2895.62 | 1.019 | 62.82 | 3.848 | 2145.60 | | ||
| 140 | | 4096 | 32 | 4 | 16512 | 5.656 | 2896.96 | 1.528 | 83.79 | 7.183 | 2298.71 | | ||
| 141 | | 4096 | 32 | 8 | 33024 | 11.338 | 2890.02 | 2.127 | 120.36 | 13.465 | 2452.53 | | ||
| 142 | | 4096 | 32 | 16 | 66048 | 22.709 | 2885.96 | 3.104 | 164.97 | 25.812 | 2558.79 | | ||
| 143 | | 4096 | 32 | 32 | 132096 | 45.301 | 2893.35 | 4.723 | 216.80 | 50.024 | 2640.63 | | ||
| 144 | | 8192 | 32 | 1 | 8224 | 3.022 | 2711.09 | 0.678 | 47.20 | 3.700 | 2222.89 | | ||
| 145 | | 8192 | 32 | 2 | 16448 | 6.039 | 2713.01 | 1.149 | 55.70 | 7.188 | 2288.21 | | ||
| 146 | | 8192 | 32 | 4 | 32896 | 12.050 | 2719.35 | 1.785 | 71.69 | 13.835 | 2377.67 | | ||
| 147 | | 8192 | 32 | 8 | 65792 | 24.113 | 2717.90 | 2.629 | 97.39 | 26.741 | 2460.31 | | ||
| 148 | | 8192 | 32 | 16 | 131584 | 48.178 | 2720.58 | 4.099 | 124.91 | 52.277 | 2517.06 | | ||
| 149 | | 8192 | 32 | 32 | 263168 | 96.401 | 2719.31 | 6.696 | 152.93 | 103.097 | 2552.63 | | ||
| 150 | |||
| 151 | |||
| 152 | - `llama-bench` | ||
| 153 | |||
| 154 | | model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | | ||
| 155 | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | | ||
| 156 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2986.97 ± 18.87 | | ||
| 157 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 61.06 ± 0.23 | | ||
| 158 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2633.45 ± 6.26 | | ||
| 159 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 54.77 ± 0.28 | | ||
| 160 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2354.14 ± 3.84 | | ||
| 161 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 48.02 ± 0.40 | | ||
| 162 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1908.86 ± 4.25 | | ||
| 163 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 40.23 ± 0.10 | | ||
| 164 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1348.17 ± 2.00 | | ||
| 165 | | qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 30.21 ± 0.04 | | ||
| 166 | |||
| 167 | build: 11fb327bf (7941) | ||
| 168 | |||
| 169 | ## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF | ||
| 170 | |||
| 171 | Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF | ||
| 172 | |||
| 173 | - `llama-batched-bench` | ||
| 174 | |||
| 175 | |||
| 176 | main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | ||
| 177 | |||
| 178 | | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | | ||
| 179 | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| | ||
| 180 | | 512 | 32 | 1 | 544 | 0.212 | 2420.12 | 1.100 | 29.10 | 1.311 | 414.85 | | ||
| 181 | | 512 | 32 | 2 | 1088 | 0.428 | 2393.89 | 1.185 | 54.00 | 1.613 | 674.56 | | ||
| 182 | | 512 | 32 | 4 | 2176 | 0.894 | 2290.41 | 1.229 | 104.17 | 2.123 | 1025.02 | | ||
| 183 | | 512 | 32 | 8 | 4352 | 1.758 | 2330.36 | 1.319 | 194.15 | 3.076 | 1414.70 | | ||
| 184 | | 512 | 32 | 16 | 8704 | 3.508 | 2335.21 | 1.543 | 331.90 | 5.051 | 1723.33 | | ||
| 185 | | 512 | 32 | 32 | 17408 | 7.035 | 2328.93 | 1.738 | 589.21 | 8.773 | 1984.29 | | ||
| 186 | | 4096 | 32 | 1 | 4128 | 1.831 | 2237.25 | 1.125 | 28.44 | 2.956 | 1396.42 | | ||
| 187 | | 4096 | 32 | 2 | 8256 | 3.642 | 2249.48 | 1.253 | 51.07 | 4.895 | 1686.64 | | ||
| 188 | | 4096 | 32 | 4 | 16512 | 7.274 | 2252.26 | 1.380 | 92.72 | 8.655 | 1907.81 | | ||
| 189 | | 4096 | 32 | 8 | 33024 | 14.576 | 2248.09 | 1.617 | 158.29 | 16.193 | 2039.37 | | ||
| 190 | | 4096 | 32 | 16 | 66048 | 29.138 | 2249.17 | 2.081 | 246.01 | 31.219 | 2115.63 | | ||
| 191 | | 4096 | 32 | 32 | 132096 | 58.275 | 2249.19 | 2.814 | 363.87 | 61.089 | 2162.34 | | ||
| 192 | | 8192 | 32 | 1 | 8224 | 3.757 | 2180.26 | 1.184 | 27.03 | 4.941 | 1664.37 | | ||
| 193 | | 8192 | 32 | 2 | 16448 | 7.522 | 2178.05 | 1.341 | 47.73 | 8.863 | 1855.77 | | ||
| 194 | | 8192 | 32 | 4 | 32896 | 15.043 | 2178.25 | 1.548 | 82.69 | 16.591 | 1982.74 | | ||
| 195 | | 8192 | 32 | 8 | 65792 | 30.111 | 2176.49 | 1.937 | 132.13 | 32.048 | 2052.90 | | ||
| 196 | | 8192 | 32 | 16 | 131584 | 60.405 | 2169.90 | 2.706 | 189.21 | 63.111 | 2084.97 | | ||
| 197 | | 8192 | 32 | 32 | 263168 | 120.439 | 2176.58 | 3.993 | 256.46 | 124.432 | 2114.96 | | ||
| 198 | |||
| 199 | |||
| 200 | - `llama-bench` | ||
| 201 | |||
| 202 | | model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | | ||
| 203 | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | | ||
| 204 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2250.28 ± 6.41 | | ||
| 205 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 29.43 ± 0.02 | | ||
| 206 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2100.19 ± 8.96 | | ||
| 207 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 28.61 ± 0.02 | | ||
| 208 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2007.56 ± 4.16 | | ||
| 209 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 27.38 ± 0.09 | | ||
| 210 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1779.11 ± 6.42 | | ||
| 211 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 25.72 ± 0.03 | | ||
| 212 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1471.23 ± 1.71 | | ||
| 213 | | qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 22.51 ± 0.02 | | ||
| 214 | |||
| 215 | build: 11fb327bf (7941) | ||
| 216 | |||
| 217 | ## ggml-org/gemma-3-4b-it-qat-GGUF | ||
| 218 | |||
| 219 | Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF | ||
| 220 | |||
| 221 | - `llama-batched-bench` | ||
| 222 | |||
| 223 | |||
| 224 | main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | ||
| 225 | |||
| 226 | | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | | ||
| 227 | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| | ||
| 228 | | 512 | 32 | 1 | 544 | 0.092 | 5566.97 | 0.412 | 77.63 | 0.504 | 1078.95 | | ||
| 229 | | 512 | 32 | 2 | 1088 | 0.161 | 6345.67 | 0.522 | 122.70 | 0.683 | 1593.06 | | ||
| 230 | | 512 | 32 | 4 | 2176 | 0.325 | 6309.87 | 0.562 | 227.68 | 0.887 | 2453.87 | | ||
| 231 | | 512 | 32 | 8 | 4352 | 0.643 | 6374.42 | 0.685 | 373.67 | 1.328 | 3277.94 | | ||
| 232 | | 512 | 32 | 16 | 8704 | 1.277 | 6413.64 | 0.915 | 559.47 | 2.192 | 3970.01 | | ||
| 233 | | 512 | 32 | 32 | 17408 | 2.518 | 6506.57 | 1.249 | 819.61 | 3.767 | 4620.64 | | ||
| 234 | | 4096 | 32 | 1 | 4128 | 0.674 | 6079.68 | 0.453 | 70.60 | 1.127 | 3662.88 | | ||
| 235 | | 4096 | 32 | 2 | 8256 | 1.335 | 6137.82 | 0.627 | 102.03 | 1.962 | 4208.11 | | ||
| 236 | | 4096 | 32 | 4 | 16512 | 2.657 | 6167.35 | 0.749 | 170.92 | 3.405 | 4848.71 | | ||
| 237 | | 4096 | 32 | 8 | 33024 | 5.307 | 6173.91 | 0.974 | 262.89 | 6.281 | 5257.53 | | ||
| 238 | | 4096 | 32 | 16 | 66048 | 10.610 | 6176.96 | 1.379 | 371.42 | 11.988 | 5509.40 | | ||
| 239 | | 4096 | 32 | 32 | 132096 | 21.213 | 6178.89 | 2.122 | 482.50 | 23.335 | 5660.82 | | ||
| 240 | | 8192 | 32 | 1 | 8224 | 1.359 | 6027.34 | 0.467 | 68.52 | 1.826 | 4503.48 | | ||
| 241 | | 8192 | 32 | 2 | 16448 | 2.699 | 6069.68 | 0.653 | 98.03 | 3.352 | 4906.68 | | ||
| 242 | | 8192 | 32 | 4 | 32896 | 5.366 | 6106.74 | 0.818 | 156.55 | 6.184 | 5319.96 | | ||
| 243 | | 8192 | 32 | 8 | 65792 | 10.755 | 6093.50 | 1.174 | 218.04 | 11.929 | 5515.22 | | ||
| 244 | | 8192 | 32 | 16 | 131584 | 21.484 | 6100.82 | 1.829 | 279.90 | 23.314 | 5644.11 | | ||
| 245 | | 8192 | 32 | 32 | 263168 | 42.950 | 6103.40 | 3.058 | 334.91 | 46.008 | 5720.05 | | ||
| 246 | |||
| 247 | |||
| 248 | - `llama-bench` | ||
| 249 | |||
| 250 | | model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | | ||
| 251 | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | | ||
| 252 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 5948.74 ± 10.61 | | ||
| 253 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 81.05 ± 0.20 | | ||
| 254 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 5652.69 ± 34.29 | | ||
| 255 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 76.37 ± 0.58 | | ||
| 256 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 5509.57 ± 40.69 | | ||
| 257 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 71.61 ± 0.80 | | ||
| 258 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 5340.86 ± 36.92 | | ||
| 259 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.89 ± 0.34 | | ||
| 260 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 5023.30 ± 13.52 | | ||
| 261 | | gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 62.28 ± 0.30 | | ||
| 262 | |||
| 263 | build: 11fb327bf (7941) | ||
| 264 | |||
| 265 | ## ggml-org/GLM-4.7-Flash-GGUF | ||
| 266 | |||
| 267 | Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF | ||
| 268 | |||
| 269 | - `llama-batched-bench` | ||
| 270 | |||
| 271 | |||
| 272 | main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | ||
| 273 | |||
| 274 | | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | | ||
| 275 | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| | ||
| 276 | | 512 | 32 | 1 | 544 | 0.433 | 1181.83 | 0.693 | 46.16 | 1.126 | 482.94 | | ||
| 277 | | 512 | 32 | 2 | 1088 | 0.439 | 2334.46 | 1.034 | 61.89 | 1.473 | 738.75 | | ||
| 278 | | 512 | 32 | 4 | 2176 | 0.772 | 2654.46 | 1.459 | 87.76 | 2.230 | 975.77 | | ||
| 279 | | 512 | 32 | 8 | 4352 | 1.541 | 2658.78 | 2.043 | 125.31 | 3.583 | 1214.47 | | ||
| 280 | | 512 | 32 | 16 | 8704 | 3.083 | 2656.91 | 2.675 | 191.42 | 5.758 | 1511.62 | | ||
| 281 | | 512 | 32 | 32 | 17408 | 6.159 | 2660.12 | 3.615 | 283.24 | 9.774 | 1780.98 | | ||
| 282 | | 4096 | 32 | 1 | 4128 | 1.915 | 2139.30 | 0.725 | 44.14 | 2.640 | 1563.83 | | ||
| 283 | | 4096 | 32 | 2 | 8256 | 3.834 | 2136.40 | 1.119 | 57.21 | 4.953 | 1666.81 | | ||
| 284 | | 4096 | 32 | 4 | 16512 | 7.636 | 2145.72 | 1.631 | 78.49 | 9.266 | 1781.93 | | ||
| 285 | | 4096 | 32 | 8 | 33024 | 15.295 | 2142.40 | 2.344 | 109.21 | 17.639 | 1872.20 | | ||
| 286 | | 4096 | 32 | 16 | 66048 | 30.573 | 2143.62 | 3.773 | 135.70 | 34.346 | 1923.04 | | ||
| 287 | | 4096 | 32 | 32 | 132096 | 61.282 | 2138.82 | 5.795 | 176.71 | 67.077 | 1969.31 | | ||
| 288 | | 8192 | 32 | 1 | 8224 | 4.510 | 1816.24 | 0.760 | 42.11 | 5.270 | 1560.44 | | ||
| 289 | | 8192 | 32 | 2 | 16448 | 9.036 | 1813.19 | 1.206 | 53.06 | 10.242 | 1605.91 | | ||
| 290 | | 8192 | 32 | 4 | 32896 | 18.070 | 1813.43 | 1.783 | 71.80 | 19.852 | 1657.03 | | ||
| 291 | | 8192 | 32 | 8 | 65792 | 36.125 | 1814.15 | 2.635 | 97.14 | 38.760 | 1697.41 | | ||
| 292 | | 8192 | 32 | 16 | 131584 | 72.367 | 1811.20 | 4.954 | 103.34 | 77.322 | 1701.77 | | ||
| 293 | | 8192 | 32 | 32 | 263168 | 144.501 | 1814.13 | 8.103 | 126.37 | 152.604 | 1724.51 | | ||
| 294 | |||
| 295 | |||
| 296 | - `llama-bench` | ||
| 297 | |||
| 298 | | model | size | params | backend | ngl | n_ubatch | fa | dio | test | t/s | | ||
| 299 | | ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | --: | --------------: | -------------------: | | ||
| 300 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 | 2364.18 ± 11.43 | | ||
| 301 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 | 48.68 ± 0.12 | | ||
| 302 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d4096 | 1684.13 ± 1.24 | | ||
| 303 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d4096 | 44.62 ± 0.22 | | ||
| 304 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d8192 | 1314.68 ± 1.41 | | ||
| 305 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d8192 | 42.59 ± 0.11 | | ||
| 306 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d16384 | 914.05 ± 3.32 | | ||
| 307 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d16384 | 38.72 ± 0.13 | | ||
| 308 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d32768 | 567.20 ± 0.90 | | ||
| 309 | | deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d32768 | 32.65 ± 0.09 | | ||
| 310 | |||
| 311 | build: 11fb327bf (7941) | ||
