path: root/llama.cpp/tools/server/bench/script.js
author     Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
committer  Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
commit     b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree       211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tools/server/bench/script.js
download   llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/tools/server/bench/script.js')
-rw-r--r--  llama.cpp/tools/server/bench/script.js  162
1 file changed, 162 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/tools/server/bench/script.js b/llama.cpp/tools/server/bench/script.js
new file mode 100644
index 0000000..2772bee
--- /dev/null
+++ b/llama.cpp/tools/server/bench/script.js
@@ -0,0 +1,162 @@
+import sse from 'k6/x/sse'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Rate, Trend} from 'k6/metrics'
+import exec from 'k6/execution'
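+// NOTE: 'k6/x/sse' is provided by the xk6-sse extension, so this script needs a k6 binary built with it (e.g. via xk6).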
+
+// Server chat completions prefix
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+
+// Total number of prompts to use from the dataset - default: 10 min (600 s) / 10 s per request * 8 users = 480
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+
+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+
+// Max prompt tokens
+const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024
+
+// Max slot context
+const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048
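+
+// Example override (values are illustrative):
+//   SERVER_BENCH_URL=http://localhost:8080/v1 SERVER_BENCH_N_PROMPTS=480 k6 run script.js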
+
+export function setup() {
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
+}
+
+const data = new SharedArray('conversations', function () {
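+    // Crude token estimate: split on whitespace and punctuation; a rough stand-in for the model tokenizer.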
+    const tokenizer = (message) => message.split(/[\s,'".?]/)
+
+    return JSON.parse(open(dataset_path))
+        // Filter out conversations with fewer than 2 turns.
+        .filter(data => data["conversations"].length >= 2)
+        .filter(data => data["conversations"][0]["from"] === "human")
+        .map(data => {
+            return {
+                prompt: data["conversations"][0]["value"],
+                n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length,
+                n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length,
+            }
+        })
+        // Filter out sequences that are too short.
+        .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
+        // Filter out sequences that are too long.
+        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
+        // Keep only the first n_prompt prompts.
+        .slice(0, n_prompt)
+})
+
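+// Custom metrics: Trends record per-iteration distributions, Counters accumulate running totals, Rates track boolean ratios.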
+const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
+const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
+
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
+
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
+
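+// Test shape: 8 virtual users for 10 minutes, matching the default n_prompt sizing above.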
+export const options = {
+    thresholds: {
+        llamacpp_completions_truncated_rate: [
+            // abort the test if more than 80% of completions are truncated
+            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
+        ],
+    },
+    duration: '10m',
+    vus: 8,
+}
+
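+// One benchmark iteration: pick a conversation round-robin by iteration index and stream a chat completion over SSE.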
+export default function () {
+    const conversation = data[exec.scenario.iterationInInstance % data.length]
+    const payload = {
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are ChatGPT, an AI assistant.",
+            },
+            {
+                "role": "user",
+                "content": conversation.prompt,
+            }
+        ],
+        "model": model,
+        "stream": true,
+        "stream_options": {
+            "include_usage": true, // `false` is not yet supported by the llama.cpp server
+        },
+        "seed": 42,
+        "max_tokens": max_tokens,
+        "stop": ["<|im_end|>"] // Temporary, for the phi-2 base model (i.e. not instruction-tuned), since the server expects the model to always emit BOS
+    }
+
+    const params = {method: 'POST', body: JSON.stringify(payload)}
+
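+    // Timestamps and token counts for time-to-first-token and throughput, filled in by the SSE handlers below.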
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
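+            // The first SSE event marks the end of prompt processing: record time to first token.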
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+            }
+
+            if (event.data === '[DONE]' || event.data === '') {
+                return
+            }
+
+            let chunk = JSON.parse(event.data)
+
+            if (chunk.choices && chunk.choices.length > 0) {
+                let choice = chunk.choices[0]
+                if (choice.finish_reason) {
+                    finish_reason = choice.finish_reason
+                }
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error())
+            throw e
+        })
+    })
+
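+    // sse.open blocks until the stream closes and returns the HTTP response checked below.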
+    check(res, {'successful completion': (r) => r.status === 200})
+
+    const endTime = new Date()
+
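+    // Prompt processing speed: prompt tokens over the time to first token.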
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / promptEvalTime * 1.e3)
+    }
+
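+    // Generation speed: completion tokens over the time from first token to stream end.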
+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
+    sleep(0.3)
+}