import sse from 'k6/x/sse'
import {check, sleep} from 'k6'
import {SharedArray} from 'k6/data'
import {Counter, Rate, Trend} from 'k6/metrics'
import exec from 'k6/execution'

// Server chat completions URL prefix
const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'

// Total number of prompts to draw from the dataset - default: 10 min test / 10 s per request * 8 users = 480
const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8

// Model name to request
const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'

// Dataset path
const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'

// Max tokens to predict
const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512

// Max prompt tokens
const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024

// Max slot context
const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048

export function setup() {
    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
}

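// Load and pre-filter the dataset once; SharedArray keeps a single read-only copy shared across all VUs.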
const data = new SharedArray('conversations', function () {
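    // Rough token-count approximation: split on whitespace and punctuation (not the model's actual tokenizer)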
    const tokenizer = (message) => message.split(/[\s,'".?]/)

    return JSON.parse(open(dataset_path))
        // Filter out the conversations with less than 2 turns.
        .filter(data => data["conversations"].length >= 2)
        .filter(data => data["conversations"][0]["from"] === "human")
        .map(data => {
            return {
                prompt: data["conversations"][0]["value"],
                n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length,
                n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length,
            }
        })
        // Filter out too short sequences.
        .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
        // Filter out too long sequences.
        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
        // Keep only the first n_prompt prompts.
        .slice(0, n_prompt)
})

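// Custom llama.cpp metrics reported in the k6 end-of-test summary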
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // Abort the test if more than 80% of completions are truncated.
            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
    duration: '10m',
    vus: 8,
}

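// One VU iteration: pick a conversation, stream a chat completion over SSE, and record metrics.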
export default function () {
    const conversation = data[exec.scenario.iterationInInstance % data.length]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": "You are ChatGPT, an AI assistant.",
            },
            {
                "role": "user",
                "content": conversation.prompt,
            }
        ],
        "model": model,
        "stream": true,
        "stream_options": {
            "include_usage": true, // include_usage: false is not yet supported by the llama.cpp server
        },
        "seed": 42,
        "max_tokens": max_tokens,
        "stop": ["<|im_end|>"] // Temporary workaround for phi-2 base (i.e. not instruction-tuned), since the server expects the model to always emit BOS
    }

    const params = {method: 'POST', body: JSON.stringify(payload)}

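    // Timestamps and counters used to derive time-to-first-token and generation throughput.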
    const startTime = new Date()
    let promptEvalEndTime = null
    let prompt_tokens = 0
    let completions_tokens = 0
    let finish_reason = null
    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
        client.on('event', function (event) {
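            // The first SSE event marks the end of prompt processing (time to first token).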
            if (promptEvalEndTime == null) {
                promptEvalEndTime = new Date()
                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
            }

            if (event.data === '[DONE]' || event.data === '') {
                return
            }

            let chunk = JSON.parse(event.data)

            if (chunk.choices && chunk.choices.length > 0) {
                let choice = chunk.choices[0]
                if (choice.finish_reason) {
                    finish_reason = choice.finish_reason
                }
            }

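            // The final chunk carries token usage totals when stream_options.include_usage is set.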
            if (chunk.usage) {
                prompt_tokens = chunk.usage.prompt_tokens
                llamacpp_prompt_tokens.add(prompt_tokens)
                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)

                completions_tokens = chunk.usage.completion_tokens
                llamacpp_completion_tokens.add(completions_tokens)
                llamacpp_completion_tokens_total_counter.add(completions_tokens)
            }
        })

        client.on('error', function (e) {
            console.log('An unexpected error occurred: ', e.error())
            throw e
        })
    })

    check(res, {'successful completion': (r) => r.status === 200})

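    // Derive throughput metrics from the timings captured during streaming.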
    const endTime = new Date()

    const promptEvalTime = promptEvalEndTime - startTime
    if (promptEvalTime > 0) {
        llamacpp_prompt_processing_second.add(prompt_tokens / promptEvalTime * 1.e3)
    }

    const completion_time = endTime - promptEvalEndTime
    if (completions_tokens > 0 && completion_time > 0) {
        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
    }
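    // finish_reason 'length' means the completion was truncated at max_tokens; 'stop' means a stop token/sequence was hit.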
    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
    llamacpp_completions_stop_rate.add(finish_reason === 'stop')

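    // Brief pause between iterations.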
    sleep(0.3)
}