import pytest
import requests
import time
import random

from openai import OpenAI
from utils import *

server = ServerPreset.tinyllama2()

JSON_MULTIMODAL_KEY = "multimodal_data"
JSON_PROMPT_STRING_KEY = "prompt_string"

@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()

@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 64, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
])
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
        "return_tokens": return_tokens,
    })
    assert res.status_code == 200
    assert res.body["timings"]["prompt_n"] == n_prompt
    assert res.body["timings"]["predicted_n"] == n_predicted
    assert res.body["truncated"] == truncated
    assert type(res.body["has_new_line"]) == bool
    assert match_regex(re_content, res.body["content"])
    if return_tokens:
        assert len(res.body["tokens"]) > 0
        assert all(type(tok) == int for tok in res.body["tokens"])
    else:
        assert res.body["tokens"] == []


@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 64, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
])
def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
    global server
    server.start()
    res = server.make_stream_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
        "stream": True,
    })
    content = ""
    for data in res:
        assert "stop" in data and type(data["stop"]) == bool
        if data["stop"]:
            assert data["timings"]["prompt_n"] == n_prompt
            assert data["timings"]["predicted_n"] == n_predicted
            assert data["truncated"] == truncated
            assert data["stop_type"] == "limit"
            assert type(data["has_new_line"]) == bool
            assert "generation_settings" in data
            assert server.n_predict is not None
            assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
            assert data["generation_settings"]["seed"] == server.seed
            assert match_regex(re_content, content)
        else:
            assert len(data["tokens"]) > 0
            assert all(type(tok) == int for tok in data["tokens"])
            content += data["content"]


def test_completion_stream_vs_non_stream():
    global server
    server.start()
    res_stream = server.make_stream_request("POST", "/completion", data={
        "n_predict": 8,
        "prompt": "I believe the meaning of life is",
        "stream": True,
    })
    res_non_stream = server.make_request("POST", "/completion", data={
        "n_predict": 8,
        "prompt": "I believe the meaning of life is",
    })
    content_stream = ""
    for data in res_stream:
        content_stream += data["content"]
    assert content_stream == res_non_stream.body["content"]


def test_completion_with_openai_library():
    global server
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="I believe the meaning of life is",
        max_tokens=8,
    )
    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
    assert res.choices[0].finish_reason == "length"
    assert res.choices[0].text is not None
    assert match_regex("(going|bed)+", res.choices[0].text)


def test_completion_stream_with_openai_library():
    global server
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="I believe the meaning of life is",
        max_tokens=8,
        stream=True,
    )
    output_text = ''
    for data in res:
        choice = data.choices[0]
        if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
    assert match_regex("(going|bed)+", output_text)


# Test case from https://github.com/ggml-org/llama.cpp/issues/13780
@pytest.mark.slow
def test_completion_stream_with_openai_library_stops():
    global server
    server.model_hf_repo = "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M"
    server.model_hf_file = None
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="System: You are helpfull assistant.\nAssistant:\nHey! How could I help?\nUser:\nTell me a joke.\nAssistant:\n",
        stop=["User:\n", "Assistant:\n"],
        max_tokens=200,
        stream=True,
    )
    output_text = ''
    for data in res:
        choice = data.choices[0]
        if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
    assert match_regex("Sure, here's one for[\\s\\S]*", output_text), f'Unexpected output: {output_text}'


@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res


@pytest.mark.parametrize("n_slots", [1, 2])
def test_different_result_different_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for seed in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": seed,
            "temperature": 1.0,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] != last_res.body["content"]
        last_res = res

# TODO: figure out why it doesn't work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
    global server
    server.n_batch = n_batch
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": temperature,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res


@pytest.mark.skip(reason="This test fails on linux, need to be fixed")
def test_cache_vs_nocache_prompt():
    global server
    server.start()
    res_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": True,
    })
    res_no_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res_cache.body["content"] == res_no_cache.body["content"]


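# with the small context of the test preset, the repeated prompt below is expected to be
# too long to fit, so the server should reject the request rather than process it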
def test_nocache_long_input_prompt():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is"*32,
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res.status_code == 400

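# the prompt can also be passed as a JSON object; with only a prompt_string field it
# should behave like a plain string prompt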
def test_json_prompt_no_mtmd():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is" },
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res.status_code == 200

def test_json_prompt_mtm_error_when_not_supported():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is <__media__>", JSON_MULTIMODAL_KEY: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" },
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    # MTMD is disabled on this model, so this should fail.
    assert res.status_code != 200

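# the prompt field also accepts raw token ids, as well as arrays that mix token ids,
# strings and JSON prompt objects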
def test_completion_with_tokens_input():
    global server
    server.temperature = 0.0
    server.start()
    prompt_str = "I believe the meaning of life is"
    res = server.make_request("POST", "/tokenize", data={
        "content": prompt_str,
        "add_special": True,
    })
    assert res.status_code == 200
    tokens = res.body["tokens"]

    # single completion
    res = server.make_request("POST", "/completion", data={
        "prompt": tokens,
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str

    # batch completion
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, tokens],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed JSON and tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": [
            tokens,
            {
                JSON_PROMPT_STRING_KEY: "I believe the meaning of life is",
            },
        ],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens in one sequence
    res = server.make_request("POST", "/completion", data={
        "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str


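# completion requests are run in parallel and interleaved with a /slots check to verify
# that the expected number of slots is busy while the requests are in flight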
@pytest.mark.parametrize("n_slots,n_requests", [
    (1, 3),
    (2, 2),
    (2, 4),
    (4, 2), # some slots must be idle
    (4, 6),
])
def test_completion_parallel_slots(n_slots: int, n_requests: int):
    global server
    server.n_slots = n_slots
    server.temperature = 0.0
    server.start()

    PROMPTS = [
        ("Write a very long book.", "(very|special|big)+"),
        ("Write another a poem.", "(small|house)+"),
        ("What is LLM?", "(Dad|said)+"),
        ("The sky is blue and I love it.", "(climb|leaf)+"),
        ("Write another very long music lyrics.", "(friends|step|sky)+"),
        ("Write a very long joke.", "(cat|Whiskers)+"),
    ]
    def check_slots_status():
        should_all_slots_busy = n_requests >= n_slots
        time.sleep(0.1)
        res = server.make_request("GET", "/slots")
        n_busy = sum([1 for slot in res.body if slot["is_processing"]])
        if should_all_slots_busy:
            assert n_busy == n_slots
        else:
            assert n_busy <= n_slots

    tasks = []
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        tasks.append((server.make_request, ("POST", "/completion", {
            "prompt": prompt,
            "seed": 42,
            "temperature": 1.0,
        })))
    tasks.append((check_slots_status, ()))
    results = parallel_function_calls(tasks)

    # check results
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        res = results[i]
        assert res.status_code == 200
        assert type(res.body["content"]) == str
        assert len(res.body["content"]) > 10
        # FIXME: the result is not deterministic when using a slot other than slot 0
        # assert match_regex(re_content, res.body["content"])


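# with a unified KV cache (kv_unified) the slots share a single context of n_ctx tokens,
# so whether an individual request fits is expected to depend on the other requests
# running in parallel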
@pytest.mark.parametrize(
    "n_ctx,n_slots,n_predict_vals,expected_success",
    [
        (256, 4, [80, 40, 80, 80], [True, True, True, True]),
        (256, 4, [70, 70, 70, 70], [False, False, False, False]),
        (256, 4, [90, 90, 40, 90], [False, False, True, False]),
        (256, 4, [90, 90, 40, 75], [True, True, True, True]),
    ],
)
def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    global server
    server.n_slots = n_slots
    server.kv_unified = True
    server.n_ctx = n_ctx
    server.start()
    prompt = "A"
    tasks = []
    for n_predict in n_predict_vals:
        tasks.append((server.make_request, ("POST", "/completion", {"prompt": prompt, "n_predict": n_predict})))
    results = parallel_function_calls(tasks)
    for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
        if expect_ok:
            assert res.status_code == 200

        # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
        if res.status_code == 200:
            assert "content" in res.body
            if "timings" in res.body:
                assert res.body["timings"]["predicted_n"] == n_predict


@pytest.mark.parametrize(
    "prompt,n_predict,response_fields",
    [
        ("I believe the meaning of life is", 8, []),
        ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
    ],
)
def test_completion_response_fields(
    prompt: str, n_predict: int, response_fields: list[str]
):
    global server
    server.start()
    res = server.make_request(
        "POST",
        "/completion",
        data={
            "n_predict": n_predict,
            "prompt": prompt,
            "response_fields": response_fields,
        },
    )
    assert res.status_code == 200
    assert "content" in res.body
    assert len(res.body["content"])
    if len(response_fields):
        assert res.body["generation_settings/n_predict"] == n_predict
        assert res.body["prompt"] == "<s> " + prompt
        assert isinstance(res.body["content"], str)
        assert len(res.body) == len(response_fields)
    else:
        assert len(res.body)
        assert "generation_settings" in res.body


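# n_probs requests the top-N log-probabilities for each generated token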
def test_n_probs():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
    })
    assert res.status_code == 200
    assert "completion_probabilities" in res.body
    assert len(res.body["completion_probabilities"]) == 5
    for tok in res.body["completion_probabilities"]:
        assert "id" in tok and tok["id"] > 0
        assert "token" in tok and type(tok["token"]) == str
        assert "logprob" in tok and tok["logprob"] <= 0.0
        assert "bytes" in tok and type(tok["bytes"]) == list
        assert len(tok["top_logprobs"]) == 10
        for prob in tok["top_logprobs"]:
            assert "id" in prob and prob["id"] > 0
            assert "token" in prob and type(prob["token"]) == str
            assert "logprob" in prob and prob["logprob"] <= 0.0
            assert "bytes" in prob and type(prob["bytes"]) == list


def test_n_probs_stream():
    global server
    server.start()
    res = server.make_stream_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
        "stream": True,
    })
    for data in res:
        if data["stop"] == False:
            assert "completion_probabilities" in data
            assert len(data["completion_probabilities"]) == 1
            for tok in data["completion_probabilities"]:
                assert "id" in tok and tok["id"] > 0
                assert "token" in tok and type(tok["token"]) == str
                assert "logprob" in tok and tok["logprob"] <= 0.0
                assert "bytes" in tok and type(tok["bytes"]) == list
                assert len(tok["top_logprobs"]) == 10
                for prob in tok["top_logprobs"]:
                    assert "id" in prob and prob["id"] > 0
                    assert "token" in prob and type(prob["token"]) == str
                    assert "logprob" in prob and prob["logprob"] <= 0.0
                    assert "bytes" in prob and type(prob["bytes"]) == list


def test_n_probs_post_sampling():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
        "post_sampling_probs": True,
    })
    assert res.status_code == 200
    assert "completion_probabilities" in res.body
    assert len(res.body["completion_probabilities"]) == 5
    for tok in res.body["completion_probabilities"]:
        assert "id" in tok and tok["id"] > 0
        assert "token" in tok and type(tok["token"]) == str
        assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
        assert "bytes" in tok and type(tok["bytes"]) == list
        assert len(tok["top_probs"]) == 10
        for prob in tok["top_probs"]:
            assert "id" in prob and prob["id"] > 0
            assert "token" in prob and type(prob["token"]) == str
            assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
            assert "bytes" in prob and type(prob["bytes"]) == list
        # because the test model usually outputs tokens with either 100% or 0% probability, we need to check all the top_probs
        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])


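# logit_bias entries can be [token_id, bias] pairs, [string, bias] pairs (the string form
# is resolved to tokens by the server), or an OpenAI-style {token: bias} mapping;
# words banned with a large negative bias should not appear in the output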
@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)])
def test_logit_bias(tokenize, openai_style):
    global server
    server.start()

    exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"]

    logit_bias = []
    if tokenize:
        res = server.make_request("POST", "/tokenize", data={
            "content": " " + " ".join(exclude) + " ",
        })
        assert res.status_code == 200
        tokens = res.body["tokens"]
        logit_bias = [[tok, -100] for tok in tokens]

    else:
        logit_bias = [[" " + tok + " ", -100] for tok in exclude]

    if openai_style:
        logit_bias = {el[0]: -100 for el in logit_bias}

    res = server.make_request("POST", "/completion", data={
        "n_predict": 64,
        "prompt": "What is the best book",
        "logit_bias": logit_bias,
        "temperature": 0.0
    })
    assert res.status_code == 200
    output_text = res.body["content"]
    assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude)


def test_cancel_request():
    global server
    server.n_ctx = 4096
    server.n_predict = -1
    server.n_slots = 1
    server.server_slots = True
    server.start()
    # send a request that will take a long time, but cancel it before it finishes
    try:
        server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
        }, timeout=0.1)
    except requests.exceptions.ReadTimeout:
        pass # expected
    # make sure the slot is free
    time.sleep(1) # wait for HTTP_POLLING_SECONDS
    res = server.make_request("GET", "/slots")
    assert res.body[0]["is_processing"] == False


# this test exercises the host-memory prompt cache
# ref: https://github.com/ggml-org/llama.cpp/pull/16391
# ref: https://github.com/ggml-org/llama.cpp/pull/17078
def test_completion_prompt_cache():
    global server
    server.n_slots = 2
    server.kv_unified = True
    server.start()

    for _ in range(16):
        # generate alternating random prompts with variable lengths in order to get them in and out of the cache
        r = random.randint(0, 4)
        prompt = (" Hello " + str(r)) * (40 + r)
        # expected prompt length: (40 + r) repetitions, each presumably tokenizing to 5 tokens, plus 2 extra tokens
        n_prompt = (40 + r)*5 + 2
        n_predict = random.randint(1, 8)

        res = server.make_request(
            "POST",
            "/completion",
            data={
                "prompt": prompt,
                "n_predict": n_predict,
            },
        )

        assert res.status_code == 200
        assert "content" in res.body
        content = res.body["content"]
        assert isinstance(content, str)
        assert len(content) > 0

        assert type(res.body["has_new_line"]) == bool
        assert "timings" in res.body
        timings = res.body["timings"]

        assert "prompt_n" in timings and timings["prompt_n"] + timings["cache_n"] == n_prompt
        assert "predicted_n" in timings and timings["predicted_n"] == n_predict
        assert "tokens" in res.body and isinstance(res.body["tokens"], list)