import pytest
import requests
import time
import random

from openai import OpenAI
from utils import *

server = ServerPreset.tinyllama2()

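# keys understood by the server when a prompt element is sent as a JSON object
# instead of a plain string (see the JSON-prompt tests below)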
JSON_MULTIMODAL_KEY = "multimodal_data"
JSON_PROMPT_STRING_KEY = "prompt_string"

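# autouse fixture: recreate the server preset before every test so that
# configuration changes made by one test do not leak into the next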
@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()


@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 64, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
])
def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
        "return_tokens": return_tokens,
    })
    assert res.status_code == 200
    assert res.body["timings"]["prompt_n"] == n_prompt
    assert res.body["timings"]["predicted_n"] == n_predicted
    assert res.body["truncated"] == truncated
    assert type(res.body["has_new_line"]) == bool
    assert match_regex(re_content, res.body["content"])
    if return_tokens:
        assert len(res.body["tokens"]) > 0
        assert all(type(tok) == int for tok in res.body["tokens"])
    else:
        assert res.body["tokens"] == []


@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
    ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
    ("Write a joke about AI from a very long prompt which will not be truncated", 64, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
])
def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
    global server
    server.start()
    res = server.make_stream_request("POST", "/completion", data={
        "n_predict": n_predict,
        "prompt": prompt,
        "stream": True,
    })
    content = ""
    for data in res:
        assert "stop" in data and type(data["stop"]) == bool
        if data["stop"]:
            assert data["timings"]["prompt_n"] == n_prompt
            assert data["timings"]["predicted_n"] == n_predicted
            assert data["truncated"] == truncated
            assert data["stop_type"] == "limit"
            assert type(data["has_new_line"]) == bool
            assert "generation_settings" in data
            assert server.n_predict is not None
            assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
            assert data["generation_settings"]["seed"] == server.seed
            assert match_regex(re_content, content)
        else:
            assert len(data["tokens"]) > 0
            assert all(type(tok) == int for tok in data["tokens"])
            content += data["content"]


def test_completion_stream_vs_non_stream():
    global server
    server.start()
    res_stream = server.make_stream_request("POST", "/completion", data={
        "n_predict": 8,
        "prompt": "I believe the meaning of life is",
        "stream": True,
    })
    res_non_stream = server.make_request("POST", "/completion", data={
        "n_predict": 8,
        "prompt": "I believe the meaning of life is",
    })
    content_stream = ""
    for data in res_stream:
        content_stream += data["content"]
    assert content_stream == res_non_stream.body["content"]


def test_completion_with_openai_library():
    global server
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="I believe the meaning of life is",
        max_tokens=8,
    )
    assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
    assert res.choices[0].finish_reason == "length"
    assert res.choices[0].text is not None
    assert match_regex("(going|bed)+", res.choices[0].text)


def test_completion_stream_with_openai_library():
    global server
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="I believe the meaning of life is",
        max_tokens=8,
        stream=True,
    )
    output_text = ''
    for data in res:
        choice = data.choices[0]
        if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
    assert match_regex("(going|bed)+", output_text)


# Test case from https://github.com/ggml-org/llama.cpp/issues/13780
@pytest.mark.slow
def test_completion_stream_with_openai_library_stops():
    global server
    server.model_hf_repo = "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M"
    server.model_hf_file = None
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="System: You are helpfull assistant.\nAssistant:\nHey! How could I help?\nUser:\nTell me a joke.\nAssistant:\n",
        stop=["User:\n", "Assistant:\n"],
        max_tokens=200,
        stream=True,
    )
    output_text = ''
    for data in res:
        choice = data.choices[0]
        if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
    assert match_regex("Sure, here's one for[\\s\\S]*", output_text), f'Unexpected output: {output_text}'


@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res


@pytest.mark.parametrize("n_slots", [1, 2])
def test_different_result_different_seed(n_slots: int):
    global server
    server.n_slots = n_slots
    server.start()
    last_res = None
    for seed in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": seed,
            "temperature": 1.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] != last_res.body["content"]
        last_res = res


# TODO: figure out why this does not work with temperature = 1
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
@pytest.mark.parametrize("n_batch", [16, 32])
@pytest.mark.parametrize("temperature", [0.0])
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
    global server
    server.n_batch = n_batch
    server.start()
    last_res = None
    for _ in range(4):
        res = server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
            "seed": 42,
            "temperature": temperature,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
        if last_res is not None:
            assert res.body["content"] == last_res.body["content"]
        last_res = res


@pytest.mark.skip(reason="This test fails on Linux, needs to be fixed")
def test_cache_vs_nocache_prompt():
    global server
    server.start()
    res_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": True,
    })
    res_no_cache = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res_cache.body["content"] == res_no_cache.body["content"]


def test_nocache_long_input_prompt():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is" * 32,
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res.status_code == 400


def test_json_prompt_no_mtmd():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is" },
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    assert res.status_code == 200


def test_json_prompt_mtmd_error_when_not_supported():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is <__media__>", JSON_MULTIMODAL_KEY: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" },
        "seed": 42,
        "temperature": 1.0,
        "cache_prompt": False,
    })
    # MTMD is disabled on this model, so this should fail.
    assert res.status_code != 200


def test_completion_with_tokens_input():
    global server
    server.temperature = 0.0
    server.start()
    prompt_str = "I believe the meaning of life is"
    res = server.make_request("POST", "/tokenize", data={
        "content": prompt_str,
        "add_special": True,
    })
    assert res.status_code == 200
    tokens = res.body["tokens"]

    # single completion
    res = server.make_request("POST", "/completion", data={
        "prompt": tokens,
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str

    # batch completion
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, tokens],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": [tokens, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed JSON and tokens
    res = server.make_request("POST", "/completion", data={
        "prompt": [
            tokens,
            {
                JSON_PROMPT_STRING_KEY: "I believe the meaning of life is",
            },
        ],
    })
    assert res.status_code == 200
    assert type(res.body) == list
    assert len(res.body) == 2
    assert res.body[0]["content"] == res.body[1]["content"]

    # mixed string and tokens in one sequence
    res = server.make_request("POST", "/completion", data={
        "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str],
    })
    assert res.status_code == 200
    assert type(res.body["content"]) == str


@pytest.mark.parametrize("n_slots,n_requests", [
    (1, 3),
    (2, 2),
    (2, 4),
    (4, 2),  # some slots must be idle
    (4, 6),
])
def test_completion_parallel_slots(n_slots: int, n_requests: int):
    global server
    server.n_slots = n_slots
    server.temperature = 0.0
    server.start()

    PROMPTS = [
        ("Write a very long book.", "(very|special|big)+"),
        ("Write another a poem.", "(small|house)+"),
        ("What is LLM?", "(Dad|said)+"),
        ("The sky is blue and I love it.", "(climb|leaf)+"),
        ("Write another very long music lyrics.", "(friends|step|sky)+"),
        ("Write a very long joke.", "(cat|Whiskers)+"),
    ]
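    # scheduled as the last task in parallel_function_calls below, so it polls
    # /slots while the completion requests are still being processed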
    def check_slots_status():
        should_all_slots_busy = n_requests >= n_slots
        time.sleep(0.1)
        res = server.make_request("GET", "/slots")
        n_busy = sum(1 for slot in res.body if slot["is_processing"])
        if should_all_slots_busy:
            assert n_busy == n_slots
        else:
            assert n_busy <= n_slots

    tasks = []
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        tasks.append((server.make_request, ("POST", "/completion", {
            "prompt": prompt,
            "seed": 42,
            "temperature": 1.0,
        })))
    tasks.append((check_slots_status, ()))
    results = parallel_function_calls(tasks)

    # check results
    for i in range(n_requests):
        prompt, re_content = PROMPTS[i % len(PROMPTS)]
        res = results[i]
        assert res.status_code == 200
        assert type(res.body["content"]) == str
        assert len(res.body["content"]) > 10
        # FIXME: the result is not deterministic when using a slot other than slot 0
        # assert match_regex(re_content, res.body["content"])


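# with kv_unified = True the slots share a single KV cache of n_ctx tokens
# (rather than each slot getting its own slice), so whether a given mix of
# concurrent requests succeeds depends on their combined context usage;
# see the PR comment linked inside the test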
@pytest.mark.parametrize(
    "n_ctx,n_slots,n_predict_vals,expected_success",
    [
        (256, 4, [80, 40, 80, 80], [True,  True,  True,  True]),
        (256, 4, [70, 70, 70, 70], [False, False, False, False]),
        (256, 4, [90, 90, 40, 90], [False, False, True,  False]),
        (256, 4, [90, 90, 40, 75], [True,  True,  True,  True]),
    ],
)
def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    global server
    server.n_slots = n_slots
    server.kv_unified = True
    server.n_ctx = n_ctx
    server.start()
    prompt = "A"
    tasks = []
    for n_predict in n_predict_vals:
        tasks.append((server.make_request, ("POST", "/completion", {"prompt": prompt, "n_predict": n_predict})))
    results = parallel_function_calls(tasks)
    for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
        if expect_ok:
            assert res.status_code == 200

        # note: https://github.com/ggml-org/llama.cpp/pull/18700#issuecomment-3728695581
        if res.status_code == 200:
            assert "content" in res.body
            if "timings" in res.body:
                assert res.body["timings"]["predicted_n"] == n_predict


@pytest.mark.parametrize(
    "prompt,n_predict,response_fields",
    [
        ("I believe the meaning of life is", 8, []),
        ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
    ],
)
def test_completion_response_fields(
    prompt: str, n_predict: int, response_fields: list[str]
):
    global server
    server.start()
    res = server.make_request(
        "POST",
        "/completion",
        data={
            "n_predict": n_predict,
            "prompt": prompt,
            "response_fields": response_fields,
        },
    )
    assert res.status_code == 200
    assert "content" in res.body
    assert len(res.body["content"])
    if len(response_fields):
        assert res.body["generation_settings/n_predict"] == n_predict
        assert res.body["prompt"] == "<s> " + prompt
        assert isinstance(res.body["content"], str)
        assert len(res.body) == len(response_fields)
    else:
        assert len(res.body)
        assert "generation_settings" in res.body


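# n_probs asks the server to attach the top-N candidate probabilities for each
# generated token under "completion_probabilities"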
def test_n_probs():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
    })
    assert res.status_code == 200
    assert "completion_probabilities" in res.body
    assert len(res.body["completion_probabilities"]) == 5
    for tok in res.body["completion_probabilities"]:
        assert "id" in tok and tok["id"] > 0
        assert "token" in tok and type(tok["token"]) == str
        assert "logprob" in tok and tok["logprob"] <= 0.0
        assert "bytes" in tok and type(tok["bytes"]) == list
        assert len(tok["top_logprobs"]) == 10
        for prob in tok["top_logprobs"]:
            assert "id" in prob and prob["id"] > 0
            assert "token" in prob and type(prob["token"]) == str
            assert "logprob" in prob and prob["logprob"] <= 0.0
            assert "bytes" in prob and type(prob["bytes"]) == list


def test_n_probs_stream():
    global server
    server.start()
    res = server.make_stream_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
        "stream": True,
    })
    for data in res:
        if data["stop"] == False:
            assert "completion_probabilities" in data
            assert len(data["completion_probabilities"]) == 1
            for tok in data["completion_probabilities"]:
                assert "id" in tok and tok["id"] > 0
                assert "token" in tok and type(tok["token"]) == str
                assert "logprob" in tok and tok["logprob"] <= 0.0
                assert "bytes" in tok and type(tok["bytes"]) == list
                assert len(tok["top_logprobs"]) == 10
                for prob in tok["top_logprobs"]:
                    assert "id" in prob and prob["id"] > 0
                    assert "token" in prob and type(prob["token"]) == str
                    assert "logprob" in prob and prob["logprob"] <= 0.0
                    assert "bytes" in prob and type(prob["bytes"]) == list


def test_n_probs_post_sampling():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is",
        "n_probs": 10,
        "temperature": 0.0,
        "n_predict": 5,
        "post_sampling_probs": True,
    })
    assert res.status_code == 200
    assert "completion_probabilities" in res.body
    assert len(res.body["completion_probabilities"]) == 5
    for tok in res.body["completion_probabilities"]:
        assert "id" in tok and tok["id"] > 0
        assert "token" in tok and type(tok["token"]) == str
        assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
        assert "bytes" in tok and type(tok["bytes"]) == list
        assert len(tok["top_probs"]) == 10
        for prob in tok["top_probs"]:
            assert "id" in prob and prob["id"] > 0
            assert "token" in prob and type(prob["token"]) == str
            assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
            assert "bytes" in prob and type(prob["bytes"]) == list
        # because the test model usually outputs tokens with either 100% or 0% probability, we need to check all the top_probs
        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])


@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)])
def test_logit_bias(tokenize, openai_style):
    global server
    server.start()

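    # logit_bias is accepted in three shapes: [[token_id, bias], ...] pairs,
    # [[string, bias], ...] pairs (tokenized server-side), or an OpenAI-style
    # {token: bias} mapping; a bias of -100 effectively bans a token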
    exclude = ["i", "I", "the", "The", "to", "a", "an", "be", "is", "was", "but", "But", "and", "And", "so", "So", "you", "You", "he", "He", "she", "She", "we", "We", "they", "They", "it", "It", "his", "His", "her", "Her", "book", "Book"]

    logit_bias = []
    if tokenize:
        res = server.make_request("POST", "/tokenize", data={
            "content": " " + " ".join(exclude) + " ",
        })
        assert res.status_code == 200
        tokens = res.body["tokens"]
        logit_bias = [[tok, -100] for tok in tokens]
    else:
        logit_bias = [[" " + tok + " ", -100] for tok in exclude]

    if openai_style:
        logit_bias = {el[0]: -100 for el in logit_bias}

    res = server.make_request("POST", "/completion", data={
        "n_predict": 64,
        "prompt": "What is the best book",
        "logit_bias": logit_bias,
        "temperature": 0.0
    })
    assert res.status_code == 200
    output_text = res.body["content"]
    assert all(output_text.find(" " + tok + " ") == -1 for tok in exclude)


550
551def test_cancel_request():
552    global server
553    server.n_ctx = 4096
554    server.n_predict = -1
555    server.n_slots = 1
556    server.server_slots = True
557    server.start()
558    # send a request that will take a long time, but cancel it before it finishes
559    try:
560        server.make_request("POST", "/completion", data={
561            "prompt": "I believe the meaning of life is",
562        }, timeout=0.1)
563    except requests.exceptions.ReadTimeout:
564        pass # expected
565    # make sure the slot is free
566    time.sleep(1) # wait for HTTP_POLLING_SECONDS
567    res = server.make_request("GET", "/slots")
568    assert res.body[0]["is_processing"] == False
569
570
# this test exercises the host-memory prompt cache
# ref: https://github.com/ggml-org/llama.cpp/pull/16391
# ref: https://github.com/ggml-org/llama.cpp/pull/17078
def test_completion_prompt_cache():
    global server
    server.n_slots = 2
    server.kv_unified = True
    server.start()

    for _ in range(16):
        # generate alternating random prompts with variable lengths in order to get them in and out of the cache
        r = random.randint(0, 4)
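        # expected prompt size: assumes each " Hello <r>" repetition tokenizes
        # to 5 tokens with this model, plus 2 extra tokens (e.g. BOS)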
        prompt = (" Hello " + str(r)) * (40 + r)
        n_prompt = (40 + r)*5 + 2
        n_predict = random.randint(1, 8)

        res = server.make_request(
            "POST",
            "/completion",
            data={
                "prompt": prompt,
                "n_predict": n_predict,
            },
        )

        assert res.status_code == 200
        assert "content" in res.body
        content = res.body["content"]
        assert isinstance(content, str)
        assert len(content) > 0

        assert type(res.body["has_new_line"]) == bool
        assert "timings" in res.body
        timings = res.body["timings"]

        assert "prompt_n" in timings and timings["prompt_n"] + timings["cache_n"] == n_prompt
        assert "predicted_n" in timings and timings["predicted_n"] == n_predict
        assert "tokens" in res.body and isinstance(res.body["tokens"], list)