# llama.cpp/tools/server/tests/unit/test_infill.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77

import pytest
from utils import *

# Module-level server handle used by every test in this file. The autouse
# `create_server` fixture rebinds it to a fresh preset before each test, so
# this initial value mainly ensures the name exists at import time.
server = ServerPreset.tinyllama_infill()

@pytest.fixture(autouse=True)
def create_server():
    """Rebind the module-level ``server`` to a new infill preset.

    Declared ``autouse`` so it runs before every test in this module; each
    test therefore works with its own preset object and is responsible for
    calling ``server.start()`` itself.
    """
    global server
    fresh_preset = ServerPreset.tinyllama_infill()
    server = fresh_preset


def test_infill_without_input_extra():
    """POST /infill with prefix, prompt and suffix only (no ``input_extra``).

    Expects HTTP 200 and generated content matching the word pattern below.
    Uses the module-level ``server`` (read-only here, so no ``global`` needed).
    """
    payload = {
        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
        "prompt": "    int n_threads = llama_",
        "input_suffix": "}\n",
    }

    server.start()
    response = server.make_request("POST", "/infill", data=payload)

    assert response.status_code == 200
    generated = response.body["content"]
    assert match_regex("(Ann|small|shiny|Daddy|Jimmy)+", generated)


def test_infill_with_input_extra():
    """POST /infill with one ``input_extra`` entry alongside prefix/prompt/suffix.

    Expects HTTP 200 and generated content matching the word pattern below.
    Uses the module-level ``server`` (read-only here, so no ``global`` needed).
    """
    # Extra context entry sent with the request: a filename plus its text.
    extra_context = {
        "filename": "llama.h",
        "text": "LLAMA_API int32_t llama_n_threads();\n"
    }
    payload = {
        "input_extra": [extra_context],
        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
        "prompt": "    int n_threads = llama_",
        "input_suffix": "}\n",
    }

    server.start()
    response = server.make_request("POST", "/infill", data=payload)

    assert response.status_code == 200
    generated = response.body["content"]
    assert match_regex("(Dad|excited|park|Jimmy)+", generated)


# Each case is an `input_extra` entry the server is expected to reject:
# presumably because a key is missing or a value is not a string
# (NOTE(review): exact validation rules live in the server; confirm there).
@pytest.mark.parametrize("input_extra", [
    {},
    {"filename": "ok"},
    {"filename": 123},
    {"filename": 123, "text": "abc"},
    {"filename": 123, "text": 456},
])
def test_invalid_input_extra_req(input_extra):
    """POST /infill with a malformed ``input_extra`` entry must fail.

    Args:
        input_extra: the single (invalid) entry placed in the request's
            ``input_extra`` list.

    Expects HTTP 400 and an ``"error"`` key in the response body.
    """
    request_body = {
        "input_extra": [input_extra],
        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
        "prompt": "    int n_threads = llama_",
        "input_suffix": "}\n",
    }

    server.start()
    reply = server.make_request("POST", "/infill", data=request_body)

    assert reply.status_code == 400
    assert "error" in reply.body


@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_qwen_model():
    """Run /infill against the Qwen2.5-Coder model and check the exact output.

    Skipped unless slow tests are allowed. The preset's local model file is
    cleared and a Hugging Face repo/file pair is set instead before the
    server is started with a 600-second timeout.
    """
    # Only attributes of the shared object are set; the name is not rebound,
    # so no `global` declaration is required.
    server.model_file = None
    server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
    server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
    server.start(timeout_seconds=600)

    extra_context = {
        "filename": "llama.h",
        "text": "LLAMA_API int32_t llama_n_threads();\n"
    }
    payload = {
        "input_extra": [extra_context],
        "input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
        "prompt": "    int n_threads = llama_",
        "input_suffix": "}\n",
    }
    expected_completion = "n_threads();\n    printf(\"Number of threads: %d\\n\", n_threads);\n    return 0;\n"

    response = server.make_request("POST", "/infill", data=payload)

    assert response.status_code == 200
    assert response.body["content"] == expected_completion