import time

import pytest

from utils import *
3
4server: ServerProcess
5
@pytest.fixture(autouse=True)
def create_server():
    """Provision a fresh (not yet started) router server preset for each test."""
    global server
    server = ServerPreset.router()
10
11
@pytest.mark.parametrize(
    "model,success",
    [
        ("ggml-org/tinygemma3-GGUF:Q8_0", True),
        ("non-existent/model", False),
    ]
)
def test_router_chat_completion_stream(model: str, success: bool):
    """Stream a chat completion through the router.

    A known model must stream non-empty content with no error; an unknown
    model must raise ServerError and stream nothing.
    """
    global server
    server.start()
    accumulated = ""
    caught: ServerError | None = None
    try:
        stream = server.make_stream_request("POST", "/chat/completions", data={
            "model": model,
            "max_tokens": 16,
            "messages": [
                {"role": "user", "content": "hello"},
            ],
            "stream": True,
        })
        for chunk in stream:
            if not chunk["choices"]:
                continue
            choice = chunk["choices"][0]
            if choice["finish_reason"] in ["stop", "length"]:
                # The terminal chunk must not carry a content delta.
                assert "content" not in choice["delta"]
            else:
                assert choice["finish_reason"] is None
                accumulated += choice["delta"]["content"] or ''
    except ServerError as e:
        caught = e

    if success:
        assert caught is None
        assert len(accumulated) > 0
    else:
        assert caught is not None
        assert accumulated == ""
50
51
def _get_model_status(model_id: str) -> str:
    """Return the status value reported by /models for *model_id*.

    Raises AssertionError if the model does not appear in the listing.
    """
    res = server.make_request("GET", "/models")
    assert res.status_code == 200
    statuses = (
        entry["status"]["value"]
        for entry in res.body.get("data", [])
        if model_id in (entry.get("id"), entry.get("model"))
    )
    for status in statuses:
        return status
    raise AssertionError(f"Model {model_id} not found in /models response")
59
60
def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
    """Poll /models until *model_id* reaches one of the *desired* statuses.

    Args:
        model_id: model to watch.
        desired: set of acceptable status values (e.g. {"loaded"}).
        timeout: maximum number of seconds to keep polling.

    Returns:
        The status value that matched.

    Raises:
        AssertionError: if the deadline passes first; the message includes
            the last observed status to aid debugging.
    """
    # Use a monotonic clock for the deadline: model loads can take minutes,
    # and time.time() can jump (NTP sync, manual clock changes), which would
    # shorten or lengthen the wait arbitrarily.
    deadline = time.monotonic() + timeout
    last_status = None
    while time.monotonic() < deadline:
        last_status = _get_model_status(model_id)
        if last_status in desired:
            return last_status
        time.sleep(1)
    raise AssertionError(
        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
    )
72
73
def _load_model_and_wait(
    model_id: str, timeout: int = 60, headers: dict | None = None
) -> None:
    """Ask the router to load *model_id*, then block until it is "loaded"."""
    response = server.make_request(
        "POST", "/models/load", data={"model": model_id}, headers=headers
    )
    assert response.status_code == 200
    body = response.body
    assert isinstance(body, dict)
    assert body.get("success") is True
    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)
84
85
def test_router_unload_model():
    """A loaded model can be explicitly unloaded via /models/unload."""
    global server
    server.start()
    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    _load_model_and_wait(model_id)

    res = server.make_request("POST", "/models/unload", data={"model": model_id})
    assert res.status_code == 200
    assert res.body.get("success") is True
    _wait_for_model_status(model_id, {"unloaded"})
97
98
def test_router_models_max_evicts_lru():
    """With models_max=2, loading a third model evicts the least recently used."""
    global server
    server.models_max = 2
    server.start()

    first, second, third = (
        "ggml-org/tinygemma3-GGUF:Q8_0",
        "ggml-org/test-model-stories260K",
        "ggml-org/test-model-stories260K-infill",
    )

    # Fill the cache with the first two models and confirm both are resident.
    for model in (first, second):
        _load_model_and_wait(model, timeout=120)
    assert _get_model_status(first) == "loaded"
    assert _get_model_status(second) == "loaded"

    # Loading a third model exceeds models_max and should evict `first`,
    # the least recently used entry.
    _load_model_and_wait(third, timeout=120)
    assert _get_model_status(third) == "loaded"
    assert _get_model_status(first) == "unloaded"
126
127
def test_router_no_models_autoload():
    """With autoload disabled, completions fail until the model is loaded explicitly."""
    global server
    server.no_models_autoload = True
    server.start()
    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    def chat():
        # Fire a minimal chat completion request at the router.
        return server.make_request(
            "POST",
            "/v1/chat/completions",
            data={
                "model": model_id,
                "messages": [{"role": "user", "content": "hello"}],
                "max_tokens": 4,
            },
        )

    # Model not loaded yet and autoload is off -> request is rejected.
    rejected = chat()
    assert rejected.status_code == 400
    assert "error" in rejected.body

    _load_model_and_wait(model_id)

    # After an explicit load the identical request succeeds.
    accepted = chat()
    assert accepted.status_code == 200
    assert "error" not in accepted.body
159
160
def test_router_api_key_required():
    """Requests without the router API key get 401; authorized ones succeed."""
    global server
    server.api_key = "sk-router-secret"
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
    auth_headers = {"Authorization": f"Bearer {server.api_key}"}
    payload = {
        "model": model_id,
        "messages": [{"role": "user", "content": "hello"}],
        "max_tokens": 4,
    }

    # No Authorization header -> rejected before reaching the model.
    denied = server.make_request(
        "POST",
        "/v1/chat/completions",
        data=payload,
    )
    assert denied.status_code == 401
    assert denied.body.get("error", {}).get("type") == "authentication_error"

    _load_model_and_wait(model_id, headers=auth_headers)

    # Same request with the bearer token attached is served normally.
    granted = server.make_request(
        "POST",
        "/v1/chat/completions",
        headers=auth_headers,
        data=payload,
    )
    assert granted.status_code == 200
    assert "error" not in granted.body