import time

import pytest

from utils import *
3
4server: ServerProcess
5
@pytest.fixture(autouse=True)
def create_server():
    """Provision a fresh (not yet started) router server preset for each test."""
    global server
    server = ServerPreset.router()
10
11
@pytest.mark.parametrize(
    "model,success",
    [
        ("ggml-org/tinygemma3-GGUF:Q8_0", True),
        ("non-existent/model", False),
    ]
)
def test_router_chat_completion_stream(model: str, success: bool):
    """Stream a chat completion through the router.

    A known model must stream non-empty content with no error; an unknown
    model must raise ServerError and stream nothing.
    """
    global server
    server.start()
    accumulated = ""
    caught: ServerError | None = None
    try:
        stream = server.make_stream_request("POST", "/chat/completions", data={
            "model": model,
            "max_tokens": 16,
            "messages": [
                {"role": "user", "content": "hello"},
            ],
            "stream": True,
        })
        for chunk in stream:
            if not chunk["choices"]:
                continue
            choice = chunk["choices"][0]
            if choice["finish_reason"] in ["stop", "length"]:
                # The terminal chunk must not carry a content delta.
                assert "content" not in choice["delta"]
            else:
                assert choice["finish_reason"] is None
                accumulated += choice["delta"]["content"] or ''
    except ServerError as e:
        caught = e

    if success:
        assert caught is None
        assert len(accumulated) > 0
    else:
        assert caught is not None
        assert accumulated == ""
50
51
def _get_model_status(model_id: str) -> str:
    """Return the status value reported by /models for *model_id*.

    Raises AssertionError if the model does not appear in the listing.
    """
    res = server.make_request("GET", "/models")
    assert res.status_code == 200
    statuses = (
        entry["status"]["value"]
        for entry in res.body.get("data", [])
        if model_id in (entry.get("id"), entry.get("model"))
    )
    for status in statuses:
        return status
    raise AssertionError(f"Model {model_id} not found in /models response")
59
60
def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
    """Poll /models until *model_id* reaches one of the *desired* statuses.

    Args:
        model_id: model to watch.
        desired: set of acceptable status values (e.g. {"loaded"}).
        timeout: maximum number of seconds to keep polling.

    Returns:
        The status value that matched.

    Raises:
        AssertionError: if the deadline passes first; the message includes
            the last observed status to aid debugging.
    """
    # Use a monotonic clock for the deadline: model loads can take minutes,
    # and time.time() can jump (NTP sync, manual clock changes), which would
    # shorten or lengthen the wait arbitrarily.
    deadline = time.monotonic() + timeout
    last_status = None
    while time.monotonic() < deadline:
        last_status = _get_model_status(model_id)
        if last_status in desired:
            return last_status
        time.sleep(1)
    raise AssertionError(
        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
    )
72
73
def _load_model_and_wait(
    model_id: str, timeout: int = 60, headers: dict | None = None
) -> None:
    """Ask the router to load *model_id*, then block until it is "loaded"."""
    response = server.make_request(
        "POST", "/models/load", data={"model": model_id}, headers=headers
    )
    assert response.status_code == 200
    body = response.body
    assert isinstance(body, dict)
    assert body.get("success") is True
    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)
84
85
def test_router_unload_model():
    """A loaded model can be explicitly unloaded via /models/unload."""
    global server
    server.start()
    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    _load_model_and_wait(model_id)

    res = server.make_request("POST", "/models/unload", data={"model": model_id})
    assert res.status_code == 200
    assert res.body.get("success") is True
    _wait_for_model_status(model_id, {"unloaded"})
97
98
def test_router_models_max_evicts_lru():
    """With models_max=2, loading a third model evicts the least recently used."""
    global server
    server.models_max = 2
    server.start()

    first, second, third = (
        "ggml-org/tinygemma3-GGUF:Q8_0",
        "ggml-org/test-model-stories260K",
        "ggml-org/test-model-stories260K-infill",
    )

    # Fill the cache with the first two models and confirm both are resident.
    for model in (first, second):
        _load_model_and_wait(model, timeout=120)
    assert _get_model_status(first) == "loaded"
    assert _get_model_status(second) == "loaded"

    # Loading a third model exceeds models_max and should evict `first`,
    # the least recently used entry.
    _load_model_and_wait(third, timeout=120)
    assert _get_model_status(third) == "loaded"
    assert _get_model_status(first) == "unloaded"
126
127
def test_router_no_models_autoload():
    """With autoload disabled, completions fail until the model is loaded explicitly."""
    global server
    server.no_models_autoload = True
    server.start()
    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"

    def chat():
        # Fire a minimal chat completion request at the router.
        return server.make_request(
            "POST",
            "/v1/chat/completions",
            data={
                "model": model_id,
                "messages": [{"role": "user", "content": "hello"}],
                "max_tokens": 4,
            },
        )

    # Model not loaded yet and autoload is off -> request is rejected.
    rejected = chat()
    assert rejected.status_code == 400
    assert "error" in rejected.body

    _load_model_and_wait(model_id)

    # After an explicit load the identical request succeeds.
    accepted = chat()
    assert accepted.status_code == 200
    assert "error" not in accepted.body
159
160
def test_router_api_key_required():
    """Requests without the router API key get 401; authorized ones succeed."""
    global server
    server.api_key = "sk-router-secret"
    server.start()

    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
    auth_headers = {"Authorization": f"Bearer {server.api_key}"}
    payload = {
        "model": model_id,
        "messages": [{"role": "user", "content": "hello"}],
        "max_tokens": 4,
    }

    # No Authorization header -> rejected before reaching the model.
    denied = server.make_request(
        "POST",
        "/v1/chat/completions",
        data=payload,
    )
    assert denied.status_code == 401
    assert denied.body.get("error", {}).get("type") == "authentication_error"

    _load_model_and_wait(model_id, headers=auth_headers)

    # Same request with the bearer token attached is served normally.
    granted = server.make_request(
        "POST",
        "/v1/chat/completions",
        headers=auth_headers,
        data=payload,
    )
    assert granted.status_code == 200
    assert "error" not in granted.body