#!/usr/bin/env python
import pytest

# ensure grandparent path is in sys.path
from pathlib import Path
import sys
path = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(path))

from utils import *
from enum import Enum

server: ServerProcess

TIMEOUT_START_SLOW = 15 * 60 # this is needed for real model tests
TIMEOUT_HTTP_REQUEST = 60

@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.tinyllama2()
    server.model_alias = "tinyllama-2-tool-call"
    server.server_port = 8081
    server.n_slots = 1
    server.n_ctx = 8192
    server.n_batch = 2048
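
# The autouse fixture above only prepares a fresh server preset for each test;
# tests adjust the fields they need (template, context size, ...) and call
# server.start() themselves.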

class CompletionMode(Enum):
    NORMAL = "normal"
    STREAMED = "streamed"

TEST_TOOL = {
    "type": "function",
    "function": {
        "name": "test",
        "description": "",
        "parameters": {
            "type": "object",
            "properties": {
                "success": {"type": "boolean", "const": True},
            },
            "required": ["success"]
        }
    }
}
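
# For reference, a conforming answer to TEST_TOOL carries a single tool call
# whose arguments are a JSON-encoded string, e.g. (the id is illustrative and
# not asserted on below):
#   {"type": "function", "function": {"name": "test", "arguments": "{\"success\": true}"}}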

PYTHON_TOOL = {
    "type": "function",
    "function": {
        "name": "python",
        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
        "parameters": {
            "type": "object",
            "properties": {
                "code": {
                    "type": "string",
                    "description": "The code to run in the ipython interpreter."
                }
            },
            "required": ["code"]
        }
    }
}

WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
                }
            },
            "required": ["location"]
        }
    }
}
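

# Several tests below accept a template_override that is either a
# (hf_repo, variant) tuple pointing at a locally exported .jinja file, a plain
# string naming a built-in chat template (e.g. "chatml"), or None to keep the
# model's default template. This helper factors out that shared handling.
def apply_template_override(server: ServerProcess, template_override: str | Tuple[str, str | None] | None):
    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
    elif isinstance(template_override, str):
        server.chat_template = template_override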


def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "user", "content": "Write an example"},
        ],
        "tool_choice": "required",
        "tools": [tool],
        "parallel_tool_calls": False,
        **kwargs,
    })
    choice = body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    # assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
    assert isinstance(actual_arguments, str)
    if argument_key is not None:
        actual_arguments = json.loads(actual_arguments)
        assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"


@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,tool,argument_key", [
    ("google-gemma-2-2b-it", TEST_TOOL, "success"),
    ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"),
    ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"),
])
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
    global server
    n_predict = 1024
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start()
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)


@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,tool,argument_key", [
    ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"),
    ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"),

    ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"),
    ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"),

    ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"),
    # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
    # ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"),

    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"),
    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"),

    ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"),
    ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"),

    ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"),
    ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"),

    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"),
    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"),

    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"),
    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"),

    ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"),
    # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"),
])
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
    global server
    n_predict = 512
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)


@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
    (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),

    # (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    # (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    # (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
    (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)),
    (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    global server
    n_predict = 512
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    apply_template_override(server, template_override)
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "user", "content": "Write an example"},
        ],
        "tool_choice": "required",
        "tools": [tool],
        "parallel_tool_calls": False,
        "stream": stream == CompletionMode.STREAMED,
        "temperature": 0.0,
        "top_k": 1,
        "top_p": 1.0,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
    assert isinstance(actual_arguments, str)
    if argument_key is not None:
        actual_arguments = json.loads(actual_arguments)
        assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"


def do_test_completion_without_tool_call(server: ServerProcess, n_predict: int, tools: list[dict], tool_choice: str | None, **kwargs):
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "user", "content": "say hello world with python"},
        ],
        "tools": tools if tools else None,
        "tool_choice": tool_choice,
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'


@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
    ("meta-llama-Llama-3.3-70B-Instruct", 128, [], None),
    ("meta-llama-Llama-3.3-70B-Instruct", 128, [TEST_TOOL], None),
    ("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None, stream: CompletionMode):
    global server
    server.n_predict = n_predict
    server.jinja = True
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start()
    do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)


@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
    ("meetkai-functionary-medium-v3.2", 256, [], None),
    ("meetkai-functionary-medium-v3.2", 256, [TEST_TOOL], None),
    ("meetkai-functionary-medium-v3.2", 256, [PYTHON_TOOL], 'none'),
    ("meetkai-functionary-medium-v3.1", 256, [], None),
    ("meetkai-functionary-medium-v3.1", 256, [TEST_TOOL], None),
    ("meetkai-functionary-medium-v3.1", 256, [PYTHON_TOOL], 'none'),
    ("meta-llama-Llama-3.2-3B-Instruct", 256, [], None),
    ("meta-llama-Llama-3.2-3B-Instruct", 256, [TEST_TOOL], None),
    ("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None, stream: CompletionMode):
    global server
    server.n_predict = n_predict
    server.jinja = True
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)


@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("hf_repo,template_override", [
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),

    # ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    # ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),

    # ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
    # ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),

    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),

    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),

    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),

    # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
])
def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    global server
    n_predict = 512
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    apply_template_override(server, template_override)
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_weather(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)


def do_test_weather(server: ServerProcess, **kwargs):
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "messages": [
            {"role": "system", "content": "You are a chatbot that uses tools/functions. Don't overthink things."},
            {"role": "user", "content": "What is the weather in Istanbul?"},
        ],
        "tools": [WEATHER_TOOL],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
    # assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
    actual_arguments = json.loads(tool_call["function"]["arguments"])
    assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
    location = actual_arguments["location"]
    assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
    assert re.match('^Istanbul(( |, ?)(TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
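    # e.g. "Istanbul", "Istanbul, TR", "Istanbul, Turkey" and "Istanbul Türkiye"
    # all pass the match; any other city or trailing text fails it.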


@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
    (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
    (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
    (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
    (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
    (None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
    (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
    (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
    ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),

    # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
    # (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    global server
    server.jinja = True
    server.n_ctx = 8192 * 2
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    apply_template_override(server, template_override)
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_calc_result(server, result_override, n_predict, stream=stream == CompletionMode.STREAMED)


def do_test_calc_result(server: ServerProcess, result_override: str | None, n_predict: int, **kwargs):
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a tool-calling assistant. You express numerical values with at most two decimals."},
            {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"},
            {
                "role": "assistant",
                "content": None,
                "tool_calls": [
                    {
                        "id": "call_6789",
                        "type": "function",
                        "function": {
                            "name": "calculate",
                            "arguments": "{\"expression\":\"sin(30 * pi / 180)\"}"
                        }
                    }
                ]
            },
            {
                "role": "tool",
                "name": "calculate",
                "content": "0.55644242476",
                "tool_call_id": "call_6789"
            }
        ],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "calculate",
                    "description": "A calculator function that computes values of arithmetic expressions in the Python syntax",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "expression": {
                                "type": "string",
                                "description": "An arithmetic expression to compute the value of (Python syntax, assuming all floats)"
                            }
                        },
                        "required": ["expression"]
                    }
                }
            }
        ],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls is None, f'Expected no tool call in {choice["message"]}'
    content = choice["message"].get("content")
    assert content is not None, f'Expected content in {choice["message"]}'
    if result_override is not None:
        assert re.match(result_override, content), f'Expected {result_override}, got {content}'
    else:
        assert re.match('^[\\s\\S]*?((That\'s|\\bis) (approximately )?)?\\b0\\.(5\\b|56\\b|556)', content), \
            f'Expected something like "The y coordinate is 0.56.", got {content}'
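    # e.g. "The y coordinate is 0.5.", "**That's approximately 0.56**" and a
    # bare "0.556" (after any preamble) all satisfy the default pattern.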


@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("n_predict,reasoning_format,expect_reasoning_content,expect_content,hf_repo,template_override", [
    (128, 'deepseek', None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
    (128, None, None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
    (1024, 'deepseek', "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (1024, 'deepseek', "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
    # (1024, 'none', CompletionMode.NORMAL, None, "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # (128, 'deepseek', None, "^Okay, let me figure out the sum of 102 and 7[\\s\\S]*", "bartowski/Qwen_QwQ-32B-GGUF:Q4_K_M", None),
])
def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_reasoning_content: str | None, expect_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    global server
    server.reasoning_format = reasoning_format
    server.jinja = True
    server.n_ctx = 8192 * 2
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    apply_template_override(server, template_override)
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "user", "content": "What's the sum of 102 and 7?"},
        ],
        "stream": stream == CompletionMode.STREAMED,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'

    content = choice["message"].get("content")
    if expect_content is None:
        assert content in (None, ""), f'Expected no content in {choice["message"]}'
    else:
        assert re.match(expect_content, content), f'Expected {expect_content}, got {content}'

    reasoning_content = choice["message"].get("reasoning_content")
    if expect_reasoning_content is None:
        assert reasoning_content is None, f'Expected no reasoning content in {choice["message"]}'
    else:
        assert re.match(expect_reasoning_content, reasoning_content), f'Expected {expect_reasoning_content}, got {reasoning_content}'


@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("hf_repo,template_override", [
    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),

    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
    ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),

    # ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", None),

    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", None),

    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),

    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None),
    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"),
])
def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    global server
    n_predict = 512 # High because of DeepSeek R1
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    apply_template_override(server, template_override)
    server.start(timeout_seconds=TIMEOUT_START_SLOW)

    do_test_hello_world(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)


def do_test_hello_world(server: ServerProcess, **kwargs):
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "messages": [
            {"role": "system", "content": "You are a tool-calling agent."},
            {"role": "user", "content": "say hello world with python"},
        ],
        "tools": [PYTHON_TOOL],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
    # assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
    actual_arguments = json.loads(tool_call["function"]["arguments"])
    assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
    code = actual_arguments["code"]
    assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
    assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', re.sub(r'#.*\n?', '', code)), f'Expected hello world, got {code}'
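    # The check strips "#..." comment lines via the re.sub first, then requires
    # the remaining code to start with a print of a hello-world string: single
    # or double quotes, optional comma and "!", "Hello"/"World" optionally capitalized.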