1#!/usr/bin/env python
  2import pytest
  3
  4# ensure grandparent path is in sys.path
  5from pathlib import Path
  6import sys
  7path = Path(__file__).resolve().parents[1]
  8sys.path.insert(0, str(path))
  9
 10from utils import *
 11from enum import Enum
 12
# Module-level handle to the llama-server process under test; the autouse
# fixture below re-creates it before every test.
server: ServerProcess

TIMEOUT_START_SLOW = 15 * 60 # this is needed for real model tests
TIMEOUT_HTTP_REQUEST = 60  # seconds allowed for a single chat-completion HTTP request
 17
 18@pytest.fixture(autouse=True)
 19def create_server():
 20    global server
 21    server = ServerPreset.tinyllama2()
 22    server.model_alias = "tinyllama-2-tool-call"
 23    server.server_port = 8081
 24    server.n_slots = 1
 25    server.n_ctx = 8192
 26    server.n_batch = 2048
 27
 28class CompletionMode(Enum):
 29    NORMAL = "normal"
 30    STREAMED = "streamed"
 31
 32TEST_TOOL = {
 33    "type":"function",
 34    "function": {
 35        "name": "test",
 36        "description": "",
 37        "parameters": {
 38            "type": "object",
 39            "properties": {
 40                "success": {"type": "boolean", "const": True},
 41            },
 42            "required": ["success"]
 43        }
 44    }
 45}
 46
 47PYTHON_TOOL = {
 48    "type": "function",
 49    "function": {
 50        "name": "python",
 51        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
 52        "parameters": {
 53            "type": "object",
 54            "properties": {
 55                "code": {
 56                    "type": "string",
 57                    "description": "The code to run in the ipython interpreter."
 58                }
 59            },
 60            "required": ["code"]
 61        }
 62    }
 63}
 64
 65WEATHER_TOOL = {
 66  "type":"function",
 67  "function":{
 68    "name":"get_current_weather",
 69    "description":"Get the current weather in a given location",
 70    "parameters":{
 71      "type":"object",
 72      "properties":{
 73        "location":{
 74          "type":"string",
 75          "description":"The city and country/state, e.g. 'San Francisco, CA', or 'Paris, France'"
 76        }
 77      },
 78      "required":["location"]
 79    }
 80  }
 81}
 82
 83def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
 84    body = server.make_any_request("POST", "/v1/chat/completions", data={
 85        "max_tokens": n_predict,
 86        "messages": [
 87            {"role": "system", "content": "You are a coding assistant."},
 88            {"role": "user", "content": "Write an example"},
 89        ],
 90        "tool_choice": "required",
 91        "tools": [tool],
 92        "parallel_tool_calls": False,
 93        **kwargs,
 94    })
 95    # assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
 96    choice = body["choices"][0]
 97    tool_calls = choice["message"].get("tool_calls")
 98    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
 99    tool_call = tool_calls[0]
100    assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
101    # assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}'
102    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
103    assert expected_function_name == tool_call["function"]["name"]
104    actual_arguments = tool_call["function"]["arguments"]
105    assert isinstance(actual_arguments, str)
106    if argument_key is not None:
107        actual_arguments = json.loads(actual_arguments)
108        assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
109
110
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,tool,argument_key", [
    # NOTE: the previous list contained each row twice verbatim (a leftover of
    # a removed boolean parametrize column); duplicates removed so every
    # combination runs once per stream mode.
    ("google-gemma-2-2b-it",                          TEST_TOOL,            "success"),
    ("meta-llama-Llama-3.3-70B-Instruct",             TEST_TOOL,            "success"),
    ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
])
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
    """Fast (tiny model) check that `tool_choice: required` forces exactly one
    call to the given tool, with deterministic sampling (temp 0, top_k 1)."""
    global server
    n_predict = 1024
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start()
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0)
129
130
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,tool,argument_key", [
    ("meta-llama-Llama-3.1-8B-Instruct",              TEST_TOOL,            "success"),
    ("meta-llama-Llama-3.1-8B-Instruct",              PYTHON_TOOL,          "code"),

    ("meetkai-functionary-medium-v3.1",               TEST_TOOL,            "success"),
    ("meetkai-functionary-medium-v3.1",               PYTHON_TOOL,          "code"),

    ("meetkai-functionary-medium-v3.2",               TEST_TOOL,            "success"),
    # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own.
    # ("meetkai-functionary-medium-v3.2",               PYTHON_TOOL,          "code"),

    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL,            "success"),
    ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL,          "code"),

    ("meta-llama-Llama-3.2-3B-Instruct",              TEST_TOOL,            "success"),
    ("meta-llama-Llama-3.2-3B-Instruct",              PYTHON_TOOL,          "code"),

    ("mistralai-Mistral-Nemo-Instruct-2407",          TEST_TOOL,            "success"),
    ("mistralai-Mistral-Nemo-Instruct-2407",          PYTHON_TOOL,          "code"),

    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   TEST_TOOL,            "success"),
    ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use",   PYTHON_TOOL,          "code"),

    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      TEST_TOOL,            "success"),
    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      PYTHON_TOOL,          "code"),

    ("fireworks-ai-llama-3-firefunction-v2",          TEST_TOOL,            "success"),
    # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),

])
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode):
    """Slow variant of the required-tool test covering many chat templates.

    Uses the tiny preset model with each template's .jinja file; default
    sampling (no temperature/top_k pinning, unlike the fast variant).
    """
    global server
    n_predict = 512
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED)
173
174
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [
    (TEST_TOOL,    "success",  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL,    "success",  "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    (PYTHON_TOOL,  "code",     "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    (PYTHON_TOOL,  "code",     "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   "chatml"),

    # (TEST_TOOL,    "success",  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    # (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    # (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL,    "success",  "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       ("meetkai/functionary-medium-v3.2", None)),
    (PYTHON_TOOL,  "code",     "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       ("meetkai/functionary-medium-v3.2", None)),
    (PYTHON_TOOL,  "code",     "bartowski/functionary-small-v3.2-GGUF:Q4_K_M",       "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    """Slow end-to-end test: with a real GGUF model pulled from `hf_repo` and
    tool_choice=required, the response must contain exactly one call to `tool`
    whose JSON-encoded arguments include `argument_key`.

    template_override meanings:
      - None: use the chat template embedded in the GGUF;
      - str (e.g. "chatml"): a named template passed via server.chat_template;
      - (repo, variant) tuple: a pre-downloaded .jinja under models/templates/.
    """
    global server
    n_predict = 512
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        # template files are named "<repo-with-dashes>[-<variant>].jinja"
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    # Deterministic sampling (temp 0, top_k 1) so the expected call is reproducible.
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "user", "content": "Write an example"},
        ],
        "tool_choice": "required",
        "tools": [tool],
        "parallel_tool_calls": False,
        "stream": stream == CompletionMode.STREAMED,
        "temperature": 0.0,
        "top_k": 1,
        "top_p": 1.0,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    # code_interpreter tools are surfaced as a call to "python"
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
    assert isinstance(actual_arguments, str)
    if argument_key is not None:
        actual_arguments = json.loads(actual_arguments)
        assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"
270
271
def do_test_completion_without_tool_call(server: ServerProcess, n_predict: int, tools: list[dict], tool_choice: str | None, **kwargs):
    """Request a completion with the given tools/tool_choice and assert that
    the model produced NO tool call (e.g. no tools, or tool_choice='none')."""
    payload = {
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "user", "content": "say hello world with python"},
        ],
        # an empty tool list is sent as null, matching the OpenAI API shape
        "tools": tools if tools else None,
        "tool_choice": tool_choice,
    }
    payload.update(kwargs)
    body = server.make_any_request("POST", "/v1/chat/completions", data=payload, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'
285
286
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
    ("meta-llama-Llama-3.3-70B-Instruct",         128, [],            None),
    ("meta-llama-Llama-3.3-70B-Instruct",         128, [TEST_TOOL],   None),
    ("meta-llama-Llama-3.3-70B-Instruct",         128, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None, stream: CompletionMode):
    """Fast check that no tool call is emitted when no tools are offered or
    tool_choice is 'none'."""
    global server
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start()
    do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)
300
301
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("template_name,n_predict,tools,tool_choice", [
    ("meetkai-functionary-medium-v3.2",               256, [],            None),
    ("meetkai-functionary-medium-v3.2",               256, [TEST_TOOL],   None),
    ("meetkai-functionary-medium-v3.2",               256, [PYTHON_TOOL], 'none'),
    ("meetkai-functionary-medium-v3.1",               256, [],            None),
    ("meetkai-functionary-medium-v3.1",               256, [TEST_TOOL],   None),
    ("meetkai-functionary-medium-v3.1",               256, [PYTHON_TOOL], 'none'),
    ("meta-llama-Llama-3.2-3B-Instruct",              256, [],            None),
    ("meta-llama-Llama-3.2-3B-Instruct",              256, [TEST_TOOL],   None),
    ("meta-llama-Llama-3.2-3B-Instruct",              256, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None, stream: CompletionMode):
    """Slow variant of the no-tool-call test across additional chat templates."""
    global server
    server.n_predict = n_predict
    server.jinja = True
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_completion_without_tool_call(server, n_predict, tools, tool_choice, stream=stream == CompletionMode.STREAMED)
322
323
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("hf_repo,template_override", [
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),

    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),

    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),

    # ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    # ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),

    # ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai/functionary-medium-v3.2", None)),
    # ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),

    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L",   ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),

    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),

    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),

    # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
])
def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    """Slow end-to-end test: a real model asked about the weather in Istanbul
    must call get_current_weather with a plausible `location` argument.

    template_override: None (embedded template), a named template string, or a
    (repo, variant) tuple selecting a local .jinja file.
    """
    global server
    n_predict = 512
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        # template files are named "<repo-with-dashes>[-<variant>].jinja"
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start()
    do_test_weather(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)
382
383
def do_test_weather(server: ServerProcess, **kwargs):
    """Ask for the weather in Istanbul and assert that the model makes exactly
    one get_current_weather call whose `location` argument is an accepted
    spelling of Istanbul.

    Fix: the accepted-location regex previously contained a mojibake'd
    'Tรผrkiye' (UTF-8 'ü' mis-decoded), which a correctly-encoded
    'Istanbul, Türkiye' response could never match.
    """
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "messages": [
            {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
            {"role": "user", "content": "What is the weather in Istanbul?"},
        ],
        "tools": [WEATHER_TOOL],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
    actual_arguments = json.loads(tool_call["function"]["arguments"])
    assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
    location = actual_arguments["location"]
    assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
    # Accept "Istanbul" optionally followed by a country/region qualifier.
    assert re.match('^Istanbul(( |, ?)(TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
405
406
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
    (None,                                           128,  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       "chatml"),
    (None,                                           128,  "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
    (None,                                           128,  "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
    (None,                                           128,  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",         "chatml"),
    (None,                                           128,  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",     ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (None,                                           128,  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",       ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (None,                                           128,  "bartowski/functionary-small-v3.2-GGUF:Q8_0",        ("meetkai/functionary-medium-v3.2", None)),
    (None,                                           128,  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M",  None),
    (None,                                           128,  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M",  "chatml"),
    (None,                                           128,  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
    ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),

    # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
    # (None,                                           128,  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M",  None),
    # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    """Slow test: given a completed tool-call round trip in the conversation,
    the model must quote the tool's numeric result in its final answer.

    result_override: optional regex replacing the default expected-answer
    pattern used by do_test_calc_result.
    """
    global server
    server.jinja = True
    server.n_ctx = 8192 * 2
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        # template files are named "<repo-with-dashes>[-<variant>].jinja"
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_START_SLOW)
    do_test_calc_result(server, result_override, n_predict, stream=stream == CompletionMode.STREAMED)
441
442
def do_test_calc_result(server: ServerProcess, result_override: str | None, n_predict: int, **kwargs):
    """Replay a finished tool-call exchange (calculate -> "0.55644242476") and
    assert the model answers with content only — no further tool calls — that
    quotes the tool result rounded to ~two decimals.

    result_override: optional regex replacing the default expected-answer
    pattern. Note the tool "result" is deliberately NOT sin(30°)=0.5, so a
    model that computes instead of using the tool output fails the check.
    """
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a tools-calling assistant. You express numerical values with at most two decimals."},
            {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"},
            {
                "role": "assistant",
                "content": None,
                "tool_calls": [
                    {
                        "id": "call_6789",
                        "type": "function",
                        "function": {
                            "name": "calculate",
                            "arguments": "{\"expression\":\"sin(30 * pi / 180)\"}"
                        }
                    }
                ]
            },
            {
                "role": "tool",
                "name": "calculate",
                "content": "0.55644242476",
                "tool_call_id": "call_6789"
            }
        ],
        "tools": [
            {
                "type":"function",
                "function":{
                    "name":"calculate",
                    "description":"A calculator function that computes values of arithmetic expressions in the Python syntax",
                    "parameters":{
                        "type":"object",
                        "properties":{
                            "expression":{
                            "type":"string",
                            # typo fix: description previously read "Python syntad"
                            "description":"An arithmetic expression to compute the value of (Python syntax, assuming all floats)"
                            }
                        },
                        "required":["expression"]
                    }
                }
            }
        ],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    # The tool result is already in the conversation: expect a plain answer.
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls is None, f'Expected no tool call in {choice["message"]}'
    content = choice["message"].get("content")
    assert content is not None, f'Expected content in {choice["message"]}'
    if result_override is not None:
        assert re.match(result_override, content), f'Expected {result_override}, got {content}'
    else:
        assert re.match('^[\\s\\S]*?((That\'s|\\bis) (approximately )?)?\\b0\\.(5\\b|56\\b|556)', content), \
            f'Expected something like "The y coordinate is 0.56.", got {content}'
501
502
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("n_predict,reasoning_format,expect_reasoning_content,expect_content,hf_repo,template_override", [
    (128, 'deepseek',   None, "^The sum of 102 and 7 is 109[\\s\\S]*",                                       "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
    (128,  None,        None, "^The sum of 102 and 7 is 109[\\s\\S]*",                                       "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
    (1024, 'deepseek',  "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (1024, 'deepseek',  "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*",                                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
    # (1024, 'none',      CompletionMode.NORMAL,   None, "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*",                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # (128,  'deepseek',  None, "^Okay, let me figure out the sum of 102 and 7[\\s\\S]*",                      "bartowski/Qwen_QwQ-32B-GGUF:Q4_K_M",                None),
])
def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    """Slow test of reasoning extraction: checks that `content` and (for
    reasoning models with reasoning_format='deepseek') `reasoning_content`
    match the expected regexes, and that no tool call is made.

    expect_content / expect_reasoning_content: regex to match, or None meaning
    the field must be absent/empty.
    """
    global server
    server.reasoning_format = reasoning_format
    server.jinja = True
    server.n_ctx = 8192 * 2
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        # template files are named "<repo-with-dashes>[-<variant>].jinja"
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start()
    body = server.make_any_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "user", "content": "What's the sum of 102 and 7?"},
        ],
        "stream": stream == CompletionMode.STREAMED,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    choice = body["choices"][0]
    assert choice["message"].get("tool_calls") is None, f'Expected no tool call in {choice["message"]}'

    content = choice["message"].get("content")
    if expect_content is None:
        assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    else:
        assert re.match(expect_content, content), f'Expected {expect_content}, got {content}'

    reasoning_content = choice["message"].get("reasoning_content")
    if expect_reasoning_content is None:
        assert reasoning_content is None, f'Expected no reasoning content in {choice["message"]}'
    else:
        assert re.match(expect_reasoning_content, reasoning_content), f'Expected {expect_reasoning_content}, got {reasoning_content}'
549
550
@pytest.mark.slow
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("hf_repo,template_override", [
    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),

    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),

    # ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      None),

    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      None),

    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),

    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),

    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),

    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              "chatml"),
])
def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | None] | None, stream: CompletionMode):
    """Slow end-to-end test: download a real model from HF and check that it
    emits a correct `python` tool call for a "hello world" prompt.

    Parametrized over (hf_repo, template_override) pairs and over normal vs
    streamed completion. `template_override` may be:
      - None: use the model's embedded chat template,
      - a str: a named built-in template (e.g. "chatml"),
      - a (repo, variant) tuple: load a template file from models/templates.
    """
    # NOTE: no `global server` needed — we only mutate attributes of the
    # module-level server object created by the autouse fixture, never rebind it.
    n_predict = 512  # High because of DeepSeek R1
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None  # let the repo spec (":Q4_K_M" etc.) pick the file
    if isinstance(template_override, tuple):
        (template_hf_repo, template_variant) = template_override
        server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja"
        assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template."
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_START_SLOW)  # real model download/load can be slow

    do_test_hello_world(server, stream=stream == CompletionMode.STREAMED, max_tokens=n_predict)
603
604
def do_test_hello_world(server: ServerProcess, **kwargs):
    """Drive a chat completion that should trigger exactly one `python` tool
    call printing "hello world", and validate the call's name and arguments.

    Extra keyword arguments (e.g. stream=..., max_tokens=...) are merged into
    the request payload.
    """
    response = server.make_any_request("POST", "/v1/chat/completions", data={
        "messages": [
            {"role": "system", "content": "You are a tool-calling agent."},
            {"role": "user", "content": "say hello world with python"},
        ],
        "tools": [PYTHON_TOOL],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)

    message = response["choices"][0]["message"]
    calls = message.get("tool_calls")
    assert calls and len(calls) == 1, f'Expected 1 tool call in {message}'
    call = calls[0]
    # Intentionally not enforced (model behavior varies):
    # assert message.get("content") in (None, ""), f'Expected no content in {message}'
    # assert len(call.get("id", "")) > 0, f'Expected non empty tool call id in {call}'
    assert call["function"]["name"] == PYTHON_TOOL["function"]["name"]

    actual_arguments = json.loads(call["function"]["arguments"])
    assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
    code = actual_arguments["code"]
    assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
    # Strip comment lines before matching so e.g. "# greet\nprint('hello world')" passes.
    assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', re.sub(r'#.*\n?', '', code)), f'Expected hello world, got {code}'