1#!/usr/bin/env python3
  2import pytest
  3import base64
  4import requests
  5
  6from utils import *
  7
  8server: ServerProcess
  9
 10
 11def get_test_image_base64() -> str:
 12    """Get a test image in base64 format"""
 13    # Use the same test image as test_vision_api.py
 14    IMG_URL = "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/test/11_truck.png"
 15    response = requests.get(IMG_URL)
 16    response.raise_for_status()
 17    return base64.b64encode(response.content).decode("utf-8")
 18
 19@pytest.fixture(autouse=True)
 20def create_server():
 21    global server
 22    server = ServerPreset.tinyllama2()
 23    server.model_alias = "tinyllama-2-anthropic"
 24    server.server_port = 8082
 25    server.n_slots = 1
 26    server.n_ctx = 8192
 27    server.n_batch = 2048
 28
 29
 30@pytest.fixture
 31def vision_server():
 32    """Separate fixture for vision tests that require multimodal support"""
 33    global server
 34    server = ServerPreset.tinygemma3()
 35    server.offline = False  # Allow downloading the model
 36    server.model_alias = "tinygemma3-anthropic"
 37    server.server_port = 8083  # Different port to avoid conflicts
 38    server.n_slots = 1
 39    return server
 40
 41
 42# Basic message tests
 43
 44def test_anthropic_messages_basic():
 45    """Test basic Anthropic messages endpoint"""
 46    server.start()
 47
 48    res = server.make_request("POST", "/v1/messages", data={
 49        "model": "test",
 50        "max_tokens": 50,
 51        "messages": [
 52            {"role": "user", "content": "Say hello"}
 53        ]
 54    })
 55
 56    assert res.status_code == 200, f"Expected 200, got {res.status_code}"
 57    assert res.body["type"] == "message", f"Expected type 'message', got {res.body.get('type')}"
 58    assert res.body["role"] == "assistant", f"Expected role 'assistant', got {res.body.get('role')}"
 59    assert "content" in res.body, "Missing 'content' field"
 60    assert isinstance(res.body["content"], list), "Content should be an array"
 61    assert len(res.body["content"]) > 0, "Content array should not be empty"
 62    assert res.body["content"][0]["type"] == "text", "First content block should be text"
 63    assert "text" in res.body["content"][0], "Text content block missing 'text' field"
 64    assert res.body["stop_reason"] in ["end_turn", "max_tokens"], f"Invalid stop_reason: {res.body.get('stop_reason')}"
 65    assert "usage" in res.body, "Missing 'usage' field"
 66    assert "input_tokens" in res.body["usage"], "Missing usage.input_tokens"
 67    assert "output_tokens" in res.body["usage"], "Missing usage.output_tokens"
 68    assert isinstance(res.body["usage"]["input_tokens"], int), "input_tokens should be integer"
 69    assert isinstance(res.body["usage"]["output_tokens"], int), "output_tokens should be integer"
 70    assert res.body["usage"]["output_tokens"] > 0, "Should have generated some tokens"
 71    # Anthropic API should NOT include timings
 72    assert "timings" not in res.body, "Anthropic API should not include timings field"
 73
 74
 75def test_anthropic_messages_with_system():
 76    """Test messages with system prompt"""
 77    server.start()
 78
 79    res = server.make_request("POST", "/v1/messages", data={
 80        "model": "test",
 81        "max_tokens": 50,
 82        "system": "You are a helpful assistant.",
 83        "messages": [
 84            {"role": "user", "content": "Hello"}
 85        ]
 86    })
 87
 88    assert res.status_code == 200
 89    assert res.body["type"] == "message"
 90    assert len(res.body["content"]) > 0
 91
 92
 93def test_anthropic_messages_multipart_content():
 94    """Test messages with multipart content blocks"""
 95    server.start()
 96
 97    res = server.make_request("POST", "/v1/messages", data={
 98        "model": "test",
 99        "max_tokens": 50,
100        "messages": [
101            {
102                "role": "user",
103                "content": [
104                    {"type": "text", "text": "What is"},
105                    {"type": "text", "text": " the answer?"}
106                ]
107            }
108        ]
109    })
110
111    assert res.status_code == 200
112    assert res.body["type"] == "message"
113
114
def test_anthropic_messages_conversation():
    """Test multi-turn conversation"""
    server.start()

    # Two completed turns followed by a fresh user turn.
    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
        "messages": [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "How are you?"},
        ],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
131
132
133# Streaming tests
134
def test_anthropic_messages_streaming():
    """Test streaming messages.

    Consumes the whole event stream, then verifies the Anthropic streaming
    lifecycle: message_start, content_block_start, content_block_delta (one
    or more), content_block_stop, message_delta, message_stop — and that each
    event carries its expected payload.
    """
    server.start()

    res = server.make_stream_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 30,
        "messages": [
            {"role": "user", "content": "Say hello"}
        ],
        "stream": True
    })

    events = []
    for data in res:
        # Each event should have type and other fields
        assert "type" in data, f"Missing 'type' in event: {data}"
        events.append(data)

    # Verify event sequence: all six lifecycle event types must appear
    event_types = [e["type"] for e in events]
    assert "message_start" in event_types, "Missing message_start event"
    assert "content_block_start" in event_types, "Missing content_block_start event"
    assert "content_block_delta" in event_types, "Missing content_block_delta event"
    assert "content_block_stop" in event_types, "Missing content_block_stop event"
    assert "message_delta" in event_types, "Missing message_delta event"
    assert "message_stop" in event_types, "Missing message_stop event"

    # Check message_start structure: an empty message shell whose usage
    # already reports the prompt (input) token count
    message_start = next(e for e in events if e["type"] == "message_start")
    assert "message" in message_start, "message_start missing 'message' field"
    assert message_start["message"]["type"] == "message"
    assert message_start["message"]["role"] == "assistant"
    assert message_start["message"]["content"] == []
    assert "usage" in message_start["message"]
    assert message_start["message"]["usage"]["input_tokens"] > 0

    # Check content_block_start: the first block is text at index 0
    block_start = next(e for e in events if e["type"] == "content_block_start")
    assert "index" in block_start, "content_block_start missing 'index'"
    assert block_start["index"] == 0, "First content block should be at index 0"
    assert "content_block" in block_start
    assert block_start["content_block"]["type"] == "text"

    # Check content_block_delta: every delta is a text_delta with a payload
    deltas = [e for e in events if e["type"] == "content_block_delta"]
    assert len(deltas) > 0, "Should have at least one content_block_delta"
    for delta in deltas:
        assert "index" in delta
        assert "delta" in delta
        assert delta["delta"]["type"] == "text_delta"
        assert "text" in delta["delta"]

    # Check content_block_stop: closes the block opened at index 0
    block_stop = next(e for e in events if e["type"] == "content_block_stop")
    assert "index" in block_stop
    assert block_stop["index"] == 0

    # Check message_delta: carries the final stop_reason and output token count
    message_delta = next(e for e in events if e["type"] == "message_delta")
    assert "delta" in message_delta
    assert "stop_reason" in message_delta["delta"]
    assert message_delta["delta"]["stop_reason"] in ["end_turn", "max_tokens"]
    assert "usage" in message_delta
    assert message_delta["usage"]["output_tokens"] > 0

    # Check message_stop
    message_stop = next(e for e in events if e["type"] == "message_stop")
    # message_stop should NOT have timings for Anthropic API
    assert "timings" not in message_stop, "Anthropic streaming should not include timings"
205
206
207# Token counting tests
208
def test_anthropic_count_tokens():
    """Test token counting endpoint"""
    server.start()

    response = server.make_request("POST", "/v1/messages/count_tokens", data={
        "model": "test",
        "messages": [{"role": "user", "content": "Hello world"}],
    })
    body = response.body

    assert response.status_code == 200
    assert "input_tokens" in body
    assert isinstance(body["input_tokens"], int)
    assert body["input_tokens"] > 0
    # Should only have input_tokens, no other fields
    assert "output_tokens" not in body
226
227
def test_anthropic_count_tokens_with_system():
    """Test token counting with system prompt"""
    server.start()

    # The system prompt should be included in the reported input token count.
    response = server.make_request("POST", "/v1/messages/count_tokens", data={
        "model": "test",
        "system": "You are a helpful assistant.",
        "messages": [{"role": "user", "content": "Hello"}],
    })

    assert response.status_code == 200
    assert response.body["input_tokens"] > 0
242
243
def test_anthropic_count_tokens_no_max_tokens():
    """Test that count_tokens doesn't require max_tokens"""
    server.start()

    # max_tokens is NOT required for count_tokens
    response = server.make_request("POST", "/v1/messages/count_tokens", data={
        "model": "test",
        "messages": [{"role": "user", "content": "Hello"}],
    })

    assert response.status_code == 200
    assert "input_tokens" in response.body
258
259
260# Tool use tests
261
def test_anthropic_tool_use_basic():
    """Test basic tool use"""
    server.jinja = True
    server.start()

    weather_tool = {
        "name": "get_weather",
        "description": "Get the current weather in a location",
        "input_schema": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "City name"
                }
            },
            "required": ["location"]
        }
    }
    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 200,
        "tools": [weather_tool],
        "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
    blocks = response.body["content"]
    assert len(blocks) > 0

    # The model may or may not decide to call the tool; only validate the
    # tool_use block when it did.
    if any(b.get("type") == "tool_use" for b in blocks):
        assert response.body["stop_reason"] == "tool_use"

        tool_block = next(b for b in blocks if b.get("type") == "tool_use")
        assert "id" in tool_block
        assert "name" in tool_block
        assert tool_block["name"] == "get_weather"
        assert "input" in tool_block
        assert isinstance(tool_block["input"], dict)
307
308
def test_anthropic_tool_result():
    """Test sending tool results back

    This test verifies that tool_result blocks are properly converted to
    role="tool" messages internally. Without proper conversion, this would
    fail with a 500 error: "unsupported content[].type" because tool_result
    blocks would remain in the user message content array.
    """
    server.jinja = True
    server.start()

    res = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 100,
        "messages": [
            {"role": "user", "content": "What's the weather?"},
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "tool_use",
                        "id": "test123",
                        "name": "get_weather",
                        "input": {"location": "Paris"}
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "tool_result",
                        "tool_use_id": "test123",
                        # fixed mojibake: degree sign was mis-encoded as "ยฐ"
                        "content": "The weather is sunny, 25°C"
                    }
                ]
            }
        ]
    })

    # This would be 500 with the old bug where tool_result blocks weren't converted
    assert res.status_code == 200
    assert res.body["type"] == "message"
    # Model should respond to the tool result
    assert len(res.body["content"]) > 0
    assert res.body["content"][0]["type"] == "text"
355
356
def test_anthropic_tool_result_with_text():
    """Test tool result mixed with text content

    This tests the edge case where a user message contains both text and
    tool_result blocks. The server must properly split these into separate
    messages: a user message with text, followed by tool messages.
    Without proper handling, this would fail with 500: "unsupported content[].type"
    """
    server.jinja = True
    server.start()

    res = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 100,
        "messages": [
            {"role": "user", "content": "What's the weather?"},
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "tool_use",
                        "id": "tool_1",
                        "name": "get_weather",
                        "input": {"location": "Paris"}
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Here are the results:"},
                    {
                        "type": "tool_result",
                        "tool_use_id": "tool_1",
                        # fixed mojibake: degree sign was mis-encoded as "ยฐ"
                        "content": "Sunny, 25°C"
                    }
                ]
            }
        ]
    })

    assert res.status_code == 200
    assert res.body["type"] == "message"
    assert len(res.body["content"]) > 0
401
402
def test_anthropic_tool_result_error():
    """Test tool result with error flag"""
    server.jinja = True
    server.start()

    # An assistant tool call followed by a failed tool_result (is_error=True).
    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 100,
        "messages": [
            {"role": "user", "content": "Get the weather"},
            {
                "role": "assistant",
                "content": [{
                    "type": "tool_use",
                    "id": "test123",
                    "name": "get_weather",
                    "input": {"location": "InvalidCity"},
                }],
            },
            {
                "role": "user",
                "content": [{
                    "type": "tool_result",
                    "tool_use_id": "test123",
                    "is_error": True,
                    "content": "City not found",
                }],
            },
        ],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
440
441
def test_anthropic_tool_streaming():
    """Test streaming with tool use"""
    server.jinja = True
    server.start()

    stream = server.make_stream_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 200,
        "stream": True,
        "tools": [{
            "name": "calculator",
            "description": "Calculate math",
            "input_schema": {
                "type": "object",
                "properties": {
                    "expression": {"type": "string"}
                },
                "required": ["expression"]
            }
        }],
        "messages": [{"role": "user", "content": "Calculate 2+2"}],
    })

    events = list(stream)
    event_types = [e["type"] for e in events]

    # The envelope events must always be present.
    assert "message_start" in event_types
    assert "message_stop" in event_types

    def _is_tool_start(event):
        return (event.get("type") == "content_block_start"
                and event.get("content_block", {}).get("type") == "tool_use")

    # Validate tool streaming only if the model actually invoked the tool.
    tool_starts = [e for e in events if _is_tool_start(e)]
    if tool_starts:
        assert len(tool_starts) > 0, "Should have tool_use content_block_start"

        # Check index is correct (should be 0 if no text, 1 if there's text)
        first_tool_start = tool_starts[0]
        assert "index" in first_tool_start
        assert first_tool_start["content_block"]["type"] == "tool_use"
        assert "name" in first_tool_start["content_block"]
493
494
495# Vision/multimodal tests
496
def test_anthropic_vision_format_accepted():
    """Test that Anthropic vision format is accepted (format validation only)"""
    server.start()

    # Small 1x1 red PNG image in base64
    tiny_png_b64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg=="

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 10,
        "messages": [{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": tiny_png_b64,
                    },
                },
                {"type": "text", "text": "What is this?"},
            ],
        }],
    })

    # Server accepts the format but tinyllama doesn't support images
    # So it should return 500 with clear error message about missing mmproj
    assert response.status_code == 500
    error_message = response.body.get("error", {}).get("message", "")
    assert "image input is not supported" in error_message.lower()
532
533
def test_anthropic_vision_base64_with_multimodal_model(vision_server):
    """Test vision with base64 image using Anthropic format with multimodal model"""
    global server
    server = vision_server
    server.start()

    # Fetch the shared test image and send it as a base64 image source block.
    image_base64 = get_test_image_base64()

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 10,
        "messages": [{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": image_base64,
                    },
                },
                {"type": "text", "text": "What is this:\n"},
            ],
        }],
    })

    assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.body}"
    body = response.body
    assert body["type"] == "message"
    assert len(body["content"]) > 0
    assert body["content"][0]["type"] == "text"
    # The model should generate some response about the image
    assert len(body["content"][0]["text"]) > 0
573
574
575# Parameter tests
576
def test_anthropic_stop_sequences():
    """Test stop_sequences parameter"""
    server.start()

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 100,
        "stop_sequences": ["\n", "END"],
        "messages": [{"role": "user", "content": "Count to 10"}],
    })

    # Only checks the parameter is accepted; whether a stop fires is model-dependent.
    assert response.status_code == 200
    assert response.body["type"] == "message"
592
593
def test_anthropic_temperature():
    """Test temperature parameter"""
    server.start()

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
        "temperature": 0.5,
        "messages": [{"role": "user", "content": "Hello"}],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
609
610
def test_anthropic_top_p():
    """Test top_p parameter"""
    server.start()

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
        "top_p": 0.9,
        "messages": [{"role": "user", "content": "Hello"}],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
626
627
def test_anthropic_top_k():
    """Test top_k parameter (llama.cpp specific)"""
    server.start()

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
        "top_k": 40,
        "messages": [{"role": "user", "content": "Hello"}],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
643
644
645# Error handling tests
646
def test_anthropic_missing_messages():
    """Test error when messages are missing"""
    server.start()

    # The "messages" field is deliberately omitted from the request body.
    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
    })

    # Should return an error (400 or 500)
    assert response.status_code >= 400
659
660
def test_anthropic_empty_messages():
    """Test permissive handling of empty messages array"""
    server.start()

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
        "messages": [],
    })

    # Server is permissive and accepts empty messages (provides defaults)
    # This matches the permissive validation design choice
    assert response.status_code == 200
    assert response.body["type"] == "message"
675
676
677# Content block index tests
678
def test_anthropic_streaming_content_block_indices():
    """Test that content block indices are correct in streaming"""
    server.jinja = True
    server.start()

    # Request that might produce both text and tool use
    stream = server.make_stream_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 400,
        "stream": True,
        "tools": [{
            "name": "test_tool",
            "description": "A test tool",
            "input_schema": {
                "type": "object",
                "properties": {
                    "param": {"type": "string"}
                },
                "required": ["param"]
            }
        }],
        "messages": [{"role": "user", "content": "Use the test tool"}],
    })

    events = list(stream)

    # When several blocks are streamed, their start indices must count up from 0.
    block_starts = [e for e in events if e.get("type") == "content_block_start"]
    if len(block_starts) > 1:
        indices = [e["index"] for e in block_starts]
        expected_indices = list(range(len(block_starts)))
        assert indices == expected_indices, f"Expected indices {expected_indices}, got {indices}"

    # Every opened block must be closed with a matching index.
    block_stops = [e for e in events if e.get("type") == "content_block_stop"]
    start_indices = {e["index"] for e in block_starts}
    stop_indices = {e["index"] for e in block_stops}
    assert start_indices == stop_indices, "content_block_stop indices should match content_block_start indices"
722
723
724# Extended features tests
725
def test_anthropic_thinking():
    """Test extended thinking parameter"""
    server.jinja = True
    server.start()

    # Only checks the parameter is accepted; tinyllama produces no reasoning output.
    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 100,
        "thinking": {
            "type": "enabled",
            "budget_tokens": 50,
        },
        "messages": [{"role": "user", "content": "What is 2+2?"}],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
745
746
def test_anthropic_metadata():
    """Test metadata parameter"""
    server.start()

    response = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
        "metadata": {
            "user_id": "test_user_123",
        },
        "messages": [{"role": "user", "content": "Hello"}],
    })

    assert response.status_code == 200
    assert response.body["type"] == "message"
764
765
766# Compatibility tests
767
def test_anthropic_vs_openai_different_response_format():
    """Verify Anthropic format is different from OpenAI format"""
    server.start()

    # Make OpenAI request
    openai_res = server.make_request("POST", "/v1/chat/completions", data={
        "model": "test",
        "max_tokens": 50,
        "messages": [{"role": "user", "content": "Hello"}],
    })

    # Make Anthropic request
    anthropic_res = server.make_request("POST", "/v1/messages", data={
        "model": "test",
        "max_tokens": 50,
        "messages": [{"role": "user", "content": "Hello"}],
    })

    assert openai_res.status_code == 200
    assert anthropic_res.status_code == 200

    oai_body = openai_res.body
    ant_body = anthropic_res.body

    # OpenAI has "object", Anthropic has "type"
    assert "object" in oai_body
    assert "type" in ant_body
    assert oai_body["object"] == "chat.completion"
    assert ant_body["type"] == "message"

    # OpenAI has "choices", Anthropic has "content"
    assert "choices" in oai_body
    assert "content" in ant_body

    # Different usage field names
    assert "prompt_tokens" in oai_body["usage"]
    assert "input_tokens" in ant_body["usage"]
    assert "completion_tokens" in oai_body["usage"]
    assert "output_tokens" in ant_body["usage"]
808
809
810# Extended thinking tests with reasoning models
811
@pytest.mark.slow
@pytest.mark.parametrize("stream", [False, True])
def test_anthropic_thinking_with_reasoning_model(stream):
    """Test that thinking content blocks are properly returned for reasoning models.

    Runs once non-streaming and once streaming against a real DeepSeek-R1
    distill model, so the server produces reasoning output that the Anthropic
    adapter must surface as thinking content blocks (with signatures).
    """
    global server
    # Replaces the autouse fixture's server with a dedicated reasoning model
    # on its own port.
    server = ServerProcess()
    server.model_hf_repo = "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF"
    server.model_hf_file = "DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf"
    server.reasoning_format = "deepseek"
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = 1024
    server.server_port = 8084
    server.start(timeout_seconds=600)  # large model needs time to download

    if stream:
        res = server.make_stream_request("POST", "/v1/messages", data={
            "model": "test",
            "max_tokens": 1024,
            "thinking": {
                "type": "enabled",
                "budget_tokens": 500
            },
            "messages": [
                {"role": "user", "content": "What is 2+2?"}
            ],
            "stream": True
        })

        events = list(res)

        # should have thinking content block events
        thinking_starts = [e for e in events if
            e.get("type") == "content_block_start" and
            e.get("content_block", {}).get("type") == "thinking"]
        assert len(thinking_starts) > 0, "Should have thinking content_block_start event"
        assert thinking_starts[0]["index"] == 0, "Thinking block should be at index 0"

        # should have thinking_delta events
        thinking_deltas = [e for e in events if
            e.get("type") == "content_block_delta" and
            e.get("delta", {}).get("type") == "thinking_delta"]
        assert len(thinking_deltas) > 0, "Should have thinking_delta events"

        # should have signature_delta event before thinking block closes (Anthropic API requirement)
        signature_deltas = [e for e in events if
            e.get("type") == "content_block_delta" and
            e.get("delta", {}).get("type") == "signature_delta"]
        assert len(signature_deltas) > 0, "Should have signature_delta event for thinking block"

        # should have text block after thinking
        text_starts = [e for e in events if
            e.get("type") == "content_block_start" and
            e.get("content_block", {}).get("type") == "text"]
        assert len(text_starts) > 0, "Should have text content_block_start event"
        assert text_starts[0]["index"] == 1, "Text block should be at index 1 (after thinking)"
    else:
        res = server.make_request("POST", "/v1/messages", data={
            "model": "test",
            "max_tokens": 1024,
            "thinking": {
                "type": "enabled",
                "budget_tokens": 500
            },
            "messages": [
                {"role": "user", "content": "What is 2+2?"}
            ]
        })

        assert res.status_code == 200
        assert res.body["type"] == "message"

        content = res.body["content"]
        assert len(content) >= 2, "Should have at least thinking and text blocks"

        # first block should be thinking
        thinking_blocks = [b for b in content if b.get("type") == "thinking"]
        assert len(thinking_blocks) > 0, "Should have thinking content block"
        assert "thinking" in thinking_blocks[0], "Thinking block should have 'thinking' field"
        assert len(thinking_blocks[0]["thinking"]) > 0, "Thinking content should not be empty"
        assert "signature" in thinking_blocks[0], "Thinking block should have 'signature' field (Anthropic API requirement)"

        # should also have text block
        text_blocks = [b for b in content if b.get("type") == "text"]
        assert len(text_blocks) > 0, "Should have text content block"