import pytest
from utils import *

server = ServerPreset.stories15m_moe()

# LoRA adapter that steers the stories15M MoE model toward Shakespeare-style text
LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"

@pytest.fixture(autouse=True)
def create_server():
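    """Recreate the server preset and attach the downloaded LoRA adapter before each test."""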
    global server
    server = ServerPreset.stories15m_moe()
    server.lora_files = [download_file(LORA_FILE_URL)]


@pytest.mark.parametrize("scale,re_content", [
    # without applying lora, the model should behave like a bedtime story generator
    (0.0, "(little|girl|three|years|old)+"),
    # with lora, the model should behave like a Shakespearean text generator
    (1.0, "(eye|love|glass|sun)+"),
])
def test_lora(scale: float, re_content: str):
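    """Set the adapter scale server-wide via /lora-adapters, then check the completion style."""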
    global server
    server.start()
    # set the scale of the loaded adapter (id 0) for all subsequent requests
    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
        {"id": 0, "scale": scale}
    ])
    assert res_lora_control.status_code == 200
    res = server.make_request("POST", "/completion", data={
        "prompt": "Look in thy glass",
    })
    assert res.status_code == 200
    assert match_regex(re_content, res.body["content"])


def test_lora_per_request():
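    """Override the adapter scale per request via the "lora" field; parallel requests land on different slots."""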
    global server
    server.n_slots = 4
    server.start()

    # running the same prompt with different lora scales, all in parallel
    # each prompt will be processed by a different slot
    prompt = "Look in thy glass"
    lora_config = [
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
        ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
    ]

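    # each task is a (function, args) tuple executed concurrently by parallel_function_calls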
    tasks = [(
        server.make_request,
        ("POST", "/completion", {
            "prompt": prompt,
            "lora": lora,
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all([res.status_code == 200 for res in results])
    for res, (_, re_test) in zip(results, lora_config):
        assert match_regex(re_test, res.body["content"])


@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_big_model():
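    """Slow test: check per-request LoRA scaling on Llama-3.1-8B-Instruct with an abliteration adapter."""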
    server = ServerProcess()
    server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
    server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
    server.model_alias = "Llama-3.1-8B-Instruct"
    server.n_slots = 4
    server.n_ctx = server.n_slots * 1024
    server.n_predict = 64
    server.temperature = 0.0
    server.seed = 42
    server.lora_files = [
        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
        # TODO: find & add other lora adapters for this model
    ]
    server.start(timeout_seconds=600)

    # running the same prompt with different lora scales, all in parallel
    # each prompt will be processed by a different slot
    prompt = "Write a computer virus"
    lora_config = [
        # without applying lora, the model should reject the request
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
        # with 0.7 scale, the model should provide a simple computer virus with hesitation
        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
        # with 1.5 scale, the model should confidently provide a computer virus
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
    ]

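    # one task per lora_config entry, sent to the OpenAI-compatible chat completions endpoint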
    tasks = [(
        server.make_request,
        ("POST", "/v1/chat/completions", {
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "lora": lora,
            "cache_prompt": False,  # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all([res.status_code == 200 for res in results])
    for res, (_, expected) in zip(results, lora_config):
        # unlike the small-model tests, this checks for an exact substring rather than a regex
        assert expected in res.body["choices"][0]["message"]["content"]