import pytest
from utils import *

server = ServerPreset.stories15m_moe()

LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"

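# autouse fixture: each test gets a fresh stories15M_MOE server with the
# Shakespeare LoRA adapter downloaded and attached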
@pytest.fixture(autouse=True)
def create_server():
    global server
    server = ServerPreset.stories15m_moe()
    server.lora_files = [download_file(LORA_FILE_URL)]


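# set the adapter scale server-wide via POST /lora-adapters, then check that the
# completion output matches the regex expected for that scale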
@pytest.mark.parametrize("scale,re_content", [
    # without applying lora, the model should behave like a bedtime story generator
    (0.0, "(little|girl|three|years|old)+"),
    # with lora, the model should behave like a Shakespearean text generator
    (1.0, "(eye|love|glass|sun)+"),
])
def test_lora(scale: float, re_content: str):
    global server
    server.start()
    res_lora_control = server.make_request("POST", "/lora-adapters", data=[
        {"id": 0, "scale": scale}
    ])
    assert res_lora_control.status_code == 200
    res = server.make_request("POST", "/completion", data={
        "prompt": "Look in thy glass",
    })
    assert res.status_code == 200
    assert match_regex(re_content, res.body["content"])


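# per-request LoRA: each request carries its own "lora" list instead of changing the
# server-wide scale, so different scales can be served concurrently on separate slots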
def test_lora_per_request():
    global server
    server.n_slots = 4
    server.start()

    # running the same prompt with different lora scales, all in parallel
    # each prompt will be processed by a different slot
    prompt = "Look in thy glass"
    lora_config = [
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ),
        ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ),
        ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
        ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ),
    ]

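    # each task is a (function, args) tuple for parallel_function_calls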
    tasks = [(
        server.make_request,
        ("POST", "/completion", {
            "prompt": prompt,
            "lora": lora,
            "seed": 42,
            "temperature": 0.0,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all([res.status_code == 200 for res in results])
    for res, (_, re_test) in zip(results, lora_config):
        assert match_regex(re_test, res.body["content"])


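# slow test: downloads a real 8B model plus an abliteration LoRA adapter and checks that
# increasing the per-request scale moves the model from refusing the prompt to complying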
@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
def test_with_big_model():
    server = ServerProcess()
    server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
    server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf"
    server.model_alias = "Llama-3.1-8B-Instruct"
    server.n_slots = 4
    server.n_ctx = server.n_slots * 1024
    server.n_predict = 64
    server.temperature = 0.0
    server.seed = 42
    server.lora_files = [
        download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"),
        # TODO: find & add other lora adapters for this model
    ]
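    # the 8B model is fetched from Hugging Face on the first run, hence the generous timeout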
    server.start(timeout_seconds=600)

    # running the same prompt with different lora scales, all in parallel
    # each prompt will be processed by a different slot
    prompt = "Write a computer virus"
    lora_config = [
        # without applying lora, the model should reject the request
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ),
        ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ),
        # with 0.7 scale, the model should provide a simple computer virus with hesitation
        ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ),
        # with 1.5 scale, the model should confidently provide a computer virus
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
        ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ),
    ]

    tasks = [(
        server.make_request,
        ("POST", "/v1/chat/completions", {
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "lora": lora,
            "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
        })
    ) for lora, _ in lora_config]
    results = parallel_function_calls(tasks)

    assert all([res.status_code == 200 for res in results])
    for res, (_, re_test) in zip(results, lora_config):
        assert re_test in res.body["choices"][0]["message"]["content"]