#include "server-common.h"
#include "server-task.h"

#include "common.h"
#include "llama.h"
#include "chat.h"
#include "sampling.h"
#include "speculative.h"
#include "json-schema-to-grammar.h"

using json = nlohmann::ordered_json;

//
// task_params
//

  17json task_params::format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const {
  18    json data = json::array();
  19    for (const auto & lb : logit_bias) {
  20        data.push_back(json{
  21            {"bias", lb.bias},
  22            {"token", lb.token},
  23        });
  24    }
  25    return data;
  26}
  27
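// Serialize the effective generation settings. With only_metrics = true, request-content
// fields (stop strings, grammar, grammar triggers, logit bias, DRY sequence breakers,
// preserved tokens) are omitted.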
  28json task_params::to_json(bool only_metrics) const {
  29    std::vector<std::string> samplers;
  30    samplers.reserve(sampling.samplers.size());
  31    for (const auto & sampler : sampling.samplers) {
  32        samplers.emplace_back(common_sampler_type_to_str(sampler));
  33    }
  34
  35    json lora = json::array();
  36    for (auto & it : this->lora) {
  37        lora.push_back({{"id", it.first}, {"scale", it.second}});
  38    }
  39
  40    if (only_metrics) {
  41        return json {
  42            {"seed",                      sampling.seed},
  43            {"temperature",               sampling.temp},
  44            {"dynatemp_range",            sampling.dynatemp_range},
  45            {"dynatemp_exponent",         sampling.dynatemp_exponent},
  46            {"top_k",                     sampling.top_k},
  47            {"top_p",                     sampling.top_p},
  48            {"min_p",                     sampling.min_p},
  49            {"top_n_sigma",               sampling.top_n_sigma},
  50            {"xtc_probability",           sampling.xtc_probability},
  51            {"xtc_threshold",             sampling.xtc_threshold},
  52            {"typical_p",                 sampling.typ_p},
  53            {"repeat_last_n",             sampling.penalty_last_n},
  54            {"repeat_penalty",            sampling.penalty_repeat},
  55            {"presence_penalty",          sampling.penalty_present},
  56            {"frequency_penalty",         sampling.penalty_freq},
  57            {"dry_multiplier",            sampling.dry_multiplier},
  58            {"dry_base",                  sampling.dry_base},
  59            {"dry_allowed_length",        sampling.dry_allowed_length},
  60            {"dry_penalty_last_n",        sampling.dry_penalty_last_n},
  61            {"mirostat",                  sampling.mirostat},
  62            {"mirostat_tau",              sampling.mirostat_tau},
  63            {"mirostat_eta",              sampling.mirostat_eta},
  64            {"max_tokens",                n_predict},
  65            {"n_predict",                 n_predict}, // TODO: deduplicate?
  66            {"n_keep",                    n_keep},
  67            {"n_discard",                 n_discard},
  68            {"ignore_eos",                sampling.ignore_eos},
  69            {"stream",                    stream},
  70            {"n_probs",                   sampling.n_probs},
  71            {"min_keep",                  sampling.min_keep},
  72            {"chat_format",               common_chat_format_name(chat_parser_params.format)},
  73            {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
  74            {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
  75            {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
  76            {"samplers",                  samplers},
  77            {"speculative.n_max",         speculative.n_max},
  78            {"speculative.n_min",         speculative.n_min},
  79            {"speculative.p_min",         speculative.p_min},
  80            {"speculative.type",          common_speculative_type_to_str(speculative.type)},
  81            {"speculative.ngram_size_n",  speculative.ngram_size_n},
  82            {"speculative.ngram_size_m",  speculative.ngram_size_m},
  83            {"speculative.ngram_m_hits",  speculative.ngram_min_hits},
  84            {"timings_per_token",         timings_per_token},
  85            {"post_sampling_probs",       post_sampling_probs},
  86            {"backend_sampling",          sampling.backend_sampling},
  87            {"lora",                      lora},
  88        };
  89    }
  90
  91    auto grammar_triggers = json::array();
  92    for (const auto & trigger : sampling.grammar_triggers) {
  93        server_grammar_trigger ct(trigger);
  94        grammar_triggers.push_back(ct.to_json());
  95    }
  96
  97    return json {
  98        {"seed",                      sampling.seed},
  99        {"temperature",               sampling.temp},
 100        {"dynatemp_range",            sampling.dynatemp_range},
 101        {"dynatemp_exponent",         sampling.dynatemp_exponent},
 102        {"top_k",                     sampling.top_k},
 103        {"top_p",                     sampling.top_p},
 104        {"min_p",                     sampling.min_p},
 105        {"top_n_sigma",               sampling.top_n_sigma},
 106        {"xtc_probability",           sampling.xtc_probability},
 107        {"xtc_threshold",             sampling.xtc_threshold},
 108        {"typical_p",                 sampling.typ_p},
 109        {"repeat_last_n",             sampling.penalty_last_n},
 110        {"repeat_penalty",            sampling.penalty_repeat},
 111        {"presence_penalty",          sampling.penalty_present},
 112        {"frequency_penalty",         sampling.penalty_freq},
 113        {"dry_multiplier",            sampling.dry_multiplier},
 114        {"dry_base",                  sampling.dry_base},
 115        {"dry_allowed_length",        sampling.dry_allowed_length},
 116        {"dry_penalty_last_n",        sampling.dry_penalty_last_n},
 117        {"dry_sequence_breakers",     sampling.dry_sequence_breakers},
 118        {"mirostat",                  sampling.mirostat},
 119        {"mirostat_tau",              sampling.mirostat_tau},
 120        {"mirostat_eta",              sampling.mirostat_eta},
 121        {"stop",                      antiprompt},
 122        {"max_tokens",                n_predict},
 123        {"n_predict",                 n_predict}, // TODO: deduplicate?
 124        {"n_keep",                    n_keep},
 125        {"n_discard",                 n_discard},
 126        {"ignore_eos",                sampling.ignore_eos},
 127        {"stream",                    stream},
 128        {"logit_bias",                format_logit_bias(sampling.logit_bias)},
 129        {"n_probs",                   sampling.n_probs},
 130        {"min_keep",                  sampling.min_keep},
 131        {"grammar",                   sampling.grammar},
 132        {"grammar_lazy",              sampling.grammar_lazy},
 133        {"grammar_triggers",          grammar_triggers},
 134        {"preserved_tokens",          sampling.preserved_tokens},
 135        {"chat_format",               common_chat_format_name(chat_parser_params.format)},
 136        {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
 137        {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
 138        {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
 139        {"samplers",                  samplers},
 140        {"speculative.n_max",         speculative.n_max},
 141        {"speculative.n_min",         speculative.n_min},
 142        {"speculative.p_min",         speculative.p_min},
 143        {"speculative.type",          common_speculative_type_to_str(speculative.type)},
 144        {"speculative.ngram_size_n",  speculative.ngram_size_n},
 145        {"speculative.ngram_size_m",  speculative.ngram_size_m},
 146        {"speculative.ngram_m_hits",  speculative.ngram_min_hits},
 147        {"timings_per_token",         timings_per_token},
 148        {"post_sampling_probs",       post_sampling_probs},
 149        {"backend_sampling",          sampling.backend_sampling},
 150        {"lora",                      lora},
 151    };
 152}
 153
 154//
 155// task_result_state
 156//
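// Append `text_added` to the accumulated generation, re-parse the full text as a chat
// message, and record the diffs relative to the previously parsed message.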
 157common_chat_msg task_result_state::update_chat_msg(
 158        const std::string & text_added,
 159        bool is_partial,
 160        std::vector<common_chat_msg_diff> & diffs) {
 161    generated_text += text_added;
 162    auto msg_prv_copy = chat_msg;
 163    SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
 164    auto new_msg = common_chat_parse(
 165        generated_text,
 166        is_partial,
 167        chat_parser_params);
 168    if (!new_msg.empty()) {
 169        new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
 170        chat_msg = new_msg;
        diffs = common_chat_msg_diff::compute_diffs(msg_prv_copy, new_msg);
 172    }
 173    return chat_msg;
 174}
 175
 176//
 177// server_task
 178//
 179
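// Build the per-request task_params from a completion request body, falling back to the
// server-wide defaults (params_base) for any field the request does not set.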
 180task_params server_task::params_from_json_cmpl(
 181        const llama_vocab * vocab,
 182        const common_params & params_base,
 183        const int n_ctx_slot,
 184        const json & data) {
 185    task_params params;
 186
    // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
 188    task_params defaults;
 189    defaults.sampling      = params_base.sampling;
 190    defaults.speculative   = params_base.speculative;
 191    defaults.n_keep        = params_base.n_keep;
 192    defaults.n_predict     = params_base.n_predict;
 193    defaults.n_cache_reuse = params_base.n_cache_reuse;
 194    defaults.cache_prompt  = params_base.cache_prompt;
 195    defaults.antiprompt    = params_base.antiprompt;
 196
 197    // enabling this will output extra debug information in the HTTP responses from the server
 198    params.verbose           = params_base.verbosity > 9;
 199    params.timings_per_token = json_value(data, "timings_per_token", false);
 200
 201    params.stream           = json_value(data,       "stream",             false);
 202    auto stream_opt         = json_value(data,       "stream_options",     json::object());
 203    params.include_usage    = json_value(stream_opt, "include_usage",      false);
 204    params.cache_prompt     = json_value(data,       "cache_prompt",       defaults.cache_prompt);
 205    params.return_tokens    = json_value(data,       "return_tokens",      false);
 206    params.return_progress  = json_value(data,       "return_progress",    false);
 207    params.n_predict        = json_value(data,       "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
 208    params.n_indent         = json_value(data,       "n_indent",           defaults.n_indent);
 209    params.n_keep           = json_value(data,       "n_keep",             defaults.n_keep);
 210    params.n_discard        = json_value(data,       "n_discard",          defaults.n_discard);
 211    params.n_cmpl           = json_value(data,       "n_cmpl",             json_value(data, "n", 1));
 212    params.n_cache_reuse    = json_value(data,       "n_cache_reuse",      defaults.n_cache_reuse);
 213    //params.t_max_prompt_ms  = json_value(data,       "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
 214    params.t_max_predict_ms = json_value(data,       "t_max_predict_ms",   defaults.t_max_predict_ms);
 215    params.response_fields  = json_value(data,       "response_fields",    std::vector<std::string>());
 216
 217    params.sampling.top_k              = json_value(data, "top_k",               defaults.sampling.top_k);
 218    params.sampling.top_p              = json_value(data, "top_p",               defaults.sampling.top_p);
 219    params.sampling.min_p              = json_value(data, "min_p",               defaults.sampling.min_p);
 220    params.sampling.top_n_sigma        = json_value(data, "top_n_sigma",         defaults.sampling.top_n_sigma);
 221    params.sampling.xtc_probability    = json_value(data, "xtc_probability",     defaults.sampling.xtc_probability);
 222    params.sampling.xtc_threshold      = json_value(data, "xtc_threshold",       defaults.sampling.xtc_threshold);
 223    params.sampling.typ_p              = json_value(data, "typical_p",           defaults.sampling.typ_p);
 224    params.sampling.temp               = json_value(data, "temperature",         defaults.sampling.temp);
 225    params.sampling.dynatemp_range     = json_value(data, "dynatemp_range",      defaults.sampling.dynatemp_range);
 226    params.sampling.dynatemp_exponent  = json_value(data, "dynatemp_exponent",   defaults.sampling.dynatemp_exponent);
 227    params.sampling.penalty_last_n     = json_value(data, "repeat_last_n",       defaults.sampling.penalty_last_n);
 228    params.sampling.penalty_repeat     = json_value(data, "repeat_penalty",      defaults.sampling.penalty_repeat);
 229    params.sampling.penalty_freq       = json_value(data, "frequency_penalty",   defaults.sampling.penalty_freq);
 230    params.sampling.penalty_present    = json_value(data, "presence_penalty",    defaults.sampling.penalty_present);
 231    params.sampling.dry_multiplier     = json_value(data, "dry_multiplier",      defaults.sampling.dry_multiplier);
 232    params.sampling.dry_base           = json_value(data, "dry_base",            defaults.sampling.dry_base);
 233    params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length",  defaults.sampling.dry_allowed_length);
 234    params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n",  defaults.sampling.dry_penalty_last_n);
 235    params.sampling.mirostat           = json_value(data, "mirostat",            defaults.sampling.mirostat);
 236    params.sampling.mirostat_tau       = json_value(data, "mirostat_tau",        defaults.sampling.mirostat_tau);
 237    params.sampling.mirostat_eta       = json_value(data, "mirostat_eta",        defaults.sampling.mirostat_eta);
 238    params.sampling.adaptive_target    = json_value(data, "adaptive_target",     defaults.sampling.adaptive_target);
 239    params.sampling.adaptive_decay     = json_value(data, "adaptive_decay",      defaults.sampling.adaptive_decay);
 240    params.sampling.seed               = json_value(data, "seed",                defaults.sampling.seed);
 241    params.sampling.n_probs            = json_value(data, "n_probs",             defaults.sampling.n_probs);
 242    params.sampling.min_keep           = json_value(data, "min_keep",            defaults.sampling.min_keep);
 243    params.sampling.backend_sampling   = json_value(data, "backend_sampling",    defaults.sampling.backend_sampling);
 244    params.post_sampling_probs         = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);
 245
 246    params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
 247    params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
 248    params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 249
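    // keep the draft range consistent: 0 <= n_min <= n_max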
 250    params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
 251    params.speculative.n_min = std::max(params.speculative.n_min, 0);
 252    params.speculative.n_max = std::max(params.speculative.n_max, 0);
 253
 254    params.speculative.type = common_speculative_type_from_name(json_value(data, "speculative.type", common_speculative_type_to_str(defaults.speculative.type)));
 255
 256    params.speculative.ngram_size_n     = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n);
 257    params.speculative.ngram_size_m     = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m);
 258    params.speculative.ngram_min_hits   = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits);
 259
    // clamp the n-gram lookup parameters to the range [1, 1024]
    params.speculative.ngram_size_n   = std::max(std::min((int) params.speculative.ngram_size_n,   1024), 1);
    params.speculative.ngram_size_m   = std::max(std::min((int) params.speculative.ngram_size_m,   1024), 1);
    params.speculative.ngram_min_hits = std::max(std::min((int) params.speculative.ngram_min_hits, 1024), 1);
 263
 264    // Use OpenAI API logprobs only if n_probs wasn't provided
    if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs) {
 266        params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
 267    }
 268
 269    if (data.contains("lora")) {
 270        if (data.at("lora").is_array()) {
 271            params.lora = parse_lora_request(data.at("lora"));
 272        } else {
 273            throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
 274        }
 275    } else {
 276        params.lora = {};
 277    }
 278
 279    // TODO: add more sanity checks for the input parameters
 280
 281    if (params.sampling.penalty_last_n < -1) {
 282        throw std::runtime_error("Error: repeat_last_n must be >= -1");
 283    }
 284
 285    if (params.sampling.dry_penalty_last_n < -1) {
 286        throw std::runtime_error("Error: dry_penalty_last_n must be >= -1");
 287    }
 288
 289    if (params.sampling.penalty_last_n == -1) {
 290        // note: should be the slot's context and not the full context, but it's ok
 291        params.sampling.penalty_last_n = n_ctx_slot;
 292    }
 293
 294    if (params.sampling.dry_penalty_last_n == -1) {
 295        params.sampling.dry_penalty_last_n = n_ctx_slot;
 296    }
 297
 298    if (params.sampling.dry_base < 1.0f) {
 299        params.sampling.dry_base = defaults.sampling.dry_base;
 300    }
 301
 302    // sequence breakers for DRY
 303    {
        // Currently, this is not compatible with the format used by TextGen WebUI, Koboldcpp, and SillyTavern
 305        // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
 306
 307        if (data.contains("dry_sequence_breakers")) {
 308            params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
 309            if (params.sampling.dry_sequence_breakers.empty()) {
 310                throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings");
 311            }
 312        }
 313    }
 314
 315    // process "json_schema" and "grammar"
 316    if (data.contains("json_schema") && !data.contains("grammar")) {
 317        try {
 318            auto schema                  = json_value(data, "json_schema", json::object());
 319            SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
 320            params.sampling.grammar      = json_schema_to_grammar(schema);
 321            SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
 322        } catch (const std::exception & e) {
 323            throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
 324        }
 325    } else {
 326        params.sampling.grammar      = json_value(data, "grammar", defaults.sampling.grammar);
 327        SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
 328        params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
 329        SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
 330    }
 331
 332    {
 333        auto it = data.find("chat_format");
 334        if (it != data.end()) {
 335            params.chat_parser_params.format = static_cast<common_chat_format>(it->get<int>());
 336            SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format));
 337        } else {
 338            params.chat_parser_params.format = defaults.chat_parser_params.format;
 339        }
 340        common_reasoning_format reasoning_format = params_base.reasoning_format;
 341        if (data.contains("reasoning_format")) {
 342            reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
 343        }
 344        params.chat_parser_params.reasoning_format = reasoning_format;
 345        params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
 346        params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
 347        params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
 348        if (data.contains("chat_parser")) {
 349            params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
 350        }
 351    }
 352
 353    {
 354        const auto preserved_tokens = data.find("preserved_tokens");
 355        if (preserved_tokens != data.end()) {
 356            for (const auto & t : *preserved_tokens) {
 357                auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
 358                if (ids.size() == 1) {
 359                    SRV_DBG("Preserved token: %d\n", ids[0]);
 360                    params.sampling.preserved_tokens.insert(ids[0]);
 361                } else {
 362                    // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
 363                    SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
 364                }
 365            }
 366        }
 367        const auto grammar_triggers = data.find("grammar_triggers");
 368        if (grammar_triggers != data.end()) {
 369            for (const auto & t : *grammar_triggers) {
 370                server_grammar_trigger ct(t);
 371                if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
 372                    const auto & word = ct.value.value;
 373                    auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
 374                    if (ids.size() == 1) {
 375                        auto token = ids[0];
                        if (params.sampling.preserved_tokens.find((llama_token) token) == params.sampling.preserved_tokens.end()) {
 377                            throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
 378                        }
 379                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
 380                        common_grammar_trigger trigger;
 381                        trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
 382                        trigger.value = word;
 383                        trigger.token = token;
 384                        params.sampling.grammar_triggers.push_back(std::move(trigger));
 385                    } else {
 386                        SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
 387                        params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
 388                    }
 389                } else {
 390                    if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) {
 391                        SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str());
 392                    } else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) {
 393                        SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str());
 394                    } else {
 395                        throw std::runtime_error("Unknown grammar trigger type");
 396                    }
 397                    params.sampling.grammar_triggers.emplace_back(std::move(ct.value));
 398                }
 399            }
 400        }
 401        if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
 402            throw std::runtime_error("Error: no triggers set for lazy grammar!");
 403        }
 404    }
 405
 406    {
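        // "logit_bias" accepts either an array of [token-id-or-string, bias] pairs or an
        // object mapping token ids (as strings) or token text to a bias; a bias of `false`
        // bans the matching token(s) by assigning -inf.
        // Illustrative payloads: [[123, -0.5], ["Hello", false]]  or  {"123": -0.5}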
 407        params.sampling.logit_bias.clear();
 408
 409        const auto & logit_bias = data.find("logit_bias");
 410        if (logit_bias != data.end() && logit_bias->is_array()) {
 411            const int n_vocab = llama_vocab_n_tokens(vocab);
 412            for (const auto & el : *logit_bias) {
 413                // TODO: we may want to throw errors here, in case "el" is incorrect
 414                if (el.is_array() && el.size() == 2) {
 415                    float bias;
 416                    if (el[1].is_number()) {
 417                        bias = el[1].get<float>();
 418                    } else if (el[1].is_boolean() && !el[1].get<bool>()) {
 419                        bias = -INFINITY;
 420                    } else {
 421                        continue;
 422                    }
 423
 424                    if (el[0].is_number_integer()) {
 425                        llama_token tok = el[0].get<llama_token>();
 426                        if (tok >= 0 && tok < n_vocab) {
 427                            params.sampling.logit_bias.push_back({tok, bias});
 428                        }
 429                    } else if (el[0].is_string()) {
 430                        auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
 431                        for (auto tok : toks) {
 432                            params.sampling.logit_bias.push_back({tok, bias});
 433                        }
 434                    }
 435                }
 436            }
 437        } else if (logit_bias != data.end() && logit_bias->is_object()) {
 438            const int n_vocab = llama_vocab_n_tokens(vocab);
 439            for (const auto & el : logit_bias->items()) {
 440                float bias;
 441                const auto & key = el.key();
 442                const auto & value = el.value();
 443                if (value.is_number()) {
 444                    bias = value.get<float>();
 445                } else if (value.is_boolean() && !value.get<bool>()) {
 446                    bias = -INFINITY;
 447                } else {
 448                    continue;
 449                }
 450
 451                char *end;
 452                llama_token tok = strtol(key.c_str(), &end, 10);
 453                if (*end == 0) {
 454                    if (tok >= 0 && tok < n_vocab) {
 455                        params.sampling.logit_bias.push_back({tok, bias});
 456                    }
 457                } else {
 458                    auto toks = common_tokenize(vocab, key, false);
 459                    for (auto tok : toks) {
 460                        params.sampling.logit_bias.push_back({tok, bias});
 461                    }
 462                }
 463            }
 464        }
 465
 466        params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
 467        if (params.sampling.ignore_eos) {
 468            params.sampling.logit_bias.insert(
 469                    params.sampling.logit_bias.end(),
 470                    defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
 471        }
 472    }
 473
 474    {
 475        params.antiprompt.clear();
 476
 477        const auto & stop = data.find("stop");
 478        if (stop != data.end() && stop->is_array()) {
 479            for (const auto & word : *stop) {
 480                if (!word.empty()) {
 481                    params.antiprompt.push_back(word);
 482                }
 483            }
 484        }
 485        // set reverse prompt from cli args if not set in the request
 486        if (params.antiprompt.empty()) {
 487            params.antiprompt = defaults.antiprompt;
 488        }
 489    }
 490
 491    {
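        // "samplers" may be given either as an array of sampler names or as a string of
        // single-character sampler codes; when absent, the server default sampler chain is used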
 492        const auto samplers = data.find("samplers");
 493        if (samplers != data.end()) {
 494            if (samplers->is_array()) {
 495                params.sampling.samplers = common_sampler_types_from_names(*samplers, false);
 496            } else if (samplers->is_string()){
 497                params.sampling.samplers = common_sampler_types_from_chars(samplers->get<std::string>());
 498            }
 499        } else {
 500            params.sampling.samplers = defaults.sampling.samplers;
 501        }
 502    }
 503
 504    if (params.n_cmpl > params_base.n_parallel) {
 505        throw std::runtime_error("n_cmpl cannot be greater than the number of slots, please increase -np");
 506    }
 507
 508    return params;
 509}
 510
 511//
 512// result_timings
 513//
 514
 515json result_timings::to_json() const {
 516    json base = {
 517        {"cache_n",                cache_n},
 518
 519        {"prompt_n",               prompt_n},
 520        {"prompt_ms",              prompt_ms},
 521        {"prompt_per_token_ms",    prompt_per_token_ms},
 522        {"prompt_per_second",      prompt_per_second},
 523
 524        {"predicted_n",            predicted_n},
 525        {"predicted_ms",           predicted_ms},
 526        {"predicted_per_token_ms", predicted_per_token_ms},
 527        {"predicted_per_second",   predicted_per_second},
 528    };
 529
 530    if (draft_n > 0) {
 531        base["draft_n"] = draft_n;
 532        base["draft_n_accepted"] = draft_n_accepted;
 533    }
 534
 535    return base;
 536}
 537
 538//
 539// result_prompt_progress
 540//
 541json result_prompt_progress::to_json() const {
 542    return json {
 543        {"total",     total},
 544        {"cache",     cache},
 545        {"processed", processed},
 546        {"time_ms",   time_ms},
 547    };
 548}
 549
 550static inline std::string stop_type_to_str(stop_type type) {
 551    switch (type) {
 552        case STOP_TYPE_EOS:   return "eos";
 553        case STOP_TYPE_WORD:  return "word";
 554        case STOP_TYPE_LIMIT: return "limit";
 555        default:              return "none";
 556    }
 557}
 558
 559//
 560// completion_token_output
 561//
 562
 563json completion_token_output::to_json(bool post_sampling_probs) const {
 564    json probs_for_token = json::array();
 565    for (const auto & p : probs) {
 566        std::string txt(p.txt);
 567        txt.resize(validate_utf8(txt));
 568        probs_for_token.push_back(json {
 569            {"id",      p.tok},
 570            {"token",   txt},
 571            {"bytes",   str_to_bytes(p.txt)},
 572            {
 573                post_sampling_probs ? "prob" : "logprob",
 574                post_sampling_probs ? p.prob : logarithm(p.prob)
 575            },
 576        });
 577    }
 578    return probs_for_token;
 579}
 580
 581json completion_token_output::probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs) {
 582    json out = json::array();
 583    for (const auto & p : probs) {
 584        std::string txt(p.text_to_send);
 585        txt.resize(validate_utf8(txt));
 586        out.push_back(json {
 587            {"id",           p.tok},
 588            {"token",        txt},
 589            {"bytes",        str_to_bytes(p.text_to_send)},
 590            {
 591                post_sampling_probs ? "prob" : "logprob",
 592                post_sampling_probs ? p.prob : logarithm(p.prob)
 593            },
 594            {
 595                post_sampling_probs ? "top_probs" : "top_logprobs",
 596                p.to_json(post_sampling_probs)
 597            },
 598        });
 599    }
 600    return out;
 601}
 602
 603float completion_token_output::logarithm(float x) {
 604    // nlohmann::json converts -inf to null, so we need to prevent that
 605    return x == 0.0f ? std::numeric_limits<float>::lowest() : std::log(x);
 606}
 607
 608std::vector<unsigned char> completion_token_output::str_to_bytes(const std::string & str) {
 609    std::vector<unsigned char> bytes;
 610    for (unsigned char c : str) {
 611        bytes.push_back(c);
 612    }
 613    return bytes;
 614}
 615
 616//
 617// server_task_result_cmpl_final
 618//
 619json server_task_result_cmpl_final::to_json() {
 620    GGML_ASSERT(is_updated && "update() must be called before to_json()");
 621    switch (res_type) {
 622        case TASK_RESPONSE_TYPE_NONE:
 623            return to_json_non_oaicompat();
 624        case TASK_RESPONSE_TYPE_OAI_CMPL:
 625            return to_json_oaicompat();
 626        case TASK_RESPONSE_TYPE_OAI_CHAT:
 627            return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
 628        case TASK_RESPONSE_TYPE_OAI_RESP:
 629            return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
 630        case TASK_RESPONSE_TYPE_ANTHROPIC:
 631            return stream ? to_json_anthropic_stream() : to_json_anthropic();
 632        default:
 633            GGML_ASSERT(false && "Invalid task_response_type");
 634    }
 635}
 636
 637json server_task_result_cmpl_final::to_json_non_oaicompat() {
 638    json res = json {
 639        {"index",               index},
 640        {"content",             content},
 641        {"tokens",              tokens},
 642        {"id_slot",             id_slot},
 643        {"stop",                true},
 644        {"model",               oaicompat_model},
 645        {"tokens_predicted",    n_decoded},
 646        {"tokens_evaluated",    n_prompt_tokens},
 647        {"generation_settings", generation_params.to_json()},
 648        {"prompt",              prompt},
 649        {"has_new_line",        has_new_line},
 650        {"truncated",           truncated},
 651        {"stop_type",           stop_type_to_str(stop)},
 652        {"stopping_word",       stopping_word},
 653        {"tokens_cached",       n_tokens_cached},
 654        {"timings",             timings.to_json()},
 655    };
 656    if (!stream && !probs_output.empty()) {
 657        res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
 658    }
 659    return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
 660}
 661
 662json server_task_result_cmpl_final::to_json_oaicompat() {
 663    std::time_t t = std::time(0);
 664    json logprobs = json(nullptr); // OAI default to null
 665    if (!stream && probs_output.size() > 0) {
 666        logprobs = json{
 667            {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
 668        };
 669    }
 670    json finish_reason = "length";
 671    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
 672        finish_reason = "stop";
 673    }
 674    json res = json {
 675        {"choices",            json::array({
 676            json{
 677                {"text",          content},
 678                {"index",         index},
 679                {"logprobs",      logprobs},
 680                {"finish_reason", finish_reason},
 681            }
 682        })},
 683        {"created",            t},
 684        {"model",              oaicompat_model},
 685        {"system_fingerprint", build_info},
 686        {"object",             "text_completion"},
 687        {"usage", json {
 688            {"completion_tokens", n_decoded},
 689            {"prompt_tokens",     n_prompt_tokens},
 690            {"total_tokens",      n_decoded + n_prompt_tokens}
 691        }},
 692        {"id", oaicompat_cmpl_id}
 693    };
 694
 695    // extra fields for debugging purposes
 696    if (verbose) {
 697        res["__verbose"] = to_json_non_oaicompat();
 698    }
 699    if (timings.prompt_n >= 0) {
 700        res.push_back({"timings", timings.to_json()});
 701    }
 702
 703    return res;
 704}
 705
 706json server_task_result_cmpl_final::to_json_oaicompat_chat() {
 707    std::string finish_reason = "length";
 708    common_chat_msg msg;
 709    if (!oaicompat_msg.empty()) {
 710        msg = oaicompat_msg;
 711    } else {
 712        msg.role = "assistant";
 713        msg.content = content;
 714    }
 715    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
 716        finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
 717    }
 718
 719    json choice {
 720        {"finish_reason", finish_reason},
 721        {"index", index},
 722        {"message", msg.to_json_oaicompat()},
 723    };
 724
 725    if (!stream && probs_output.size() > 0) {
 726        choice["logprobs"] = json{
 727            {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
 728        };
 729    }
 730
 731    std::time_t t = std::time(0);
 732
 733    json res = json {
 734        {"choices",            json::array({choice})},
 735        {"created",            t},
 736        {"model",              oaicompat_model},
 737        {"system_fingerprint", build_info},
 738        {"object",             "chat.completion"},
 739        {"usage", json {
 740            {"completion_tokens", n_decoded},
 741            {"prompt_tokens",     n_prompt_tokens},
 742            {"total_tokens",      n_decoded + n_prompt_tokens}
 743        }},
 744        {"id", oaicompat_cmpl_id}
 745    };
 746
 747    // extra fields for debugging purposes
 748    if (verbose) {
 749        res["__verbose"] = to_json_non_oaicompat();
 750    }
 751    if (timings.prompt_n >= 0) {
 752        res.push_back({"timings", timings.to_json()});
 753    }
 754
 755    return res;
 756}
 757
 758json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
 759    std::time_t t = std::time(0);
 760    std::string finish_reason = "length";
 761    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
 762        finish_reason = oaicompat_msg.tool_calls.empty() ? "stop" : "tool_calls";
 763    }
 764
 765    json deltas = json::array();
 766    for (const auto & diff : oaicompat_msg_diffs) {
 767        deltas.push_back({
 768            {"choices", json::array({
 769                json {
 770                    {"finish_reason", nullptr},
 771                    {"index", 0},
 772                    {"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
 773                },
 774            })},
 775            {"created", t},
 776            {"id", oaicompat_cmpl_id},
 777            {"model", oaicompat_model},
 778            {"system_fingerprint", build_info},
 779            {"object", "chat.completion.chunk"},
 780        });
 781    }
 782
 783    deltas.push_back({
 784        {"choices", json::array({
 785            json {
 786                {"finish_reason", finish_reason},
 787                {"index", 0},
 788                {"delta", json::object()},
 789            },
 790        })},
 791        {"created",            t},
 792        {"id",                 oaicompat_cmpl_id},
 793        {"model",              oaicompat_model},
 794        {"system_fingerprint", build_info},
 795        {"object",             "chat.completion.chunk"},
 796    });
 797
 798    if (include_usage) {
 799        // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
 800        // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
 801        deltas.push_back({
 802            {"choices", json::array()},
 803            {"created",            t},
 804            {"id",                 oaicompat_cmpl_id},
 805            {"model",              oaicompat_model},
 806            {"system_fingerprint", build_info},
 807            {"object",             "chat.completion.chunk"},
 808            {"usage", json {
 809                {"completion_tokens", n_decoded},
 810                {"prompt_tokens",     n_prompt_tokens},
 811                {"total_tokens",      n_decoded + n_prompt_tokens},
 812            }},
 813        });
 814    }
 815
 816    if (timings.prompt_n >= 0) {
 817        deltas.back().push_back({"timings", timings.to_json()});
 818    }
 819
 820    // extra fields for debugging purposes
 821    if (verbose && !deltas.empty()) {
 822        deltas.front()["__verbose"] = to_json_non_oaicompat();
 823    }
 824
 825    return deltas;
 826}
 827
 828json server_task_result_cmpl_final::to_json_oaicompat_resp() {
 829    common_chat_msg msg;
 830    if (!oaicompat_msg.empty()) {
 831        msg = oaicompat_msg;
 832    } else {
 833        msg.role = "assistant";
 834        msg.content = content;
 835    }
 836
 837    std::vector<json> output;
 838
 839    if (msg.reasoning_content != "") {
 840        output.push_back(json {
 841            {"id",      "rs_" + random_string()},
 842            {"summary", json::array()},
 843            {"type",    "reasoning"},
 844            {"content", json::array({ json {
 845                {"text", msg.reasoning_content},
 846                {"type", "reasoning_text"},
 847            }})},
 848            {"encrypted_content", ""},
 849            {"status",            "completed"},
 850        });
 851    }
 852
 853    if (msg.content != "") {
 854        output.push_back(json {
 855            {"content", json::array({ json {
 856                {"type",        "output_text"},
 857                {"annotations", json::array()},
 858                {"logprobs",    json::array()},
 859                {"text",        msg.content},
 860            }})},
 861            {"id",     "msg_" + random_string()},
 862            {"role",   msg.role},
 863            {"status", "completed"},
 864            {"type",   "message"},
 865        });
 866    }
 867
 868    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
 869        output.push_back(json {
 870            {"type",      "function_call"},
 871            {"status",    "completed"},
 872            {"arguments", tool_call.arguments},
 873            {"call_id",   "fc_" + tool_call.id},
 874            {"name",      tool_call.name},
 875        });
 876    }
 877
 878    std::time_t t = std::time(0);
 879    json res = {
 880        {"completed_at", t},
 881        {"created_at",   t},
 882        {"id",           oai_resp_id},
 883        {"model",        oaicompat_model},
 884        {"object",       "response"},
 885        {"output",       output},
 886        {"status",       "completed"},
 887        {"usage",        json {
 888            {"input_tokens",  n_prompt_tokens},
 889            {"output_tokens", n_decoded},
 890            {"total_tokens",  n_decoded + n_prompt_tokens},
 891        }},
 892    };
 893
 894    return res;
 895}
 896
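// Emit the final OpenAI Responses API server-sent events: *.done events for each completed
// output item (reasoning, text message, function calls), followed by a response.completed
// event carrying the full response object and usage.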
 897json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
 898    std::vector<json> server_sent_events;
 899    std::vector<json> output;
 900
 901    if (oaicompat_msg.reasoning_content != "") {
 902        const json output_item = json {
 903            {"id",      oai_resp_reasoning_id},
 904            {"summary", json::array()},
 905            {"type",    "reasoning"},
 906            {"content", json::array({ json {
 907                {"text", oaicompat_msg.reasoning_content},
 908                {"type", "reasoning_text"},
 909            }})},
 910            {"encrypted_content", ""},
 911        };
 912
 913        server_sent_events.push_back(json {
 914            {"event", "response.output_item.done"},
 915            {"data", json {
 916                {"type", "response.output_item.done"},
 917                {"item", output_item}
 918            }}
 919        });
 920        output.push_back(output_item);
 921    }
 922
 923    if (oaicompat_msg.content != "") {
 924        server_sent_events.push_back(json {
 925            {"event", "response.output_text.done"},
 926            {"data", json {
 927                {"type",    "response.output_text.done"},
 928                {"item_id", oai_resp_message_id},
 929                {"text",    oaicompat_msg.content}
 930            }}
 931        });
 932
 933        const json content_part = {
 934            {"type",        "output_text"},
 935            {"annotations", json::array()},
 936            {"logprobs",    json::array()},
 937            {"text",        oaicompat_msg.content}
 938        };
 939
 940        server_sent_events.push_back(json {
 941            {"event", "response.content_part.done"},
 942            {"data", json {
 943                {"type",    "response.content_part.done"},
 944                {"item_id", oai_resp_message_id},
 945                {"part",    content_part}
 946            }}
 947        });
 948        const json output_item = {
 949            {"type",    "message"},
 950            {"status",  "completed"},
 951            {"id",      oai_resp_message_id},
 952            {"content", json::array({content_part})},
 953            {"role",    "assistant"}
 954        };
 955
 956        server_sent_events.push_back(json {
 957            {"event", "response.output_item.done"},
 958            {"data", json {
 959                {"type", "response.output_item.done"},
 960                {"item", output_item}
 961            }}
 962        });
 963        output.push_back(output_item);
 964    }
 965
 966    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
 967        const json output_item = {
 968            {"type",      "function_call"},
 969            {"status",    "completed"},
 970            {"arguments", tool_call.arguments},
 971            {"call_id",   "fc_" + tool_call.id},
 972            {"name",      tool_call.name}
 973        };
 974        server_sent_events.push_back(json {
 975            {"event", "response.output_item.done"},
 976            {"data", json {
 977                {"type", "response.output_item.done"},
 978                {"item", output_item}
 979            }}
 980        });
 981        output.push_back(output_item);
 982    }
 983
 984    std::time_t t = std::time(0);
 985    server_sent_events.push_back(json {
 986        {"event", "response.completed"},
 987        {"data", json {
 988            {"type", "response.completed"},
 989            {"response", json {
 990                {"id",         oai_resp_id},
 991                {"object",     "response"},
 992                {"created_at", t},
 993                {"status",     "completed"},
 994                {"model",      oaicompat_model},
 995                {"output",     output},
 996                {"usage",      json {
 997                    {"input_tokens",  n_prompt_tokens},
 998                    {"output_tokens", n_decoded},
 999                    {"total_tokens",  n_decoded + n_prompt_tokens}
1000                }}
1001            }},
1002        }}
1003    });
1004
1005    return server_sent_events;
1006}
1007
1008json server_task_result_cmpl_final::to_json_anthropic() {
1009    std::string stop_reason = "max_tokens";
1010    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
1011        stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
1012    }
1013
1014    json content_blocks = json::array();
1015
1016    common_chat_msg msg;
1017    if (!oaicompat_msg.empty()) {
1018        msg = oaicompat_msg;
1019    } else {
1020        msg.role = "assistant";
1021        msg.content = content;
1022    }
1023
1024    // thinking block comes first (Anthropic extended thinking format)
1025    if (!msg.reasoning_content.empty()) {
1026        content_blocks.push_back({
1027            {"type", "thinking"},
1028            {"thinking", msg.reasoning_content},
1029            {"signature", ""}  // empty signature for local models (no cryptographic verification)
1030        });
1031    }
1032
1033    if (!msg.content.empty()) {
1034        content_blocks.push_back({
1035            {"type", "text"},
1036            {"text", msg.content}
1037        });
1038    }
1039
1040    for (const auto & tool_call : msg.tool_calls) {
1041        json tool_use_block = {
1042            {"type", "tool_use"},
1043            {"id", tool_call.id},
1044            {"name", tool_call.name}
1045        };
1046
1047        try {
1048            tool_use_block["input"] = json::parse(tool_call.arguments);
1049        } catch (const std::exception &) {
1050            tool_use_block["input"] = json::object();
1051        }
1052
1053        content_blocks.push_back(tool_use_block);
1054    }
1055
1056    json res = {
1057        {"id", oaicompat_cmpl_id},
1058        {"type", "message"},
1059        {"role", "assistant"},
1060        {"content", content_blocks},
1061        {"model", oaicompat_model},
1062        {"stop_reason", stop_reason},
1063        {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)},
1064        {"usage", {
1065            {"input_tokens", n_prompt_tokens},
1066            {"output_tokens", n_decoded}
1067        }}
1068    };
1069
1070    return res;
1071}
1072
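// Emit the Anthropic streaming event sequence for the final result: content_block_start /
// content_block_delta / content_block_stop events for each content block (thinking, text,
// tool_use), followed by message_delta (stop reason + usage) and message_stop.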
1073json server_task_result_cmpl_final::to_json_anthropic_stream() {
1074    json events = json::array();
1075
1076    std::string stop_reason = "max_tokens";
1077    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
1078        stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
1079    }
1080
1081    bool has_thinking = !oaicompat_msg.reasoning_content.empty();
1082    bool has_text     = !oaicompat_msg.content.empty();
1083    size_t num_tool_calls = oaicompat_msg.tool_calls.size();
1084
1085    // content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
1086    size_t thinking_block_index = 0;
1087    size_t text_block_index     = has_thinking ? 1 : 0;
1088
1089    bool thinking_block_started = false;
1090    bool text_block_started     = false;
1091    std::unordered_set<size_t> tool_calls_started;
1092
1093    for (const auto & diff : oaicompat_msg_diffs) {
1094        // handle thinking/reasoning content
1095        if (!diff.reasoning_content_delta.empty()) {
1096            if (!thinking_block_started) {
1097                events.push_back({
1098                    {"event", "content_block_start"},
1099                    {"data", {
1100                        {"type", "content_block_start"},
1101                        {"index", thinking_block_index},
1102                        {"content_block", {
1103                            {"type", "thinking"},
1104                            {"thinking", ""}
1105                        }}
1106                    }}
1107                });
1108                thinking_block_started = true;
1109            }
1110
1111            events.push_back({
1112                {"event", "content_block_delta"},
1113                {"data", {
1114                    {"type", "content_block_delta"},
1115                    {"index", thinking_block_index},
1116                    {"delta", {
1117                        {"type", "thinking_delta"},
1118                        {"thinking", diff.reasoning_content_delta}
1119                    }}
1120                }}
1121            });
1122        }
1123
1124        // handle regular text content
1125        if (!diff.content_delta.empty()) {
1126            if (!text_block_started) {
1127                events.push_back({
1128                    {"event", "content_block_start"},
1129                    {"data", {
1130                        {"type", "content_block_start"},
1131                        {"index", text_block_index},
1132                        {"content_block", {
1133                            {"type", "text"},
1134                            {"text", ""}
1135                        }}
1136                    }}
1137                });
1138                text_block_started = true;
1139            }
1140
1141            events.push_back({
1142                {"event", "content_block_delta"},
1143                {"data", {
1144                    {"type", "content_block_delta"},
1145                    {"index", text_block_index},
1146                    {"delta", {
1147                        {"type", "text_delta"},
1148                        {"text", diff.content_delta}
1149                    }}
1150                }}
1151            });
1152        }
1153
1154        // handle tool calls
1155        if (diff.tool_call_index != std::string::npos) {
1156            size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + diff.tool_call_index;
1157
1158            if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
1159                const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];
1160
1161                events.push_back({
1162                    {"event", "content_block_start"},
1163                    {"data", {
1164                        {"type", "content_block_start"},
1165                        {"index", content_block_index},
1166                        {"content_block", {
1167                            {"type", "tool_use"},
1168                            {"id", full_tool_call.id},
1169                            {"name", full_tool_call.name}
1170                        }}
1171                    }}
1172                });
1173                tool_calls_started.insert(diff.tool_call_index);
1174            }
1175
1176            if (!diff.tool_call_delta.arguments.empty()) {
1177                events.push_back({
1178                    {"event", "content_block_delta"},
1179                    {"data", {
1180                        {"type", "content_block_delta"},
1181                        {"index", content_block_index},
1182                        {"delta", {
1183                            {"type", "input_json_delta"},
1184                            {"partial_json", diff.tool_call_delta.arguments}
1185                        }}
1186                    }}
1187                });
1188            }
1189        }
1190    }
1191
1192    // close content blocks in order
1193    if (has_thinking) {
1194        // Anthropic API requires a signature_delta before closing thinking blocks
1195        // We use an empty signature since we can't generate a cryptographic signature for local models
1196        events.push_back({
1197            {"event", "content_block_delta"},
1198            {"data", {
1199                {"type", "content_block_delta"},
1200                {"index", thinking_block_index},
1201                {"delta", {
1202                    {"type", "signature_delta"},
1203                    {"signature", ""}
1204                }}
1205            }}
1206        });
1207        events.push_back({
1208            {"event", "content_block_stop"},
1209            {"data", {
1210                {"type", "content_block_stop"},
1211                {"index", thinking_block_index}
1212            }}
1213        });
1214    }
1215
1216    if (has_text) {
1217        events.push_back({
1218            {"event", "content_block_stop"},
1219            {"data", {
1220                {"type", "content_block_stop"},
1221                {"index", text_block_index}
1222            }}
1223        });
1224    }
1225
1226    for (size_t i = 0; i < num_tool_calls; i++) {
1227        size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + i;
1228        events.push_back({
1229            {"event", "content_block_stop"},
1230            {"data", {
1231                {"type", "content_block_stop"},
1232                {"index", content_block_index}
1233            }}
1234        });
1235    }
1236
1237    events.push_back({
1238        {"event", "message_delta"},
1239        {"data", {
1240            {"type", "message_delta"},
1241            {"delta", {
1242                {"stop_reason", stop_reason},
1243                {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}
1244            }},
1245            {"usage", {
1246                {"output_tokens", n_decoded}
1247            }}
1248        }}
1249    });
1250
1251    events.push_back({
1252        {"event", "message_stop"},
1253        {"data", {
1254            {"type", "message_stop"}
1255        }}
1256    });
1257
1258    return events;
1259}

//
// server_task_result_cmpl_partial
//
void server_task_result_cmpl_partial::update(task_result_state & state) {
    is_updated = true;
    state.update_chat_msg(content, true, oaicompat_msg_diffs);

    // Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
    thinking_block_started = state.thinking_block_started;
    text_block_started     = state.text_block_started;

    oai_resp_id            = state.oai_resp_id;
    oai_resp_reasoning_id  = state.oai_resp_reasoning_id;
    oai_resp_message_id    = state.oai_resp_message_id;
    oai_resp_fc_id         = state.oai_resp_fc_id;

    // track if the accumulated message has any reasoning content
    anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();

    // Pre-compute state updates based on diffs (for next chunk)
    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) {
            state.thinking_block_started = true;
        }
        if (!diff.content_delta.empty() && !state.text_block_started) {
            state.text_block_started = true;
        }
        if (!diff.tool_call_delta.name.empty()) {
            state.oai_resp_fc_id = diff.tool_call_delta.id;
        }
    }
}

json server_task_result_cmpl_partial::to_json() {
    GGML_ASSERT(is_updated && "update() must be called before to_json()");
    switch (res_type) {
        case TASK_RESPONSE_TYPE_NONE:
            return to_json_non_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CMPL:
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return to_json_oaicompat_chat();
        case TASK_RESPONSE_TYPE_OAI_RESP:
            return to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return to_json_anthropic();
        default:
            GGML_ASSERT(false && "Invalid task_response_type");
    }
}

json server_task_result_cmpl_partial::to_json_non_oaicompat() {
    // non-OAI-compat JSON
    json res = json {
        {"index",            index},
        {"content",          content},
        {"tokens",           tokens},
        {"stop",             false},
        {"id_slot",          id_slot},
        {"tokens_predicted", n_decoded},
        {"tokens_evaluated", n_prompt_tokens},
    };
    // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
    if (timings.prompt_n > 0) {
        res.push_back({"timings", timings.to_json()});
    }
    if (is_progress) {
        res.push_back({"prompt_progress", progress.to_json()});
    }
    if (!prob_output.probs.empty()) {
        res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
    }
    return res;
}
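
// Illustrative partial chunk from to_json_non_oaicompat() (values are made up):
//
//   {"index":0,"content":"Hello","tokens":[1234],"stop":false,"id_slot":0,
//    "tokens_predicted":1,"tokens_evaluated":12}
//
// "timings", "prompt_progress" and "completion_probabilities" are added only when the
// corresponding conditions above hold.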

json server_task_result_cmpl_partial::to_json_oaicompat() {
    std::time_t t = std::time(0);
    json logprobs = json(nullptr); // OAI defaults to null
    if (prob_output.probs.size() > 0) {
        logprobs = json{
            {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
        };
    }
    json res = json {
        {"choices",            json::array({
            json{
                {"text",          content},
                {"index",         index},
                {"logprobs",      logprobs},
                {"finish_reason", nullptr},
            }
        })},
        {"created",            t},
        {"model",              oaicompat_model},
        {"system_fingerprint", build_info},
        {"object",             "text_completion"},
        {"id",                 oaicompat_cmpl_id}
    };

    // extra fields for debugging purposes
    if (verbose) {
        res["__verbose"] = to_json_non_oaicompat();
    }
    if (timings.prompt_n >= 0) {
        res.push_back({"timings", timings.to_json()});
    }
    if (is_progress) {
        res.push_back({"prompt_progress", progress.to_json()});
    }

    return res;
}
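
// Illustrative text_completion streaming chunk (values are made up):
//
//   {"choices":[{"text":" world","index":0,"logprobs":null,"finish_reason":null}],
//    "created":1700000000,"model":"some-model","system_fingerprint":"b1234",
//    "object":"text_completion","id":"cmpl-abc123"}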

json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
    bool first = n_decoded == 1;
    std::time_t t = std::time(0);
    json choices;

    std::vector<json> deltas;
    auto add_delta = [&](const json & delta) {
        deltas.push_back({
            {"choices", json::array({
                json {
                    {"finish_reason", nullptr},
                    {"index", index},
                    {"delta", delta},
                },
            })},
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
            {"system_fingerprint", build_info},
            {"object", "chat.completion.chunk"},
        });
    };
    // We have to send an initial update to conform to OpenAI behavior
    if (first || is_progress) {
        add_delta({
            {"role", "assistant"},
            {"content", nullptr},
        });
    }

    for (const auto & diff : oaicompat_msg_diffs) {
        add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
    }

    if (!deltas.empty()) {
        auto & last_json = deltas[deltas.size() - 1];
        GGML_ASSERT(last_json.at("choices").size() >= 1);

        if (prob_output.probs.size() > 0) {
            last_json.at("choices").at(0)["logprobs"] = json {
                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
            };
        }

        if (timings.prompt_n >= 0) {
            last_json.push_back({"timings", timings.to_json()});
        }
        if (is_progress) {
            last_json.push_back({"prompt_progress", progress.to_json()});
        }
    }

    return deltas;
}
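
// Illustrative chat.completion.chunk sequence (values are made up); the first chunk carries
// the assistant role, subsequent chunks carry the content/tool-call deltas:
//
//   {"choices":[{"finish_reason":null,"index":0,"delta":{"role":"assistant","content":null}}],
//    "created":1700000000,"id":"chatcmpl-abc123","model":"some-model",
//    "system_fingerprint":"b1234","object":"chat.completion.chunk"}
//
//   {"choices":[{"finish_reason":null,"index":0,"delta":{"content":"Hello"}}], ...}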

json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
    std::vector<json> events;

    if (n_decoded == 1) {
        events.push_back(json {
            {"event", "response.created"},
            {"data", json {
                {"type", "response.created"},
                {"response", json {
                    {"id",     oai_resp_id},
                    {"object", "response"},
                    {"status", "in_progress"},
                }},
            }},
        });
        events.push_back(json {
            {"event", "response.in_progress"},
            {"data", json {
                {"type", "response.in_progress"},
                {"response", json {
                    {"id",     oai_resp_id},
                    {"object", "response"},
                    {"status", "in_progress"},
                }},
            }},
        });
    }

    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty()) {
            if (!thinking_block_started) {
                events.push_back(json {
                    {"event", "response.output_item.added"},
                    {"data", json {
                        {"type", "response.output_item.added"},
                        {"item", json {
                            {"id",                oai_resp_reasoning_id},
                            {"summary",           json::array()},
                            {"type",              "reasoning"},
                            {"content",           json::array()},
                            {"encrypted_content", ""},
                            {"status",            "in_progress"},
                        }},
                    }},
                });
                thinking_block_started = true;
            }
            events.push_back(json {
                {"event", "response.reasoning_text.delta"},
                {"data", json {
                    {"type",    "response.reasoning_text.delta"},
                    {"delta",   diff.reasoning_content_delta},
                    {"item_id", oai_resp_reasoning_id},
                }},
            });
        }

        if (!diff.content_delta.empty()) {
            if (!text_block_started) {
                events.push_back(json {
                    {"event", "response.output_item.added"},
                    {"data", json {
                        {"type", "response.output_item.added"},
                        {"item", json {
                            {"content", json::array()},
                            {"id",      oai_resp_message_id},
                            {"role",    "assistant"},
                            {"status",  "in_progress"},
                            {"type",    "message"},
                        }},
                    }},
                });
                events.push_back(json {
                    {"event", "response.content_part.added"},
                    {"data", json {
                        {"type",    "response.content_part.added"},
                        {"item_id", oai_resp_message_id},
                        {"part", json {
                            {"type", "output_text"},
                            {"text", ""},
                        }},
                    }},
                });
                text_block_started = true;
            }
            events.push_back(json {
                {"event", "response.output_text.delta"},
                {"data", json {
                    {"type",    "response.output_text.delta"},
                    {"item_id", oai_resp_message_id},
                    {"delta",   diff.content_delta},
                }},
            });
        }

        if (!diff.tool_call_delta.name.empty()) {
            events.push_back(json {
                {"event", "response.output_item.added"},
                {"data", json {
                    {"type",  "response.output_item.added"},
                    {"item", json {
                        {"arguments", ""},
                        {"call_id",   "fc_" + diff.tool_call_delta.id},
                        {"name",      diff.tool_call_delta.name},
                        {"type",      "function_call"},
                        {"status",    "in_progress"},
                    }},
                }},
            });
            oai_resp_fc_id = diff.tool_call_delta.id;
        }

        if (!diff.tool_call_delta.arguments.empty()) {
            events.push_back(json {
                {"event", "response.function_call_arguments.delta"},
                {"data", json {
                    {"type",    "response.function_call_arguments.delta"},
                    {"delta",   diff.tool_call_delta.arguments},
                    {"item_id", "fc_" + oai_resp_fc_id},
                }},
            });
        }
    }
    return events;
}
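
// Illustrative event order for the Responses-API style stream built above (a sketch; values
// are made up):
//
//   event: response.created            data: {"type":"response.created","response":{"id":"resp_...","object":"response","status":"in_progress"}}
//   event: response.in_progress        data: {"type":"response.in_progress","response":{...}}
//   event: response.output_item.added  data: {"type":"response.output_item.added","item":{"type":"message","status":"in_progress",...}}
//   event: response.content_part.added data: {"type":"response.content_part.added","part":{"type":"output_text","text":""},...}
//   event: response.output_text.delta  data: {"type":"response.output_text.delta","delta":"Hello",...}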

json server_task_result_cmpl_partial::to_json_anthropic() {
    json events = json::array();
    bool first = (n_decoded == 1);
    // block open/close state is tracked across streaming calls via the member copies
    // thinking_block_started / text_block_started (set from task_result_state in update())

    if (first) {
        events.push_back({
            {"event", "message_start"},
            {"data", {
                {"type", "message_start"},
                {"message", {
                    {"id", oaicompat_cmpl_id},
                    {"type", "message"},
                    {"role", "assistant"},
                    {"content", json::array()},
                    {"model", oaicompat_model},
                    {"stop_reason", nullptr},
                    {"stop_sequence", nullptr},
                    {"usage", {
                        {"input_tokens", n_prompt_tokens},
                        {"output_tokens", 0}
                    }}
                }}
            }}
        });
    }

    // content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
    size_t thinking_block_index = 0;
    // use anthropic_has_reasoning (set in update()) to know if ANY reasoning was generated
    size_t text_block_index     = anthropic_has_reasoning ? 1 : 0;

    // use local copies of the streaming state (copied from task_result_state in update());
    // these reflect the state BEFORE this chunk was processed
    bool thinking_started = thinking_block_started;
    bool text_started     = text_block_started;

    for (const auto & diff : oaicompat_msg_diffs) {
        // handle thinking/reasoning content
        if (!diff.reasoning_content_delta.empty()) {
            if (!thinking_started) {
                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
                        {"index", thinking_block_index},
                        {"content_block", {
                            {"type", "thinking"},
                            {"thinking", ""}
                        }}
                    }}
                });
                thinking_started = true;
            }

            events.push_back({
                {"event", "content_block_delta"},
                {"data", {
                    {"type", "content_block_delta"},
                    {"index", thinking_block_index},
                    {"delta", {
                        {"type", "thinking_delta"},
                        {"thinking", diff.reasoning_content_delta}
                    }}
                }}
            });
        }

        // handle regular text content
        if (!diff.content_delta.empty()) {
            if (!text_started) {
                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
                        {"index", text_block_index},
                        {"content_block", {
                            {"type", "text"},
                            {"text", ""}
                        }}
                    }}
                });
                text_started = true;
            }

            events.push_back({
                {"event", "content_block_delta"},
                {"data", {
                    {"type", "content_block_delta"},
                    {"index", text_block_index},
                    {"delta", {
                        {"type", "text_delta"},
                        {"text", diff.content_delta}
                    }}
                }}
            });
        }

        // handle tool calls
        if (diff.tool_call_index != std::string::npos) {
            // use anthropic_has_reasoning for the thinking block count (persists across calls)
            size_t content_block_index = (anthropic_has_reasoning ? 1 : 0) + (text_started ? 1 : 0) + diff.tool_call_index;

            if (!diff.tool_call_delta.name.empty()) {
                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
                        {"index", content_block_index},
                        {"content_block", {
                            {"type", "tool_use"},
                            {"id", diff.tool_call_delta.id},
                            {"name", diff.tool_call_delta.name}
                        }}
                    }}
                });
            }

            if (!diff.tool_call_delta.arguments.empty()) {
                events.push_back({
                    {"event", "content_block_delta"},
                    {"data", {
                        {"type", "content_block_delta"},
                        {"index", content_block_index},
                        {"delta", {
                            {"type", "input_json_delta"},
                            {"partial_json", diff.tool_call_delta.arguments}
                        }}
                    }}
                });
            }
        }
    }

    return events;
}
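
// Illustrative chunk pair for the first text delta of an Anthropic-style stream (values are
// made up); the text block takes index 1 instead of 0 when reasoning content was emitted first:
//
//   event: content_block_start  data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}
//   event: content_block_delta  data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hello"}}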

//
// server_task_result_embd
//
json server_task_result_embd::to_json() {
    return res_type == TASK_RESPONSE_TYPE_OAI_EMBD
        ? to_json_oaicompat()
        : to_json_non_oaicompat();
}

json server_task_result_embd::to_json_non_oaicompat() {
    return json {
        {"index",     index},
        {"embedding", embedding},
    };
}

json server_task_result_embd::to_json_oaicompat() {
    return json {
        {"index",            index},
        {"embedding",        embedding[0]},
        {"tokens_evaluated", n_tokens},
    };
}
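
// Illustrative outputs (values are made up). The OAI-compatible shape carries a single
// vector (embedding[0]), while the non-OAI shape can carry several (presumably one per
// token when pooling is disabled):
//
//   non-OAI: {"index":0,"embedding":[[0.12,-0.03,...]]}
//   OAI:     {"index":0,"embedding":[0.12,-0.03,...],"tokens_evaluated":8}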

//
// server_task_result_rerank
//
json server_task_result_rerank::to_json() {
    return json {
        {"index",            index},
        {"score",            score},
        {"tokens_evaluated", n_tokens},
    };
}

//
// server_task_result_error
//
json server_task_result_error::to_json() {
    json res = format_error_response(err_msg, err_type);
    if (err_type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) {
        res["n_prompt_tokens"] = n_prompt_tokens;
        res["n_ctx"]           = n_ctx;
    }
    return res;
}

//
// server_task_result_metrics
//
json server_task_result_metrics::to_json() {
    return json {
        { "idle",                            n_idle_slots },
        { "processing",                      n_processing_slots },
        { "deferred",                        n_tasks_deferred },
        { "t_start",                         t_start },

        { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
        { "t_tokens_generation_total",       t_tokens_generation_total },
        { "n_tokens_predicted_total",        n_tokens_predicted_total },
        { "t_prompt_processing_total",       t_prompt_processing_total },

        { "n_tokens_max",                    n_tokens_max },

        { "n_prompt_tokens_processed",       n_prompt_tokens_processed },
        { "t_prompt_processing",             t_prompt_processing },
        { "n_tokens_predicted",              n_tokens_predicted },
        { "t_tokens_generation",             t_tokens_generation },

        { "n_decode_total",                  n_decode_total },
        { "n_busy_slots_total",              n_busy_slots_total },

        { "slots",                           slots_data },
    };
}
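
// A consumer can derive throughput from these counters, e.g. (assuming the t_* fields
// accumulate milliseconds, as the *_ms naming used elsewhere in the server suggests):
//
//   prompt tok/s     = 1e3 * n_prompt_tokens_processed / t_prompt_processing
//   generation tok/s = 1e3 * n_tokens_predicted        / t_tokens_generation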

//
// server_task_result_slot_save_load
//
json server_task_result_slot_save_load::to_json() {
    if (is_save) {
        return json {
            { "id_slot",   id_slot },
            { "filename",  filename },
            { "n_saved",   n_tokens },
            { "n_written", n_bytes },
            { "timings", {
                { "save_ms", t_ms }
            }},
        };
    }

    return json {
        { "id_slot",    id_slot },
        { "filename",   filename },
        { "n_restored", n_tokens },
        { "n_read",     n_bytes },
        { "timings", {
            { "restore_ms", t_ms }
        }},
    };
}
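
// Illustrative responses (values are made up):
//
//   save:    {"id_slot":0,"filename":"slot_0.bin","n_saved":512,"n_written":8388608,"timings":{"save_ms":42.0}}
//   restore: {"id_slot":0,"filename":"slot_0.bin","n_restored":512,"n_read":8388608,"timings":{"restore_ms":17.5}}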

//
// server_task_result_slot_erase
//
json server_task_result_slot_erase::to_json() {
    return json {
        { "id_slot",  id_slot },
        { "n_erased", n_erased },
    };
}

//
// server_task_result_get_lora
//

json server_task_result_get_lora::to_json() {
    json result = json::array();
    for (size_t i = 0; i < loras.size(); ++i) {
        auto & lora = loras[i];
        json entry = {
            {"id",            i},
            {"path",          lora.info.path},
            {"scale",         lora.info.scale},
            {"task_name",     lora.info.task_name},
            {"prompt_prefix", lora.info.prompt_prefix},
        };
        if (!lora.alora_invocation_tokens.empty()) {
            entry["alora_invocation_string"] = lora.alora_invocation_string;
            entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
        }
        result.push_back(std::move(entry));
    }
    return result;
}
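
// Illustrative result (values are made up); the alora_* fields appear only for adapters
// that define invocation tokens:
//
//   [
//     {"id":0,"path":"adapters/style.gguf","scale":0.8,"task_name":"","prompt_prefix":""},
//     {"id":1,"path":"adapters/alora.gguf","scale":1.0,"task_name":"","prompt_prefix":"",
//      "alora_invocation_string":"<alora>","alora_invocation_tokens":[1,2,3]}
//   ]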

//
// server_task_result_apply_lora
//

json server_task_result_apply_lora::to_json() {
    return json {{ "success", true }};
}

//
// server_prompt_cache
//
size_t server_prompt_cache::size() const {
    size_t res = 0;

    for (const auto & state : states) {
        res += state.size();
    }

    return res;
}

size_t server_prompt_cache::n_tokens() const {
    size_t res = 0;

    for (const auto & state : states) {
        res += state.n_tokens();
    }

    return res;
}

server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t state_size) {
    // first check if the current prompt is already fully contained in the cache
    for (auto it = states.begin(); it != states.end(); ++it) {
        const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens);

        if (cur_lcp_len == (int) prompt.tokens.size()) {
            SRV_WRN("%s", " - prompt is already in the cache, skipping\n");
            return nullptr;
        }
    }

    // next, remove any cached prompts that are fully contained in the current prompt
    for (auto it = states.begin(); it != states.end();) {
        const int len = it->tokens.get_common_prefix(prompt.tokens);

        if (len == (int) it->tokens.size()) {
            SRV_WRN(" - removing obsolete cached prompt with length %d\n", len);

            it = states.erase(it);
        } else {
            ++it;
        }
    }

    std::vector<uint8_t> state_data;

    // check if we can allocate enough memory for the new state
    try {
        state_data.resize(state_size);
    } catch (const std::bad_alloc & e) {
        SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what());

        limit_size = std::max<size_t>(1, 0.4*size());

        SRV_WRN(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0));

        update();

        return nullptr;
    }

    // TODO: for some reason we can't copy server_tokens, so we have to do this workaround
    auto & cur = states.emplace_back();
    cur = {
        /*.tokens      =*/ server_tokens(prompt.tokens.get_text_tokens(), false),
        /*.data        =*/ std::move(state_data),
        /*.checkpoints =*/ prompt.checkpoints,
    };

    return &cur;
}
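
// Typical caller flow (a sketch, not code from this file): the caller sizes the sequence
// state, reserves a cache entry via alloc(), then serializes the state into entry->data,
// mirroring the llama_state_seq_set_data_ext() call used in load() below. Exact signatures
// may differ:
//
//   const size_t state_size = llama_state_seq_get_size(ctx, id_slot);
//   if (server_prompt * entry = cache.alloc(prompt, state_size)) {
//       llama_state_seq_get_data_ext(ctx, entry->data.data(), state_size, id_slot, 0);
//   }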

bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) {
    const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);

    float f_keep_best = float(lcp_best) / prompt.tokens.size();
    float sim_best    = float(lcp_best) / tokens_new.size();

    SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);

    auto it_best = states.end();

    // find the most similar cached prompt that would also preserve the most context
    for (auto it = states.begin(); it != states.end(); ++it) {
        const int lcp_cur = it->tokens.get_common_prefix(tokens_new);

        const float f_keep_cur = float(lcp_cur) / it->tokens.size();
        const float sim_cur    = float(lcp_cur) / tokens_new.size();

        // don't trash large prompts
        if (f_keep_cur < 0.25f) {
            continue;
        }

        if (f_keep_best < f_keep_cur && sim_best < sim_cur) {
            f_keep_best = f_keep_cur;
            sim_best    = sim_cur;

            it_best = it;
        }
    }

    if (it_best != states.end()) {
        SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);

        const size_t size = it_best->data.size();
        const size_t n = llama_state_seq_set_data_ext(ctx, it_best->data.data(), size, id_slot, 0);
        if (n != size) {
            SRV_WRN("failed to restore state with size %zu\n", size);

            return false;
        }

        it_best->data.clear();
        it_best->data.shrink_to_fit();

        prompt = std::move(*it_best);

        states.erase(it_best);
    }

    return true;
}
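
// Worked example of the selection heuristic above (numbers are made up): the slot currently
// holds 1000 tokens sharing a 200-token prefix with a new 1200-token request, so the baseline
// is f_keep = 200/1000 = 0.200 and sim = 200/1200 = 0.167. A cached prompt of 1000 tokens
// sharing a 900-token prefix scores f_keep = 0.900 and sim = 0.750; both beat the baseline
// and f_keep >= 0.25, so that entry is restored and removed from the cache.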

void server_prompt_cache::update() {
    if (limit_size > 0) {
        // always keep at least one state, regardless of the limits
        while (states.size() > 1 && size() > limit_size) {
            if (states.empty()) {
                break;
            }

            SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));

            states.pop_front();
        }
    }

    // average size per token
    const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));

    // dynamically increase the token limit if it can fit in the memory limit
    const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;

    if (limit_tokens > 0) {
        while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
            if (states.empty()) {
                break;
            }

            SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
                    limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));

            states.pop_front();
        }
    }

    SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
            states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);

    for (const auto & state : states) {
        SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
                (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
    }
}
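
// Worked example of the limit estimate above (numbers are made up): with limit_size = 2 GiB
// and a cache holding 1.5 GiB across 6144 tokens, size_per_token = 262144 bytes, so
// limit_tokens_cur = max(limit_tokens, 2 GiB / 262144) = max(limit_tokens, 8192); the token
// limit is effectively raised while the estimated footprint still fits in the size limit.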