#include "server-common.h"
#include "server-task.h"

#include "common.h"
#include "llama.h"
#include "chat.h"
#include "sampling.h"
#include "speculative.h"
#include "json-schema-to-grammar.h"

using json = nlohmann::ordered_json;

//
// task_params
//

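// Serializes the logit biases as an array of {bias, token} objects,
// e.g. (illustrative values): [{"bias": -1.5, "token": 1234}, {"bias": 2.0, "token": 42}]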
json task_params::format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const {
    json data = json::array();
    for (const auto & lb : logit_bias) {
        data.push_back(json{
            {"bias", lb.bias},
            {"token", lb.token},
        });
    }
    return data;
}

json task_params::to_json(bool only_metrics) const {
    std::vector<std::string> samplers;
    samplers.reserve(sampling.samplers.size());
    for (const auto & sampler : sampling.samplers) {
        samplers.emplace_back(common_sampler_type_to_str(sampler));
    }

    json lora = json::array();
    for (auto & it : this->lora) {
        lora.push_back({{"id", it.first}, {"scale", it.second}});
    }

    if (only_metrics) {
        return json {
            {"seed", sampling.seed},
            {"temperature", sampling.temp},
            {"dynatemp_range", sampling.dynatemp_range},
            {"dynatemp_exponent", sampling.dynatemp_exponent},
            {"top_k", sampling.top_k},
            {"top_p", sampling.top_p},
            {"min_p", sampling.min_p},
            {"top_n_sigma", sampling.top_n_sigma},
            {"xtc_probability", sampling.xtc_probability},
            {"xtc_threshold", sampling.xtc_threshold},
            {"typical_p", sampling.typ_p},
            {"repeat_last_n", sampling.penalty_last_n},
            {"repeat_penalty", sampling.penalty_repeat},
            {"presence_penalty", sampling.penalty_present},
            {"frequency_penalty", sampling.penalty_freq},
            {"dry_multiplier", sampling.dry_multiplier},
            {"dry_base", sampling.dry_base},
            {"dry_allowed_length", sampling.dry_allowed_length},
            {"dry_penalty_last_n", sampling.dry_penalty_last_n},
            {"mirostat", sampling.mirostat},
            {"mirostat_tau", sampling.mirostat_tau},
            {"mirostat_eta", sampling.mirostat_eta},
            {"max_tokens", n_predict},
            {"n_predict", n_predict}, // TODO: deduplicate?
            {"n_keep", n_keep},
            {"n_discard", n_discard},
            {"ignore_eos", sampling.ignore_eos},
            {"stream", stream},
            {"n_probs", sampling.n_probs},
            {"min_keep", sampling.min_keep},
            {"chat_format", common_chat_format_name(chat_parser_params.format)},
            {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
            {"reasoning_in_content", chat_parser_params.reasoning_in_content},
            {"thinking_forced_open", chat_parser_params.thinking_forced_open},
            {"samplers", samplers},
            {"speculative.n_max", speculative.n_max},
            {"speculative.n_min", speculative.n_min},
            {"speculative.p_min", speculative.p_min},
            {"speculative.type", common_speculative_type_to_str(speculative.type)},
            {"speculative.ngram_size_n", speculative.ngram_size_n},
            {"speculative.ngram_size_m", speculative.ngram_size_m},
            {"speculative.ngram_m_hits", speculative.ngram_min_hits},
            {"timings_per_token", timings_per_token},
            {"post_sampling_probs", post_sampling_probs},
            {"backend_sampling", sampling.backend_sampling},
            {"lora", lora},
        };
    }

    auto grammar_triggers = json::array();
    for (const auto & trigger : sampling.grammar_triggers) {
        server_grammar_trigger ct(trigger);
        grammar_triggers.push_back(ct.to_json());
    }

    return json {
        {"seed", sampling.seed},
        {"temperature", sampling.temp},
        {"dynatemp_range", sampling.dynatemp_range},
        {"dynatemp_exponent", sampling.dynatemp_exponent},
        {"top_k", sampling.top_k},
        {"top_p", sampling.top_p},
        {"min_p", sampling.min_p},
        {"top_n_sigma", sampling.top_n_sigma},
        {"xtc_probability", sampling.xtc_probability},
        {"xtc_threshold", sampling.xtc_threshold},
        {"typical_p", sampling.typ_p},
        {"repeat_last_n", sampling.penalty_last_n},
        {"repeat_penalty", sampling.penalty_repeat},
        {"presence_penalty", sampling.penalty_present},
        {"frequency_penalty", sampling.penalty_freq},
        {"dry_multiplier", sampling.dry_multiplier},
        {"dry_base", sampling.dry_base},
        {"dry_allowed_length", sampling.dry_allowed_length},
        {"dry_penalty_last_n", sampling.dry_penalty_last_n},
        {"dry_sequence_breakers", sampling.dry_sequence_breakers},
        {"mirostat", sampling.mirostat},
        {"mirostat_tau", sampling.mirostat_tau},
        {"mirostat_eta", sampling.mirostat_eta},
        {"stop", antiprompt},
        {"max_tokens", n_predict},
        {"n_predict", n_predict}, // TODO: deduplicate?
        {"n_keep", n_keep},
        {"n_discard", n_discard},
        {"ignore_eos", sampling.ignore_eos},
        {"stream", stream},
        {"logit_bias", format_logit_bias(sampling.logit_bias)},
        {"n_probs", sampling.n_probs},
        {"min_keep", sampling.min_keep},
        {"grammar", sampling.grammar},
        {"grammar_lazy", sampling.grammar_lazy},
        {"grammar_triggers", grammar_triggers},
        {"preserved_tokens", sampling.preserved_tokens},
        {"chat_format", common_chat_format_name(chat_parser_params.format)},
        {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
        {"reasoning_in_content", chat_parser_params.reasoning_in_content},
        {"thinking_forced_open", chat_parser_params.thinking_forced_open},
        {"samplers", samplers},
        {"speculative.n_max", speculative.n_max},
        {"speculative.n_min", speculative.n_min},
        {"speculative.p_min", speculative.p_min},
        {"speculative.type", common_speculative_type_to_str(speculative.type)},
        {"speculative.ngram_size_n", speculative.ngram_size_n},
        {"speculative.ngram_size_m", speculative.ngram_size_m},
        {"speculative.ngram_m_hits", speculative.ngram_min_hits},
        {"timings_per_token", timings_per_token},
        {"post_sampling_probs", post_sampling_probs},
        {"backend_sampling", sampling.backend_sampling},
        {"lora", lora},
    };
}

//
// task_result_state
//
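// Appends the newly generated text, re-parses the accumulated message and reports the
// difference to the previously parsed message via `diffs` (used for streaming updates).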
common_chat_msg task_result_state::update_chat_msg(
        const std::string & text_added,
        bool is_partial,
        std::vector<common_chat_msg_diff> & diffs) {
    generated_text += text_added;
    auto msg_prv_copy = chat_msg;
    SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
    auto new_msg = common_chat_parse(
        generated_text,
        is_partial,
        chat_parser_params);
    if (!new_msg.empty()) {
        new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
        chat_msg = new_msg;
        diffs = common_chat_msg_diff::compute_diffs(msg_prv_copy, new_msg);
    }
    return chat_msg;
}

//
// server_task
//

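// Builds task_params from a completion-style JSON request body, e.g. (illustrative):
//   {"prompt": "...", "n_predict": 128, "temperature": 0.8, "stop": ["\n"], "stream": true}
// Fields that are not present fall back to the server-wide defaults from params_base.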
task_params server_task::params_from_json_cmpl(
        const llama_vocab * vocab,
        const common_params & params_base,
        const int n_ctx_slot,
        const json & data) {
    task_params params;

    // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
    task_params defaults;
    defaults.sampling = params_base.sampling;
    defaults.speculative = params_base.speculative;
    defaults.n_keep = params_base.n_keep;
    defaults.n_predict = params_base.n_predict;
    defaults.n_cache_reuse = params_base.n_cache_reuse;
    defaults.cache_prompt = params_base.cache_prompt;
    defaults.antiprompt = params_base.antiprompt;

    // enabling this will output extra debug information in the HTTP responses from the server
    params.verbose = params_base.verbosity > 9;
    params.timings_per_token = json_value(data, "timings_per_token", false);

    params.stream = json_value(data, "stream", false);
    auto stream_opt = json_value(data, "stream_options", json::object());
    params.include_usage = json_value(stream_opt, "include_usage", false);
    params.cache_prompt = json_value(data, "cache_prompt", defaults.cache_prompt);
    params.return_tokens = json_value(data, "return_tokens", false);
    params.return_progress = json_value(data, "return_progress", false);
    params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
    params.n_indent = json_value(data, "n_indent", defaults.n_indent);
    params.n_keep = json_value(data, "n_keep", defaults.n_keep);
    params.n_discard = json_value(data, "n_discard", defaults.n_discard);
    params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1));
    params.n_cache_reuse = json_value(data, "n_cache_reuse", defaults.n_cache_reuse);
    //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
    params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
    params.response_fields = json_value(data, "response_fields", std::vector<std::string>());

    params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
    params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
    params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
    params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma);
    params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
    params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
    params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
    params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp);
    params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range);
    params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent);
    params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n);
    params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat);
    params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq);
    params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present);
    params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier);
    params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base);
    params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
    params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
    params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
    params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
    params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
    params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target);
    params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay);
    params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
    params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
    params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
    params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling);
    params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs);

    params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
    params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
    params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

    params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
    params.speculative.n_min = std::max(params.speculative.n_min, 0);
    params.speculative.n_max = std::max(params.speculative.n_max, 0);

    params.speculative.type = common_speculative_type_from_name(json_value(data, "speculative.type", common_speculative_type_to_str(defaults.speculative.type)));

    params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n);
    params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m);
    params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits);

    // clamp the n-gram sizes and the minimum number of hits to [1, 1024]
    params.speculative.ngram_size_n = std::min(std::max(1, (int) params.speculative.ngram_size_n), 1024);
    params.speculative.ngram_size_m = std::min(std::max(1, (int) params.speculative.ngram_size_m), 1024);
    params.speculative.ngram_min_hits = std::min(std::max(1, (int) params.speculative.ngram_min_hits), 1024);

    // Use OpenAI API logprobs only if n_probs wasn't provided
    if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs) {
        params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
    }

    if (data.contains("lora")) {
        if (data.at("lora").is_array()) {
            params.lora = parse_lora_request(data.at("lora"));
        } else {
            throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
        }
    } else {
        params.lora = {};
    }

    // TODO: add more sanity checks for the input parameters

    if (params.sampling.penalty_last_n < -1) {
        throw std::runtime_error("Error: repeat_last_n must be >= -1");
    }

    if (params.sampling.dry_penalty_last_n < -1) {
        throw std::runtime_error("Error: dry_penalty_last_n must be >= -1");
    }

    if (params.sampling.penalty_last_n == -1) {
        // note: should be the slot's context and not the full context, but it's ok
        params.sampling.penalty_last_n = n_ctx_slot;
    }

    if (params.sampling.dry_penalty_last_n == -1) {
        params.sampling.dry_penalty_last_n = n_ctx_slot;
    }

    if (params.sampling.dry_base < 1.0f) {
        params.sampling.dry_base = defaults.sampling.dry_base;
    }

    // sequence breakers for DRY
    {
        // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format
        // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39

        if (data.contains("dry_sequence_breakers")) {
            params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
            if (params.sampling.dry_sequence_breakers.empty()) {
                throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings");
            }
        }
    }

    // process "json_schema" and "grammar"
    if (data.contains("json_schema") && !data.contains("grammar")) {
        try {
            auto schema = json_value(data, "json_schema", json::object());
            SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
            params.sampling.grammar = json_schema_to_grammar(schema);
            SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
        } catch (const std::exception & e) {
            throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
        }
    } else {
        params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
        SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
        params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
        SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
    }

    {
        auto it = data.find("chat_format");
        if (it != data.end()) {
            params.chat_parser_params.format = static_cast<common_chat_format>(it->get<int>());
            SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format));
        } else {
            params.chat_parser_params.format = defaults.chat_parser_params.format;
        }
        common_reasoning_format reasoning_format = params_base.reasoning_format;
        if (data.contains("reasoning_format")) {
            reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
        }
        params.chat_parser_params.reasoning_format = reasoning_format;
        params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
        params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
        params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
        if (data.contains("chat_parser")) {
            params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
        }
    }

    {
        const auto preserved_tokens = data.find("preserved_tokens");
        if (preserved_tokens != data.end()) {
            for (const auto & t : *preserved_tokens) {
                auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
                if (ids.size() == 1) {
                    SRV_DBG("Preserved token: %d\n", ids[0]);
                    params.sampling.preserved_tokens.insert(ids[0]);
                } else {
                    // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
                    SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
                }
            }
        }
        const auto grammar_triggers = data.find("grammar_triggers");
        if (grammar_triggers != data.end()) {
            for (const auto & t : *grammar_triggers) {
                server_grammar_trigger ct(t);
                if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
                    const auto & word = ct.value.value;
                    auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
                    if (ids.size() == 1) {
                        auto token = ids[0];
                        if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
                            throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
                        }
                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
                        common_grammar_trigger trigger;
                        trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
                        trigger.value = word;
                        trigger.token = token;
                        params.sampling.grammar_triggers.push_back(std::move(trigger));
                    } else {
                        SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
                        params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
                    }
                } else {
                    if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) {
                        SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str());
                    } else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) {
                        SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str());
                    } else {
                        throw std::runtime_error("Unknown grammar trigger type");
                    }
                    params.sampling.grammar_triggers.emplace_back(std::move(ct.value));
                }
            }
        }
        if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
            throw std::runtime_error("Error: no triggers set for lazy grammar!");
        }
    }

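    // "logit_bias" accepts either an array of pairs or an object map, e.g. (illustrative):
    //   [[15043, 1.0], ["Hello", -0.5], [42, false]]   or   {"15043": 1.0, "Hello": false}
    // where a boolean false bans the token entirely (bias of -inf)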
    {
        params.sampling.logit_bias.clear();

        const auto & logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array()) {
            const int n_vocab = llama_vocab_n_tokens(vocab);
            for (const auto & el : *logit_bias) {
                // TODO: we may want to throw errors here, in case "el" is incorrect
                if (el.is_array() && el.size() == 2) {
                    float bias;
                    if (el[1].is_number()) {
                        bias = el[1].get<float>();
                    } else if (el[1].is_boolean() && !el[1].get<bool>()) {
                        bias = -INFINITY;
                    } else {
                        continue;
                    }

                    if (el[0].is_number_integer()) {
                        llama_token tok = el[0].get<llama_token>();
                        if (tok >= 0 && tok < n_vocab) {
                            params.sampling.logit_bias.push_back({tok, bias});
                        }
                    } else if (el[0].is_string()) {
                        auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
                        for (auto tok : toks) {
                            params.sampling.logit_bias.push_back({tok, bias});
                        }
                    }
                }
            }
        } else if (logit_bias != data.end() && logit_bias->is_object()) {
            const int n_vocab = llama_vocab_n_tokens(vocab);
            for (const auto & el : logit_bias->items()) {
                float bias;
                const auto & key = el.key();
                const auto & value = el.value();
                if (value.is_number()) {
                    bias = value.get<float>();
                } else if (value.is_boolean() && !value.get<bool>()) {
                    bias = -INFINITY;
                } else {
                    continue;
                }

                char * end;
                llama_token tok = strtol(key.c_str(), &end, 10);
                if (*end == 0) {
                    if (tok >= 0 && tok < n_vocab) {
                        params.sampling.logit_bias.push_back({tok, bias});
                    }
                } else {
                    auto toks = common_tokenize(vocab, key, false);
                    for (auto tok : toks) {
                        params.sampling.logit_bias.push_back({tok, bias});
                    }
                }
            }
        }

        params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos);
        if (params.sampling.ignore_eos) {
            params.sampling.logit_bias.insert(
                    params.sampling.logit_bias.end(),
                    defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
        }
    }

    {
        params.antiprompt.clear();

        const auto & stop = data.find("stop");
        if (stop != data.end() && stop->is_array()) {
            for (const auto & word : *stop) {
                if (!word.empty()) {
                    params.antiprompt.push_back(word);
                }
            }
        }
        // set reverse prompt from cli args if not set in the request
        if (params.antiprompt.empty()) {
            params.antiprompt = defaults.antiprompt;
        }
    }

    {
        const auto samplers = data.find("samplers");
        if (samplers != data.end()) {
            if (samplers->is_array()) {
                params.sampling.samplers = common_sampler_types_from_names(*samplers, false);
            } else if (samplers->is_string()) {
                params.sampling.samplers = common_sampler_types_from_chars(samplers->get<std::string>());
            }
        } else {
            params.sampling.samplers = defaults.sampling.samplers;
        }
    }

    if (params.n_cmpl > params_base.n_parallel) {
        throw std::runtime_error("n_cmpl cannot be greater than the number of slots, please increase -np");
    }

    return params;
}

//
// result_timings
//

json result_timings::to_json() const {
    json base = {
        {"cache_n", cache_n},

        {"prompt_n", prompt_n},
        {"prompt_ms", prompt_ms},
        {"prompt_per_token_ms", prompt_per_token_ms},
        {"prompt_per_second", prompt_per_second},

        {"predicted_n", predicted_n},
        {"predicted_ms", predicted_ms},
        {"predicted_per_token_ms", predicted_per_token_ms},
        {"predicted_per_second", predicted_per_second},
    };

    if (draft_n > 0) {
        base["draft_n"] = draft_n;
        base["draft_n_accepted"] = draft_n_accepted;
    }

    return base;
}

//
// result_prompt_progress
//
json result_prompt_progress::to_json() const {
    return json {
        {"total", total},
        {"cache", cache},
        {"processed", processed},
        {"time_ms", time_ms},
    };
}

static inline std::string stop_type_to_str(stop_type type) {
    switch (type) {
        case STOP_TYPE_EOS: return "eos";
        case STOP_TYPE_WORD: return "word";
        case STOP_TYPE_LIMIT: return "limit";
        default: return "none";
    }
}

//
// completion_token_output
//

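// Serializes the per-token top probabilities. Example element (illustrative values):
//   {"id": 42, "token": "he", "bytes": [104, 101], "logprob": -0.12}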
json completion_token_output::to_json(bool post_sampling_probs) const {
    json probs_for_token = json::array();
    for (const auto & p : probs) {
        std::string txt(p.txt);
        txt.resize(validate_utf8(txt));
        probs_for_token.push_back(json {
            {"id", p.tok},
            {"token", txt},
            {"bytes", str_to_bytes(p.txt)},
            {
                post_sampling_probs ? "prob" : "logprob",
                post_sampling_probs ? p.prob : logarithm(p.prob)
            },
        });
    }
    return probs_for_token;
}

json completion_token_output::probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs) {
    json out = json::array();
    for (const auto & p : probs) {
        std::string txt(p.text_to_send);
        txt.resize(validate_utf8(txt));
        out.push_back(json {
            {"id", p.tok},
            {"token", txt},
            {"bytes", str_to_bytes(p.text_to_send)},
            {
                post_sampling_probs ? "prob" : "logprob",
                post_sampling_probs ? p.prob : logarithm(p.prob)
            },
            {
                post_sampling_probs ? "top_probs" : "top_logprobs",
                p.to_json(post_sampling_probs)
            },
        });
    }
    return out;
}

float completion_token_output::logarithm(float x) {
    // nlohmann::json converts -inf to null, so we need to prevent that
    return x == 0.0f ? std::numeric_limits<float>::lowest() : std::log(x);
}

std::vector<unsigned char> completion_token_output::str_to_bytes(const std::string & str) {
    std::vector<unsigned char> bytes;
    for (unsigned char c : str) {
        bytes.push_back(c);
    }
    return bytes;
}

//
// server_task_result_cmpl_final
//
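// Dispatches on the requested response flavor; presumably NONE maps to the llama.cpp-native
// /completion endpoint, OAI_CMPL to /v1/completions, OAI_CHAT to /v1/chat/completions,
// OAI_RESP to /v1/responses and ANTHROPIC to /v1/messages (endpoint mapping assumed here).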
json server_task_result_cmpl_final::to_json() {
    GGML_ASSERT(is_updated && "update() must be called before to_json()");
    switch (res_type) {
        case TASK_RESPONSE_TYPE_NONE:
            return to_json_non_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CMPL:
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
        case TASK_RESPONSE_TYPE_OAI_RESP:
            return stream ? to_json_oaicompat_resp_stream() : to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return stream ? to_json_anthropic_stream() : to_json_anthropic();
        default:
            GGML_ASSERT(false && "Invalid task_response_type");
    }
}

json server_task_result_cmpl_final::to_json_non_oaicompat() {
    json res = json {
        {"index", index},
        {"content", content},
        {"tokens", tokens},
        {"id_slot", id_slot},
        {"stop", true},
        {"model", oaicompat_model},
        {"tokens_predicted", n_decoded},
        {"tokens_evaluated", n_prompt_tokens},
        {"generation_settings", generation_params.to_json()},
        {"prompt", prompt},
        {"has_new_line", has_new_line},
        {"truncated", truncated},
        {"stop_type", stop_type_to_str(stop)},
        {"stopping_word", stopping_word},
        {"tokens_cached", n_tokens_cached},
        {"timings", timings.to_json()},
    };
    if (!stream && !probs_output.empty()) {
        res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
    }
    return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
}

json server_task_result_cmpl_final::to_json_oaicompat() {
    std::time_t t = std::time(0);
    json logprobs = json(nullptr); // OAI default to null
    if (!stream && probs_output.size() > 0) {
        logprobs = json{
            {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
        };
    }
    json finish_reason = "length";
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
        finish_reason = "stop";
    }
    json res = json {
        {"choices", json::array({
            json{
                {"text", content},
                {"index", index},
                {"logprobs", logprobs},
                {"finish_reason", finish_reason},
            }
        })},
        {"created", t},
        {"model", oaicompat_model},
        {"system_fingerprint", build_info},
        {"object", "text_completion"},
        {"usage", json {
            {"completion_tokens", n_decoded},
            {"prompt_tokens", n_prompt_tokens},
            {"total_tokens", n_decoded + n_prompt_tokens}
        }},
        {"id", oaicompat_cmpl_id}
    };

    // extra fields for debugging purposes
    if (verbose) {
        res["__verbose"] = to_json_non_oaicompat();
    }
    if (timings.prompt_n >= 0) {
        res.push_back({"timings", timings.to_json()});
    }

    return res;
}

json server_task_result_cmpl_final::to_json_oaicompat_chat() {
    std::string finish_reason = "length";
    common_chat_msg msg;
    if (!oaicompat_msg.empty()) {
        msg = oaicompat_msg;
    } else {
        msg.role = "assistant";
        msg.content = content;
    }
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
        finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls";
    }

    json choice {
        {"finish_reason", finish_reason},
        {"index", index},
        {"message", msg.to_json_oaicompat()},
    };

    if (!stream && probs_output.size() > 0) {
        choice["logprobs"] = json{
            {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
        };
    }

    std::time_t t = std::time(0);

    json res = json {
        {"choices", json::array({choice})},
        {"created", t},
        {"model", oaicompat_model},
        {"system_fingerprint", build_info},
        {"object", "chat.completion"},
        {"usage", json {
            {"completion_tokens", n_decoded},
            {"prompt_tokens", n_prompt_tokens},
            {"total_tokens", n_decoded + n_prompt_tokens}
        }},
        {"id", oaicompat_cmpl_id}
    };

    // extra fields for debugging purposes
    if (verbose) {
        res["__verbose"] = to_json_non_oaicompat();
    }
    if (timings.prompt_n >= 0) {
        res.push_back({"timings", timings.to_json()});
    }

    return res;
}

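// Emits an array of OAI "chat.completion.chunk" objects: one per message diff, a final chunk
// carrying the finish_reason, and optionally a usage-only chunk. Example chunk (illustrative):
//   {"object": "chat.completion.chunk", "choices": [{"index": 0, "delta": {"content": "Hi"}, "finish_reason": null}], ...}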
json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
    std::time_t t = std::time(0);
    std::string finish_reason = "length";
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
        finish_reason = oaicompat_msg.tool_calls.empty() ? "stop" : "tool_calls";
    }

    json deltas = json::array();
    for (const auto & diff : oaicompat_msg_diffs) {
        deltas.push_back({
            {"choices", json::array({
                json {
                    {"finish_reason", nullptr},
                    {"index", 0},
                    {"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
                },
            })},
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
            {"system_fingerprint", build_info},
            {"object", "chat.completion.chunk"},
        });
    }

    deltas.push_back({
        {"choices", json::array({
            json {
                {"finish_reason", finish_reason},
                {"index", 0},
                {"delta", json::object()},
            },
        })},
        {"created", t},
        {"id", oaicompat_cmpl_id},
        {"model", oaicompat_model},
        {"system_fingerprint", build_info},
        {"object", "chat.completion.chunk"},
    });

    if (include_usage) {
        // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
        // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
        deltas.push_back({
            {"choices", json::array()},
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
            {"system_fingerprint", build_info},
            {"object", "chat.completion.chunk"},
            {"usage", json {
                {"completion_tokens", n_decoded},
                {"prompt_tokens", n_prompt_tokens},
                {"total_tokens", n_decoded + n_prompt_tokens},
            }},
        });
    }

    if (timings.prompt_n >= 0) {
        deltas.back().push_back({"timings", timings.to_json()});
    }

    // extra fields for debugging purposes
    if (verbose && !deltas.empty()) {
        deltas.front()["__verbose"] = to_json_non_oaicompat();
    }

    return deltas;
}

json server_task_result_cmpl_final::to_json_oaicompat_resp() {
    common_chat_msg msg;
    if (!oaicompat_msg.empty()) {
        msg = oaicompat_msg;
    } else {
        msg.role = "assistant";
        msg.content = content;
    }

    std::vector<json> output;

    if (msg.reasoning_content != "") {
        output.push_back(json {
            {"id", "rs_" + random_string()},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({ json {
                {"text", msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
            {"status", "completed"},
        });
    }

    if (msg.content != "") {
        output.push_back(json {
            {"content", json::array({ json {
                {"type", "output_text"},
                {"annotations", json::array()},
                {"logprobs", json::array()},
                {"text", msg.content},
            }})},
            {"id", "msg_" + random_string()},
            {"role", msg.role},
            {"status", "completed"},
            {"type", "message"},
        });
    }

    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
        output.push_back(json {
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name},
        });
    }

    std::time_t t = std::time(0);
    json res = {
        {"completed_at", t},
        {"created_at", t},
        {"id", oai_resp_id},
        {"model", oaicompat_model},
        {"object", "response"},
        {"output", output},
        {"status", "completed"},
        {"usage", json {
            {"input_tokens", n_prompt_tokens},
            {"output_tokens", n_decoded},
            {"total_tokens", n_decoded + n_prompt_tokens},
        }},
    };

    return res;
}

json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
    std::vector<json> server_sent_events;
    std::vector<json> output;

    if (oaicompat_msg.reasoning_content != "") {
        const json output_item = json {
            {"id", oai_resp_reasoning_id},
            {"summary", json::array()},
            {"type", "reasoning"},
            {"content", json::array({ json {
                {"text", oaicompat_msg.reasoning_content},
                {"type", "reasoning_text"},
            }})},
            {"encrypted_content", ""},
        };

        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }

    if (oaicompat_msg.content != "") {
        server_sent_events.push_back(json {
            {"event", "response.output_text.done"},
            {"data", json {
                {"type", "response.output_text.done"},
                {"item_id", oai_resp_message_id},
                {"text", oaicompat_msg.content}
            }}
        });

        const json content_part = {
            {"type", "output_text"},
            {"annotations", json::array()},
            {"logprobs", json::array()},
            {"text", oaicompat_msg.content}
        };

        server_sent_events.push_back(json {
            {"event", "response.content_part.done"},
            {"data", json {
                {"type", "response.content_part.done"},
                {"item_id", oai_resp_message_id},
                {"part", content_part}
            }}
        });

        const json output_item = {
            {"type", "message"},
            {"status", "completed"},
            {"id", oai_resp_message_id},
            {"content", json::array({content_part})},
            {"role", "assistant"}
        };

        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }

    for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) {
        const json output_item = {
            {"type", "function_call"},
            {"status", "completed"},
            {"arguments", tool_call.arguments},
            {"call_id", "fc_" + tool_call.id},
            {"name", tool_call.name}
        };
        server_sent_events.push_back(json {
            {"event", "response.output_item.done"},
            {"data", json {
                {"type", "response.output_item.done"},
                {"item", output_item}
            }}
        });
        output.push_back(output_item);
    }

    std::time_t t = std::time(0);
    server_sent_events.push_back(json {
        {"event", "response.completed"},
        {"data", json {
            {"type", "response.completed"},
            {"response", json {
                {"id", oai_resp_id},
                {"object", "response"},
                {"created_at", t},
                {"status", "completed"},
                {"model", oaicompat_model},
                {"output", output},
                {"usage", json {
                    {"input_tokens", n_prompt_tokens},
                    {"output_tokens", n_decoded},
                    {"total_tokens", n_decoded + n_prompt_tokens}
                }}
            }},
        }}
    });

    return server_sent_events;
}

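// Builds a non-streaming Anthropic Messages API response, e.g. (illustrative):
//   {"type": "message", "role": "assistant", "content": [{"type": "text", "text": "..."}],
//    "stop_reason": "end_turn", "usage": {"input_tokens": 10, "output_tokens": 20}}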
json server_task_result_cmpl_final::to_json_anthropic() {
    std::string stop_reason = "max_tokens";
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
        stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
    }

    json content_blocks = json::array();

    common_chat_msg msg;
    if (!oaicompat_msg.empty()) {
        msg = oaicompat_msg;
    } else {
        msg.role = "assistant";
        msg.content = content;
    }

    // thinking block comes first (Anthropic extended thinking format)
    if (!msg.reasoning_content.empty()) {
        content_blocks.push_back({
            {"type", "thinking"},
            {"thinking", msg.reasoning_content},
            {"signature", ""} // empty signature for local models (no cryptographic verification)
        });
    }

    if (!msg.content.empty()) {
        content_blocks.push_back({
            {"type", "text"},
            {"text", msg.content}
        });
    }

    for (const auto & tool_call : msg.tool_calls) {
        json tool_use_block = {
            {"type", "tool_use"},
            {"id", tool_call.id},
            {"name", tool_call.name}
        };

        try {
            tool_use_block["input"] = json::parse(tool_call.arguments);
        } catch (const std::exception &) {
            tool_use_block["input"] = json::object();
        }

        content_blocks.push_back(tool_use_block);
    }

    json res = {
        {"id", oaicompat_cmpl_id},
        {"type", "message"},
        {"role", "assistant"},
        {"content", content_blocks},
        {"model", oaicompat_model},
        {"stop_reason", stop_reason},
        {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)},
        {"usage", {
            {"input_tokens", n_prompt_tokens},
            {"output_tokens", n_decoded}
        }}
    };

    return res;
}

json server_task_result_cmpl_final::to_json_anthropic_stream() {
    json events = json::array();

    std::string stop_reason = "max_tokens";
    if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
        stop_reason = oaicompat_msg.tool_calls.empty() ? "end_turn" : "tool_use";
    }

    bool has_thinking = !oaicompat_msg.reasoning_content.empty();
    bool has_text = !oaicompat_msg.content.empty();
    size_t num_tool_calls = oaicompat_msg.tool_calls.size();

    // content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
    size_t thinking_block_index = 0;
    size_t text_block_index = has_thinking ? 1 : 0;

    bool thinking_block_started = false;
    bool text_block_started = false;
    std::unordered_set<size_t> tool_calls_started;

    for (const auto & diff : oaicompat_msg_diffs) {
        // handle thinking/reasoning content
        if (!diff.reasoning_content_delta.empty()) {
            if (!thinking_block_started) {
                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
                        {"index", thinking_block_index},
                        {"content_block", {
                            {"type", "thinking"},
                            {"thinking", ""}
                        }}
                    }}
                });
                thinking_block_started = true;
            }

            events.push_back({
                {"event", "content_block_delta"},
                {"data", {
                    {"type", "content_block_delta"},
                    {"index", thinking_block_index},
                    {"delta", {
                        {"type", "thinking_delta"},
                        {"thinking", diff.reasoning_content_delta}
                    }}
                }}
            });
        }

        // handle regular text content
        if (!diff.content_delta.empty()) {
            if (!text_block_started) {
                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
                        {"index", text_block_index},
                        {"content_block", {
                            {"type", "text"},
                            {"text", ""}
                        }}
                    }}
                });
                text_block_started = true;
            }

            events.push_back({
                {"event", "content_block_delta"},
                {"data", {
                    {"type", "content_block_delta"},
                    {"index", text_block_index},
                    {"delta", {
                        {"type", "text_delta"},
                        {"text", diff.content_delta}
                    }}
                }}
            });
        }

        // handle tool calls
        if (diff.tool_call_index != std::string::npos) {
            size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + diff.tool_call_index;

            if (tool_calls_started.find(diff.tool_call_index) == tool_calls_started.end()) {
                const auto & full_tool_call = oaicompat_msg.tool_calls[diff.tool_call_index];

                events.push_back({
                    {"event", "content_block_start"},
                    {"data", {
                        {"type", "content_block_start"},
                        {"index", content_block_index},
                        {"content_block", {
                            {"type", "tool_use"},
                            {"id", full_tool_call.id},
                            {"name", full_tool_call.name}
                        }}
                    }}
                });
                tool_calls_started.insert(diff.tool_call_index);
            }

            if (!diff.tool_call_delta.arguments.empty()) {
                events.push_back({
                    {"event", "content_block_delta"},
                    {"data", {
                        {"type", "content_block_delta"},
                        {"index", content_block_index},
                        {"delta", {
                            {"type", "input_json_delta"},
                            {"partial_json", diff.tool_call_delta.arguments}
                        }}
                    }}
                });
            }
        }
    }

    // close content blocks in order
    if (has_thinking) {
        // Anthropic API requires a signature_delta before closing thinking blocks
        // We use an empty signature since we can't generate a cryptographic signature for local models
        events.push_back({
            {"event", "content_block_delta"},
            {"data", {
                {"type", "content_block_delta"},
                {"index", thinking_block_index},
                {"delta", {
                    {"type", "signature_delta"},
                    {"signature", ""}
                }}
            }}
        });
        events.push_back({
            {"event", "content_block_stop"},
            {"data", {
                {"type", "content_block_stop"},
                {"index", thinking_block_index}
            }}
        });
    }

    if (has_text) {
        events.push_back({
            {"event", "content_block_stop"},
            {"data", {
                {"type", "content_block_stop"},
                {"index", text_block_index}
            }}
        });
    }

    for (size_t i = 0; i < num_tool_calls; i++) {
        size_t content_block_index = (has_thinking ? 1 : 0) + (has_text ? 1 : 0) + i;
        events.push_back({
            {"event", "content_block_stop"},
            {"data", {
                {"type", "content_block_stop"},
                {"index", content_block_index}
            }}
        });
    }

    events.push_back({
        {"event", "message_delta"},
        {"data", {
            {"type", "message_delta"},
            {"delta", {
                {"stop_reason", stop_reason},
                {"stop_sequence", stopping_word.empty() ? nullptr : json(stopping_word)}
            }},
            {"usage", {
                {"output_tokens", n_decoded}
            }}
        }}
    });

    events.push_back({
        {"event", "message_stop"},
        {"data", {
            {"type", "message_stop"}
        }}
    });

    return events;
}

//
// server_task_result_cmpl_partial
//
void server_task_result_cmpl_partial::update(task_result_state & state) {
    is_updated = true;
    state.update_chat_msg(content, true, oaicompat_msg_diffs);

    // Copy current state for use in to_json_*() (reflects state BEFORE this chunk)
    thinking_block_started = state.thinking_block_started;
    text_block_started = state.text_block_started;

    oai_resp_id = state.oai_resp_id;
    oai_resp_reasoning_id = state.oai_resp_reasoning_id;
    oai_resp_message_id = state.oai_resp_message_id;
    oai_resp_fc_id = state.oai_resp_fc_id;

    // track if the accumulated message has any reasoning content
    anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();

    // Pre-compute state updates based on diffs (for next chunk)
    for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
        if (!diff.reasoning_content_delta.empty() && !state.thinking_block_started) {
            state.thinking_block_started = true;
        }
        if (!diff.content_delta.empty() && !state.text_block_started) {
            state.text_block_started = true;
        }
        if (!diff.tool_call_delta.name.empty()) {
            state.oai_resp_fc_id = diff.tool_call_delta.id;
        }
    }
}

json server_task_result_cmpl_partial::to_json() {
    GGML_ASSERT(is_updated && "update() must be called before to_json()");
    switch (res_type) {
        case TASK_RESPONSE_TYPE_NONE:
            return to_json_non_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CMPL:
            return to_json_oaicompat();
        case TASK_RESPONSE_TYPE_OAI_CHAT:
            return to_json_oaicompat_chat();
        case TASK_RESPONSE_TYPE_OAI_RESP:
            return to_json_oaicompat_resp();
        case TASK_RESPONSE_TYPE_ANTHROPIC:
            return to_json_anthropic();
        default:
            GGML_ASSERT(false && "Invalid task_response_type");
    }
}

json server_task_result_cmpl_partial::to_json_non_oaicompat() {
    // non-OAI-compat JSON
    json res = json {
        {"index", index},
        {"content", content},
        {"tokens", tokens},
        {"stop", false},
        {"id_slot", id_slot},
        {"tokens_predicted", n_decoded},
        {"tokens_evaluated", n_prompt_tokens},
    };
    // populate the timings object when needed (usually for the last response or with timings_per_token enabled)
    if (timings.prompt_n > 0) {
        res.push_back({"timings", timings.to_json()});
    }
    if (is_progress) {
        res.push_back({"prompt_progress", progress.to_json()});
    }
    if (!prob_output.probs.empty()) {
        res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs);
    }
    return res;
}

json server_task_result_cmpl_partial::to_json_oaicompat() {
    std::time_t t = std::time(0);
    json logprobs = json(nullptr); // OAI default to null
    if (prob_output.probs.size() > 0) {
        logprobs = json{
            {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
        };
    }
    json res = json {
        {"choices", json::array({
            json{
                {"text", content},
                {"index", index},
                {"logprobs", logprobs},
                {"finish_reason", nullptr},
            }
        })},
        {"created", t},
        {"model", oaicompat_model},
        {"system_fingerprint", build_info},
        {"object", "text_completion"},
        {"id", oaicompat_cmpl_id}
    };

    // extra fields for debugging purposes
    if (verbose) {
        res["__verbose"] = to_json_non_oaicompat();
    }
    if (timings.prompt_n >= 0) {
        res.push_back({"timings", timings.to_json()});
    }
    if (is_progress) {
        res.push_back({"prompt_progress", progress.to_json()});
    }

    return res;
}

json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
    bool first = n_decoded == 1;
    std::time_t t = std::time(0);

    std::vector<json> deltas;
    auto add_delta = [&](const json & delta) {
        deltas.push_back({
            {"choices", json::array({
                json {
                    {"finish_reason", nullptr},
                    {"index", index},
                    {"delta", delta},
                },
            })},
            {"created", t},
            {"id", oaicompat_cmpl_id},
            {"model", oaicompat_model},
            {"system_fingerprint", build_info},
            {"object", "chat.completion.chunk"},
        });
    };
    // We have to send an initial update to conform to openai behavior
    if (first || is_progress) {
        add_delta({
            {"role", "assistant"},
            {"content", nullptr},
        });
    }

    for (const auto & diff : oaicompat_msg_diffs) {
        add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
    }

    if (!deltas.empty()) {
        auto & last_json = deltas[deltas.size() - 1];
        GGML_ASSERT(last_json.at("choices").size() >= 1);

        if (prob_output.probs.size() > 0) {
            last_json.at("choices").at(0)["logprobs"] = json {
                {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
            };
        }

        if (timings.prompt_n >= 0) {
            last_json.push_back({"timings", timings.to_json()});
        }
        if (is_progress) {
            last_json.push_back({"prompt_progress", progress.to_json()});
        }
    }

    return deltas;
}

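// Streams OAI /v1/responses-style server-sent events: response.created/response.in_progress
// on the first decoded token, then response.output_item.added and *.delta events per diff.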
1429json server_task_result_cmpl_partial::to_json_oaicompat_resp() {
1430 std::vector<json> events;
1431
1432 if (n_decoded == 1) {
1433 events.push_back(json {
1434 {"event", "response.created"},
1435 {"data", json {
1436 {"type", "response.created"},
1437 {"response", json {
1438 {"id", oai_resp_id},
1439 {"object", "response"},
1440 {"status", "in_progress"},
1441 }},
1442 }},
1443 });
1444 events.push_back(json {
1445 {"event", "response.in_progress"},
1446 {"data", json {
1447 {"type", "response.in_progress"},
1448 {"response", json {
1449 {"id", oai_resp_id},
1450 {"object", "response"},
1451 {"status", "in_progress"},
1452 }},
1453 }},
1454 });
1455 }
1456
1457 for (const common_chat_msg_diff & diff : oaicompat_msg_diffs) {
1458 if (!diff.reasoning_content_delta.empty()) {
1459 if (!thinking_block_started) {
1460 events.push_back(json {
1461 {"event", "response.output_item.added"},
1462 {"data", json {
1463 {"type", "response.output_item.added"},
1464 {"item", json {
1465 {"id", oai_resp_reasoning_id},
1466 {"summary", json::array()},
1467 {"type", "reasoning"},
1468 {"content", json::array()},
1469 {"encrypted_content", ""},
1470 {"status", "in_progress"},
1471 }},
1472 }},
1473 });
1474 thinking_block_started = true;
1475 }
1476 events.push_back(json {
1477 {"event", "response.reasoning_text.delta"},
1478 {"data", json {
1479 {"type", "response.reasoning_text.delta"},
1480 {"delta", diff.reasoning_content_delta},
1481 {"item_id", oai_resp_reasoning_id},
1482 }},
1483 });
1484 }
1485
1486 if (!diff.content_delta.empty()) {
1487 if (!text_block_started) {
1488 events.push_back(json {
1489 {"event", "response.output_item.added"},
1490 {"data", json {
1491 {"type", "response.output_item.added"},
1492 {"item", json {
1493 {"content", json::array()},
1494 {"id", oai_resp_message_id},
1495 {"role", "assistant"},
1496 {"status", "in_progress"},
1497 {"type", "message"},
1498 }},
1499 }},
1500 });
1501 events.push_back(json {
1502 {"event", "response.content_part.added"},
1503 {"data", json {
1504 {"type", "response.content_part.added"},
1505 {"item_id", oai_resp_message_id},
1506 {"part", json {
1507 {"type", "output_text"},
1508 {"text", ""},
1509 }},
1510 }},
1511 });
1512 text_block_started = true;
1513 }
1514 events.push_back(json {
1515 {"event", "response.output_text.delta"},
1516 {"data", json {
1517 {"type", "response.output_text.delta"},
1518 {"item_id", oai_resp_message_id},
1519 {"delta", diff.content_delta},
1520 }},
1521 });
1522 }
1523
1524 if (!diff.tool_call_delta.name.empty()) {
1525 events.push_back(json {
1526 {"event", "response.output_item.added"},
1527 {"data", json {
1528 {"type", "response.output_item.added"},
1529 {"item", json {
1530 {"arguments", ""},
1531 {"call_id", "fc_" + diff.tool_call_delta.id},
1532 {"name", diff.tool_call_delta.name},
1533 {"type", "function_call"},
1534 {"status", "in_progress"},
1535 }},
1536 }},
1537 });
1538 oai_resp_fc_id = diff.tool_call_delta.id;
1539 }
1540
1541 if (!diff.tool_call_delta.arguments.empty()) {
1542 events.push_back(json {
1543 {"event", "response.function_call_arguments.delta"},
1544 {"data", json {
1545 {"type", "response.function_call_arguments.delta"},
1546 {"delta", diff.tool_call_delta.arguments},
1547 {"item_id", "fc_" + oai_resp_fc_id},
1548 }},
1549 });
1550 }
1551 }
1552 return events;
1553}
1554
1555json server_task_result_cmpl_partial::to_json_anthropic() {
1556 json events = json::array();
1557 bool first = (n_decoded == 1);
1558 // use member variables to track block state across streaming calls
1559 // (anthropic_thinking_block_started, anthropic_text_block_started)
1560
1561 if (first) {
1562 events.push_back({
1563 {"event", "message_start"},
1564 {"data", {
1565 {"type", "message_start"},
1566 {"message", {
1567 {"id", oaicompat_cmpl_id},
1568 {"type", "message"},
1569 {"role", "assistant"},
1570 {"content", json::array()},
1571 {"model", oaicompat_model},
1572 {"stop_reason", nullptr},
1573 {"stop_sequence", nullptr},
1574 {"usage", {
1575 {"input_tokens", n_prompt_tokens},
1576 {"output_tokens", 0}
1577 }}
1578 }}
1579 }}
1580 });
1581 }
1582
1583 // content block indices: thinking (0) -> text (0 or 1) -> tool_use (n+)
1584 size_t thinking_block_index = 0;
1585 // use anthropic_has_reasoning (set in update()) to know if ANY reasoning was generated
1586 size_t text_block_index = anthropic_has_reasoning ? 1 : 0;
1587
1588 // use local copies of streaming state (copied from task_result_state in update())
1589 // these reflect the state BEFORE this chunk was processed
1590 bool thinking_started = thinking_block_started;
1591 bool text_started = text_block_started;
1592
1593 for (const auto & diff : oaicompat_msg_diffs) {
1594 // handle thinking/reasoning content
1595 if (!diff.reasoning_content_delta.empty()) {
1596 if (!thinking_started) {
1597 events.push_back({
1598 {"event", "content_block_start"},
1599 {"data", {
1600 {"type", "content_block_start"},
1601 {"index", thinking_block_index},
1602 {"content_block", {
1603 {"type", "thinking"},
1604 {"thinking", ""}
1605 }}
1606 }}
1607 });
1608 thinking_started = true;
1609 }
1610
1611 events.push_back({
1612 {"event", "content_block_delta"},
1613 {"data", {
1614 {"type", "content_block_delta"},
1615 {"index", thinking_block_index},
1616 {"delta", {
1617 {"type", "thinking_delta"},
1618 {"thinking", diff.reasoning_content_delta}
1619 }}
1620 }}
1621 });
1622 }
1623
1624 // handle regular text content
1625 if (!diff.content_delta.empty()) {
1626 if (!text_started) {
1627 events.push_back({
1628 {"event", "content_block_start"},
1629 {"data", {
1630 {"type", "content_block_start"},
1631 {"index", text_block_index},
1632 {"content_block", {
1633 {"type", "text"},
1634 {"text", ""}
1635 }}
1636 }}
1637 });
1638 text_started = true;
1639 }
1640
1641 events.push_back({
1642 {"event", "content_block_delta"},
1643 {"data", {
1644 {"type", "content_block_delta"},
1645 {"index", text_block_index},
1646 {"delta", {
1647 {"type", "text_delta"},
1648 {"text", diff.content_delta}
1649 }}
1650 }}
1651 });
1652 }
1653
1654 // handle tool calls
1655 if (diff.tool_call_index != std::string::npos) {
1656 // use anthropic_has_reasoning for thinking block count (persists across calls)
1657 size_t content_block_index = (anthropic_has_reasoning ? 1 : 0) + (text_started ? 1 : 0) + diff.tool_call_index;
1658
1659 if (!diff.tool_call_delta.name.empty()) {
1660 events.push_back({
1661 {"event", "content_block_start"},
1662 {"data", {
1663 {"type", "content_block_start"},
1664 {"index", content_block_index},
1665 {"content_block", {
1666 {"type", "tool_use"},
1667 {"id", diff.tool_call_delta.id},
1668 {"name", diff.tool_call_delta.name}
1669 }}
1670 }}
1671 });
1672 }
1673
1674 if (!diff.tool_call_delta.arguments.empty()) {
1675 events.push_back({
1676 {"event", "content_block_delta"},
1677 {"data", {
1678 {"type", "content_block_delta"},
1679 {"index", content_block_index},
1680 {"delta", {
1681 {"type", "input_json_delta"},
1682 {"partial_json", diff.tool_call_delta.arguments}
1683 }}
1684 }}
1685 });
1686 }
1687 }
1688 }
1689
1690 return events;
1691}
1692
1693//
1694// server_task_result_embd
1695//
1696json server_task_result_embd::to_json() {
1697 return res_type == TASK_RESPONSE_TYPE_OAI_EMBD
1698 ? to_json_oaicompat()
1699 : to_json_non_oaicompat();
1700}
1701
1702json server_task_result_embd::to_json_non_oaicompat() {
1703 return json {
1704 {"index", index},
1705 {"embedding", embedding},
1706 };
1707}
1708
1709json server_task_result_embd::to_json_oaicompat() {
    return json {
        {"index", index},
        {"embedding", embedding[0]},
        {"tokens_evaluated", n_tokens},
    };
}

//
// server_task_result_rerank
//
json server_task_result_rerank::to_json() {
    return json {
        {"index", index},
        {"score", score},
        {"tokens_evaluated", n_tokens},
    };
}

//
// server_task_result_error
//
json server_task_result_error::to_json() {
    json res = format_error_response(err_msg, err_type);
    if (err_type == ERROR_TYPE_EXCEED_CONTEXT_SIZE) {
        res["n_prompt_tokens"] = n_prompt_tokens;
        res["n_ctx"] = n_ctx;
    }
    return res;
}

//
// server_task_result_metrics
//
json server_task_result_metrics::to_json() {
    return json {
        { "idle", n_idle_slots },
        { "processing", n_processing_slots },
        { "deferred", n_tasks_deferred },
        { "t_start", t_start },

        { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total },
        { "t_tokens_generation_total", t_tokens_generation_total },
        { "n_tokens_predicted_total", n_tokens_predicted_total },
        { "t_prompt_processing_total", t_prompt_processing_total },

        { "n_tokens_max", n_tokens_max },

        { "n_prompt_tokens_processed", n_prompt_tokens_processed },
        { "t_prompt_processing", t_prompt_processing },
        { "n_tokens_predicted", n_tokens_predicted },
        { "t_tokens_generation", t_tokens_generation },

        { "n_decode_total", n_decode_total },
        { "n_busy_slots_total", n_busy_slots_total },

        { "slots", slots_data },
    };
}

//
// server_task_result_slot_save_load
//
json server_task_result_slot_save_load::to_json() {
    if (is_save) {
        return json {
            { "id_slot", id_slot },
            { "filename", filename },
            { "n_saved", n_tokens },
            { "n_written", n_bytes },
            { "timings", {
                { "save_ms", t_ms }
            }},
        };
    }

    return json {
        { "id_slot", id_slot },
        { "filename", filename },
        { "n_restored", n_tokens },
        { "n_read", n_bytes },
        { "timings", {
            { "restore_ms", t_ms }
        }},
    };
}

//
// server_task_result_slot_erase
//
json server_task_result_slot_erase::to_json() {
    return json {
        { "id_slot", id_slot },
        { "n_erased", n_erased },
    };
}

//
// server_task_result_get_lora
//

json server_task_result_get_lora::to_json() {
    json result = json::array();
    for (size_t i = 0; i < loras.size(); ++i) {
        auto & lora = loras[i];
        json entry = {
            {"id", i},
            {"path", lora.info.path},
            {"scale", lora.info.scale},
            {"task_name", lora.info.task_name},
            {"prompt_prefix", lora.info.prompt_prefix},
        };
        if (!lora.alora_invocation_tokens.empty()) {
            entry["alora_invocation_string"] = lora.alora_invocation_string;
            entry["alora_invocation_tokens"] = lora.alora_invocation_tokens;
        }
        result.push_back(std::move(entry));
    }
    return result;
}
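
// example response shape (illustrative values):
//   [{"id": 0, "path": "adapter.gguf", "scale": 1.0, "task_name": "", "prompt_prefix": ""}]
// entries for aLoRA adapters additionally carry the invocation string and tokens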

//
// server_task_result_apply_lora
//

json server_task_result_apply_lora::to_json() {
    return json {{ "success", true }};
}

//
// server_prompt_cache
//
size_t server_prompt_cache::size() const {
    size_t res = 0;

    for (const auto & state : states) {
        res += state.size();
    }

    return res;
}

size_t server_prompt_cache::n_tokens() const {
    size_t res = 0;

    for (const auto & state : states) {
        res += state.n_tokens();
    }

    return res;
}

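// store a copy of the given prompt's state in the cache
// returns nullptr if the prompt is already cached or the state buffer cannot be allocated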
server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t state_size) {
    // first, check if the current prompt is already fully contained in the cache
    for (auto it = states.begin(); it != states.end(); ++it) {
        const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens);

        if (cur_lcp_len == (int) prompt.tokens.size()) {
            SRV_WRN("%s", " - prompt is already in the cache, skipping\n");
            return nullptr;
        }
    }

    // next, remove any cached prompts that are fully contained in the current prompt
    for (auto it = states.begin(); it != states.end();) {
        const int len = it->tokens.get_common_prefix(prompt.tokens);

        if (len == (int) it->tokens.size()) {
            SRV_WRN(" - removing obsolete cached prompt with length %d\n", len);

            it = states.erase(it);
        } else {
            ++it;
        }
    }

    std::vector<uint8_t> state_data;

    // check if we can allocate enough memory for the new state
    try {
        state_data.resize(state_size);
    } catch (const std::bad_alloc & e) {
        SRV_ERR("failed to allocate memory for prompt cache state: %s\n", e.what());

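        // allocation failed - shrink the cache limit to 40% of the current occupancy to relieve memory pressure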
        limit_size = std::max<size_t>(1, 0.4*size());

        SRV_WRN(" - cache size limit reduced to %.3f MiB\n", limit_size / (1024.0 * 1024.0));

        update();

        return nullptr;
    }

    // TODO: for some reason we can't copy server_tokens, so we have to do this workaround
    auto & cur = states.emplace_back();
    cur = {
        /*.tokens =*/ server_tokens(prompt.tokens.get_text_tokens(), false),
        /*.data =*/ std::move(state_data),
        /*.checkpoints =*/ prompt.checkpoints,
    };

    return &cur;
}

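// look for a cached state that matches tokens_new better than the slot's current prompt
// on success, the chosen entry is restored into the context and moved into `prompt`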
bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) {
    const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);

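    // f_keep: fraction of the existing prompt that the common prefix would keep
    // sim:    fraction of the new prompt covered by the common prefix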
    float f_keep_best = float(lcp_best) / prompt.tokens.size();
    float sim_best = float(lcp_best) / tokens_new.size();

    SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);

    auto it_best = states.end();

    // find the most similar cached prompt that would also preserve the most context
    for (auto it = states.begin(); it != states.end(); ++it) {
        const int lcp_cur = it->tokens.get_common_prefix(tokens_new);

        const float f_keep_cur = float(lcp_cur) / it->tokens.size();
        const float sim_cur = float(lcp_cur) / tokens_new.size();

        // don't trash large prompts
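        // (i.e. require that at least 25% of the cached prompt would be reused)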
        if (f_keep_cur < 0.25f) {
            continue;
        }

        if (f_keep_best < f_keep_cur && sim_best < sim_cur) {
            f_keep_best = f_keep_cur;
            sim_best = sim_cur;

            it_best = it;
        }
    }

    if (it_best != states.end()) {
        SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);

        const size_t size = it_best->data.size();
        const size_t n = llama_state_seq_set_data_ext(ctx, it_best->data.data(), size, id_slot, 0);
        if (n != size) {
            SRV_WRN("failed to restore state with size %zu\n", size);

            return false;
        }

        // the state is now loaded into the context, so the cached copy can be freed
        it_best->data.clear();
        it_best->data.shrink_to_fit();

        prompt = std::move(*it_best);

        states.erase(it_best);
    }

    return true;
}

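// enforce the cache limits by evicting the oldest entries first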
void server_prompt_cache::update() {
    if (limit_size > 0) {
        // always keep at least one state, regardless of the limits
        while (states.size() > 1 && size() > limit_size) {
            SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));

            states.pop_front();
        }
    }

    // average size per token
    const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));

    // dynamically increase the token limit if it can fit in the memory limit
    const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
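    // e.g. (illustrative): with limit_size = 8 GiB and ~512 KiB of state per token,
    // the effective limit grows to 8 GiB / 512 KiB = 16384 tokens, even if limit_tokens is smaller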

    if (limit_tokens > 0) {
        while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
            SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
                    limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));

            states.pop_front();
        }
    }

    SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
            states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);

    for (const auto & state : states) {
        SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
                (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
    }
}