| field | value | date |
|---|---|---|
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tools/server/server-task.h | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/tools/server/server-task.h')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | llama.cpp/tools/server/server-task.h | 620 |

1 file changed, 620 insertions, 0 deletions
```diff
diff --git a/llama.cpp/tools/server/server-task.h b/llama.cpp/tools/server/server-task.h
new file mode 100644
index 0000000..a69e8f1
--- /dev/null
+++ b/llama.cpp/tools/server/server-task.h
@@ -0,0 +1,620 @@
```

```cpp
#pragma once

#include "common.h"
#include "llama.h"

#include <string>
#include <unordered_set>
#include <list>
#include <map>
#include <memory> // std::unique_ptr
#include <vector>

// TODO: prevent including the whole server-common.h as we only use server_tokens
#include "server-common.h"

using json = nlohmann::ordered_json;

enum server_task_type {
    SERVER_TASK_TYPE_COMPLETION,
    SERVER_TASK_TYPE_EMBEDDING,
    SERVER_TASK_TYPE_RERANK,
    SERVER_TASK_TYPE_INFILL,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_NEXT_RESPONSE,
    SERVER_TASK_TYPE_METRICS,
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
    SERVER_TASK_TYPE_GET_LORA,
    SERVER_TASK_TYPE_SET_LORA,
};

// TODO: change this to more generic "response_format" to replace the "format_response_*" in server-common
enum task_response_type {
    TASK_RESPONSE_TYPE_NONE, // llama.cpp native format
    TASK_RESPONSE_TYPE_OAI_CHAT,
    TASK_RESPONSE_TYPE_OAI_CMPL,
    TASK_RESPONSE_TYPE_OAI_RESP,
    TASK_RESPONSE_TYPE_OAI_EMBD,
    TASK_RESPONSE_TYPE_ANTHROPIC,
};

enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,
    STOP_TYPE_WORD,
    STOP_TYPE_LIMIT,
};

struct task_params {
    bool stream          = true;
    bool include_usage   = false;
    bool cache_prompt    = true;  // remember the prompt to avoid reprocessing the entire prompt
    bool return_tokens   = false;
    bool return_progress = false;

    int32_t n_keep    = 0;  // number of tokens to keep from the initial prompt
    int32_t n_discard = 0;  // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_indent  = 0;  // minimum line indentation for the generated text, in number of whitespace characters
    int32_t n_cmpl    = 1;  // number of completions to generate from this prompt

    int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled)

    int64_t t_max_prompt_ms  = -1; // TODO: implement
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

    std::map<int, float> lora; // mapping adapter ID -> scale

    std::vector<std::string> antiprompt;
    std::vector<std::string> response_fields;

    bool timings_per_token   = false;
    bool post_sampling_probs = false;

    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;

    // response formatting
    bool verbose = false;
    task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;

    // per-request parameters for chat parsing
    common_chat_parser_params chat_parser_params;

    // embeddings
    int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)

    json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) const;
    json to_json(bool only_metrics = false) const;
};

// struct for tracking the state of a task (e.g., for streaming)
struct task_result_state {
    // tracking diffs for partial tool calls
    std::vector<common_chat_msg_diff> diffs;
    common_chat_parser_params chat_parser_params;
    common_chat_msg chat_msg;
    std::string generated_text; // append new chunks of generated text here
    std::vector<std::string> generated_tool_call_ids;

    // for OpenAI Responses and Anthropic streaming API:
    // track output item / content block state across chunks
    bool thinking_block_started = false;
    bool text_block_started     = false;

    // for OpenAI Responses streaming API
    const std::string oai_resp_id;
    const std::string oai_resp_reasoning_id;
    const std::string oai_resp_message_id;
    std::string oai_resp_fc_id; // function call ID for the current args delta

    task_result_state(const common_chat_parser_params & chat_parser_params)
        : chat_parser_params(chat_parser_params)
        , oai_resp_id("resp_" + random_string())
        , oai_resp_reasoning_id("rs_" + random_string())
        , oai_resp_message_id("msg_" + random_string()) {}

    // parse partial tool calls and update the internal state
    common_chat_msg update_chat_msg(
            const std::string & text_added,
            bool is_partial,
            std::vector<common_chat_msg_diff> & diffs);
};

struct server_task {
    int id = -1; // to be filled by server_queue

    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
    size_t index = 0; // used when there are multiple prompts (batch request)

    // used by SERVER_TASK_TYPE_CANCEL
    int id_target = -1;
    int id_slot   = -1;

    // used by parallel sampling (multiple completions from the same prompt)
    int id_parent = -1;
    // temporary store of child tasks for scheduling
    // note: accessing elements is invalid after the task is moved to server_slot
    std::vector<server_task> child_tasks;

    // used by SERVER_TASK_TYPE_INFERENCE
    task_params params;
    server_tokens tokens;

    // only used by the CLI; this allows tokenizing CLI inputs on the server side
    // we need this because mtmd_context and vocab are not accessible outside of server_context
    bool cli = false;
    std::string cli_prompt;
    std::vector<raw_buffer> cli_files;

    server_task_type type;

    // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
    struct slot_action {
        int id_slot;
        std::string filename;
        std::string filepath;
    };
    slot_action slot_action;

    // used by SERVER_TASK_TYPE_METRICS
    bool metrics_reset_bucket = false;

    // used by SERVER_TASK_TYPE_SET_LORA
    std::map<int, float> set_lora; // mapping adapter ID -> scale

    server_task() = default;

    server_task(server_task_type type) : type(type) {}

    int32_t n_tokens() const {
        return tokens.size();
    }

    bool need_embd() const {
        switch (type) {
            case SERVER_TASK_TYPE_EMBEDDING:
            case SERVER_TASK_TYPE_RERANK:
                return true;
            default:
                return false;
        }
    }

    bool need_logits() const {
        switch (type) {
            case SERVER_TASK_TYPE_COMPLETION:
            case SERVER_TASK_TYPE_INFILL:
                return true;
            default:
                return false;
        }
    }

    bool need_sampling() const {
        switch (type) {
            case SERVER_TASK_TYPE_COMPLETION:
            case SERVER_TASK_TYPE_INFILL:
                return true;
            default:
                return false;
        }
    }

    static task_params params_from_json_cmpl(
            const llama_vocab * vocab,
            const common_params & params_base,
            const int n_ctx_slot,
            const json & data);

    // utility function
    static std::unordered_set<int> get_list_id(const std::vector<server_task> & tasks) {
        std::unordered_set<int> ids(tasks.size());
        for (size_t i = 0; i < tasks.size(); i++) {
            ids.insert(tasks[i].id);
            for (auto & child : tasks[i].child_tasks) {
                ids.insert(child.id);
            }
        }
        return ids;
    }

    void add_child(int id_parent, int id_child) {
        server_task copy;

        copy.id        = id_child;
        copy.id_parent = id_parent;
        copy.params    = params;
        copy.type      = type;
        copy.tokens    = tokens.clone();
        copy.id_slot   = -1; // child tasks cannot specify a slot

        // use a different sampling seed for each child
        // note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
        if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
            copy.params.sampling.seed += (uint32_t) child_tasks.size() + 1;
        }

        child_tasks.push_back(std::move(copy));
    }

    // the task will be moved into the queue, then onto slots
    // however, the state must be kept by the caller (e.g., the HTTP thread)
    task_result_state create_state() const {
        return task_result_state(params.chat_parser_params);
    }

    bool is_parent() const {
        return child_tasks.size() > 0;
    }

    bool is_child() const {
        return id_parent != -1;
    }
};

struct result_timings {
    int32_t cache_n = -1;

    int32_t prompt_n = -1;
    double  prompt_ms;
    double  prompt_per_token_ms;
    double  prompt_per_second;

    int32_t predicted_n = -1;
    double  predicted_ms;
    double  predicted_per_token_ms;
    double  predicted_per_second;

    // optional speculative metrics - only included when > 0
    int32_t draft_n          = 0;
    int32_t draft_n_accepted = 0;

    json to_json() const;
};

struct result_prompt_progress {
    int32_t total     = 0;
    int32_t cache     = 0;
    int32_t processed = 0;
    int64_t time_ms   = 0;

    json to_json() const;
};

struct server_task_result {
    int id      = -1;
    int id_slot = -1;

    // TODO @ngxson : remove this field and implement a mapping task_id -> idx in the response_reader
    size_t index = 0; // to be used for batched tasks

    virtual bool is_error() {
        // only used by server_task_result_error
        return false;
    }
    virtual bool is_stop() {
        // only used by server_task_result_cmpl_*
        return true;
    }
    virtual void update(task_result_state &) {
        // only used by server_task_result_cmpl_*
    }
    virtual json to_json() = 0;
    virtual ~server_task_result() = default;
};

// using unique_ptr for polymorphism of server_task_result
using server_task_result_ptr = std::unique_ptr<server_task_result>;

struct completion_token_output {
    llama_token tok;
    float       prob;
    std::string text_to_send;

    struct prob_info {
        llama_token tok;
        std::string txt;
        float       prob;
    };
    std::vector<prob_info> probs;

    json to_json(bool post_sampling_probs) const;

    static json probs_vector_to_json(const std::vector<completion_token_output> & probs, bool post_sampling_probs);

    static float logarithm(float x);

    static std::vector<unsigned char> str_to_bytes(const std::string & str);
};

struct server_task_result_cmpl_final : server_task_result {
    std::string  content;
    llama_tokens tokens;

    bool stream;
    bool include_usage;
    result_timings timings;
    std::string prompt;

    bool truncated;
    int32_t n_decoded;
    int32_t n_prompt_tokens;
    int32_t n_tokens_cached;
    bool has_new_line;
    std::string stopping_word;
    stop_type stop = STOP_TYPE_NONE;

    bool post_sampling_probs;
    std::vector<completion_token_output> probs_output;
    std::vector<std::string> response_fields;

    task_params generation_params;

    // response formatting
    bool verbose = false;
    task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    common_chat_msg oaicompat_msg; // to be populated by update()

    std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
    bool is_updated = false;

    // for OpenAI Responses API
    std::string oai_resp_id;
    std::string oai_resp_reasoning_id;
    std::string oai_resp_message_id;

    virtual bool is_stop() override {
        return true; // in stream mode, final responses are considered a stop
    }

    virtual json to_json() override;

    virtual void update(task_result_state & state) override {
        is_updated = true;
        oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs);

        oai_resp_id           = state.oai_resp_id;
        oai_resp_reasoning_id = state.oai_resp_reasoning_id;
        oai_resp_message_id   = state.oai_resp_message_id;
    }

    json to_json_non_oaicompat();
    json to_json_oaicompat();
    json to_json_oaicompat_chat();
    json to_json_oaicompat_chat_stream();
    json to_json_oaicompat_resp();
    json to_json_oaicompat_resp_stream();
    json to_json_anthropic();
    json to_json_anthropic_stream();
};

struct server_task_result_cmpl_partial : server_task_result {
    std::string  content;
    llama_tokens tokens;

    int32_t n_decoded;
    int32_t n_prompt_tokens;

    bool post_sampling_probs;
    bool is_progress = false;
    completion_token_output prob_output;
    result_timings timings;
    result_prompt_progress progress;

    // response formatting
    bool verbose = false;
    task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
    std::string oaicompat_model;
    std::string oaicompat_cmpl_id;
    std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
    bool is_updated = false;

    // streaming state copied from task_result_state for this chunk
    bool thinking_block_started = false;
    bool text_block_started     = false;

    // for OpenAI Responses API
    std::string oai_resp_id;
    std::string oai_resp_reasoning_id;
    std::string oai_resp_message_id;
    std::string oai_resp_fc_id;

    // for Anthropic API: track whether any reasoning content has been generated
    bool anthropic_has_reasoning = false;

    virtual bool is_stop() override {
        return false; // in stream mode, partial responses are not considered a stop
    }

    virtual void update(task_result_state & state) override;

    virtual json to_json() override;

    json to_json_non_oaicompat();
    json to_json_oaicompat();
    json to_json_oaicompat_chat();
    json to_json_oaicompat_resp();
    json to_json_anthropic();
};

struct server_task_result_embd : server_task_result {
    std::vector<std::vector<float>> embedding;

    int32_t n_tokens;

    // response formatting
    task_response_type res_type = TASK_RESPONSE_TYPE_NONE;

    virtual json to_json() override;

    json to_json_non_oaicompat();
    json to_json_oaicompat();
};

struct server_task_result_rerank : server_task_result {
    float score = -1e6;

    int32_t n_tokens;

    virtual json to_json() override;
};

struct server_task_result_error : server_task_result {
    error_type err_type = ERROR_TYPE_SERVER;
    std::string err_msg;

    // for ERROR_TYPE_EXCEED_CONTEXT_SIZE
    int32_t n_prompt_tokens = 0;
    int32_t n_ctx = 0;

    virtual bool is_error() override {
        return true;
    }

    virtual json to_json() override;
};

struct server_task_result_metrics : server_task_result {
    int n_idle_slots;
    int n_processing_slots;
    int n_tasks_deferred;
    int64_t t_start;

    // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields
    uint64_t n_prompt_tokens_processed_total = 0;
    uint64_t t_prompt_processing_total       = 0;
    uint64_t n_tokens_predicted_total        = 0;
    uint64_t t_tokens_generation_total       = 0;

    uint64_t n_tokens_max = 0;

    uint64_t n_prompt_tokens_processed = 0;
    uint64_t t_prompt_processing       = 0;

    uint64_t n_tokens_predicted  = 0;
    uint64_t t_tokens_generation = 0;

    uint64_t n_decode_total     = 0;
    uint64_t n_busy_slots_total = 0;

    // while we could also use std::vector<server_slot>, that would require copying the slot object, which can be quite messy
    // therefore, we use json to temporarily store the slot.to_json() result
    json slots_data = json::array();

    virtual json to_json() override;
};

struct server_task_result_slot_save_load : server_task_result {
    std::string filename;
    bool is_save; // true = save, false = load

    size_t n_tokens;
    size_t n_bytes;
    double t_ms;

    virtual json to_json() override;
};

struct server_task_result_slot_erase : server_task_result {
    size_t n_erased;

    virtual json to_json() override;
};

struct server_task_result_get_lora : server_task_result {
    struct lora {
        common_adapter_lora_info info;
        std::string  alora_invocation_string;
        llama_tokens alora_invocation_tokens;
    };
    std::vector<lora> loras;

    virtual json to_json() override;
};

struct server_task_result_apply_lora : server_task_result {
    virtual json to_json() override;
};

struct server_prompt_checkpoint {
    llama_pos pos_min;
    llama_pos pos_max;

    std::vector<uint8_t> data;

    size_t size() const {
        return data.size();
    }
};

struct server_prompt {
    server_tokens tokens;

    std::vector<uint8_t> data;

    std::list<server_prompt_checkpoint> checkpoints;

    size_t size() const {
        size_t res = data.size();

        for (const auto & checkpoint : checkpoints) {
            res += checkpoint.size();
        }

        return res;
    }

    int n_tokens() const {
        return tokens.size();
    }

    server_prompt clone() const {
        return server_prompt {
            tokens.clone(),
            data,
            checkpoints,
        };
    }
};

struct server_prompt_cache {
    server_prompt_cache(int32_t limit_size_mib, size_t limit_tokens) {
        this->limit_size   = 1024ull*1024ull*(limit_size_mib < 0 ? 0 : limit_size_mib);
        this->limit_tokens = limit_tokens;
    }

    std::list<server_prompt> states;

    // in bytes, 0 = no limit
    size_t limit_size = 0;

    // in tokens, 0 = no limit
    size_t limit_tokens = 0;

    size_t size() const;

    size_t n_tokens() const;

    server_prompt * alloc(const server_prompt & prompt, size_t state_size);

    bool load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot);

    void update();
};
```
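The comments on `create_state()` note that the task object is moved into the queue while the streaming state stays with the caller (e.g., the HTTP thread), and that `update()` populates the OAI-compat fields of each result before serialization. A minimal consumer sketch of that contract, assuming it is compiled inside the llama.cpp server target; `next_result` and `send_chunk` are hypothetical stand-ins for the real queue and HTTP plumbing:

```cpp
#include <functional>

#include "server-task.h"

// Hypothetical consumer loop; the real server wires this through
// server_queue and its response readers.
static void stream_results(
        const server_task & task,
        const std::function<server_task_result_ptr()> & next_result, // stand-in for the queue
        const std::function<void(const json &)>       & send_chunk)  // stand-in for the HTTP writer
{
    // the task itself is moved away, so the caller keeps the streaming state
    task_result_state state = task.create_state();

    while (true) {
        server_task_result_ptr res = next_result();
        if (res->is_error()) {
            send_chunk(res->to_json());
            break;
        }
        res->update(state);   // populate oaicompat_msg_diffs, response IDs, ...
        send_chunk(res->to_json());
        if (res->is_stop()) { // final chunk of the stream
            break;
        }
    }
}
```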
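`add_child()` is the fan-out mechanism for `n_cmpl > 1` (parallel sampling): each child clones the parent's params and tokens, and when a fixed seed was requested, the seed is offset per child so the completions differ. A sketch under the assumption that the caller allocates task IDs (the real ID allocation and scheduling live in `server_queue`, not here):

```cpp
#include "server-task.h"

// Hypothetical helper: build a completion task plus n_cmpl - 1 children.
// id_base is an assumption; server_queue normally hands out IDs.
static server_task make_parallel_task(int id_base, int n_cmpl) {
    server_task task(SERVER_TASK_TYPE_COMPLETION);
    task.id                   = id_base;
    task.params.n_cmpl        = n_cmpl;
    task.params.sampling.seed = 1234; // non-default seed: child k samples with seed 1234 + k

    for (int k = 1; k < n_cmpl; k++) {
        // clones params, clones tokens, bumps the sampling seed
        task.add_child(task.id, id_base + k);
    }
    return task;
}
```

`get_list_id()` then lets a cancel request collect the parent ID and all child IDs in a single pass over the task list.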
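For the prompt cache, the constructor converts its MiB argument to bytes (negative values are clamped to 0, and 0 means "no limit" for either dimension), while `server_prompt::size()` charges a prompt for its state blob plus every checkpoint. A small sketch of that accounting; `fits_in_cache` is a hypothetical helper, not part of the header:

```cpp
#include "server-task.h"

// 256 MiB byte cap, no token cap (0 = unlimited), per the constructor
static server_prompt_cache cache(/*limit_size_mib =*/ 256, /*limit_tokens =*/ 0);

// Hypothetical admission check mirroring the size accounting above:
// a prompt's footprint is its state blob plus all of its checkpoints.
static bool fits_in_cache(const server_prompt & prompt) {
    return cache.limit_size == 0 || prompt.size() <= cache.limit_size;
}
```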
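The `result_timings` field names suggest the usual rate conventions; this is a hedged reading, not something the header itself confirms: per-token latency as total time over token count, and throughput as tokens per second. A sketch with hypothetical inputs (the real values are computed by the server from slot measurements):

```cpp
#include "server-task.h"

// Hypothetical filler assuming nonzero token counts; illustrates the
// presumed relationship between the count, *_ms, and *_per_* fields.
static result_timings make_timings(int32_t n_prompt, double prompt_ms,
                                   int32_t n_pred,   double pred_ms) {
    result_timings t;
    t.prompt_n               = n_prompt;
    t.prompt_ms              = prompt_ms;
    t.prompt_per_token_ms    = prompt_ms / n_prompt;
    t.prompt_per_second      = 1e3 * n_prompt / prompt_ms;
    t.predicted_n            = n_pred;
    t.predicted_ms           = pred_ms;
    t.predicted_per_token_ms = pred_ms / n_pred;
    t.predicted_per_second   = 1e3 * n_pred / pred_ms;
    return t;
}
```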
