diff options
Diffstat (limited to 'llama.cpp/tools/server/server-common.h')
| -rw-r--r-- | llama.cpp/tools/server/server-common.h | 366 |
1 files changed, 366 insertions, 0 deletions
diff --git a/llama.cpp/tools/server/server-common.h b/llama.cpp/tools/server/server-common.h new file mode 100644 index 0000000..2629a6b --- /dev/null +++ b/llama.cpp/tools/server/server-common.h @@ -0,0 +1,366 @@ +#pragma once + +#include "common.h" +#include "log.h" +#include "llama.h" +#include "chat.h" +#include "mtmd.h" + +#define JSON_ASSERT GGML_ASSERT +#include <nlohmann/json.hpp> + +#include <string> +#include <vector> +#include <cinttypes> + +using json = nlohmann::ordered_json; + +#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) +#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) +#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) +#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) + +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +using raw_buffer = std::vector<uint8_t>; + +template <typename T> +static T json_value(const json & body, const std::string & key, const T & default_value) { + // Fallback null to default value + if (body.contains(key) && !body.at(key).is_null()) { + try { + return body.at(key); + } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) { + LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what()); + return default_value; + } + } else { + return default_value; + } +} + +// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 +enum error_type { + ERROR_TYPE_INVALID_REQUEST, + ERROR_TYPE_AUTHENTICATION, + ERROR_TYPE_SERVER, + ERROR_TYPE_NOT_FOUND, + ERROR_TYPE_PERMISSION, + ERROR_TYPE_UNAVAILABLE, // custom error + ERROR_TYPE_NOT_SUPPORTED, // custom error + ERROR_TYPE_EXCEED_CONTEXT_SIZE, // custom error +}; + +// thin wrapper around common_grammar_trigger with (de)serialization functions +struct server_grammar_trigger { + common_grammar_trigger value; + + server_grammar_trigger() = default; + server_grammar_trigger(const common_grammar_trigger & value) : value(value) {} + server_grammar_trigger(const json & in) { + value.type = (common_grammar_trigger_type) in.at("type").get<int>(); + value.value = in.at("value").get<std::string>(); + if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + value.token = (llama_token) in.at("token").get<int>(); + } + } + + json to_json() const { + json out { + {"type", (int) value.type}, + {"value", value.value}, + }; + if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + out["token"] = (int) value.token; + } + return out; + } +}; + +json format_error_response(const std::string & message, const enum error_type type); + +// +// random string / id +// + +std::string random_string(); +std::string gen_chatcmplid(); +std::string gen_tool_call_id(); + +// +// lora utils +// + +// check whether the given lora set has only aloras activated (empty => false) +bool lora_all_alora(const std::vector<common_adapter_lora_info> & loras); + +// if the two sets of loras are different, they require a cache clear unless the +// change is only from aloras to aloras. +bool lora_should_clear_cache( + const std::vector<common_adapter_lora_info> & current, + const std::vector<common_adapter_lora_info> & next); + +std::map<int, float> parse_lora_request(const json & data); + +bool are_lora_equal( + const std::vector<common_adapter_lora_info> & l1, + const std::vector<common_adapter_lora_info> & l2); + +// get the ids of all enabled loras +std::vector<size_t> lora_get_enabled_ids(const std::vector<common_adapter_lora_info> & loras); + +// +// server_tokens +// + +/** + * server_tokens is a helper to manage the input tokens and image for the server. + * it is made this way to simplify the logic of KV cache management. + */ +struct server_tokens { + bool has_mtmd = false; + +private: // disallow accessing these members directly, risking out-of-sync + + // map a **start** index in tokens to the image chunk + // note: the order need to be in-sync with tokens + std::map<size_t, mtmd::input_chunk_ptr> map_idx_to_media; + + // list of tokens + // if the token is LLAMA_TOKEN_NULL, it indicates that this position is occupied by media chunk + // otherwise, it is a normal text token + // note: a non-text chunk can occupy multiple tokens (aka memory cells) in the token list + // note(2): for M-RoPE, an image can occupy different number of pos; do not assume 1-to-1 mapping tokens <-> pos + llama_tokens tokens; + + // for ex. with input of 5 text tokens and 2 images (each image occupies 3 tokens and 2 pos): + // [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] [img1] + // idx 0 1 2 3 4 5 6 7 8 9 10 + // pos 0 1 2 3 4 5 5 5 7 7 7 + // map_idx_to_media will contain: {5, img0}, {8, img1} + +public: + server_tokens() = default; + ~server_tokens() = default; + + // Prevent copying + // TODO: server_tokens should be copyable - remove this: + server_tokens(const server_tokens&) = delete; + server_tokens& operator=(const server_tokens&) = delete; + + // Allow moving (usually implicitly generated if members are movable) + server_tokens(server_tokens&&) = default; + server_tokens& operator=(server_tokens&&) = default; + + // Allow accessing elements using [] operator + llama_token operator[](size_t index) { return tokens[index]; } + const llama_token& operator[](size_t index) const { return tokens[index]; } + + server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd); + server_tokens(const llama_tokens & tokens, bool has_mtmd); + + // for debugging + std::string str() const; + + llama_pos pos_next() const; + const mtmd::input_chunk_ptr & find_chunk(size_t idx) const; + + void push_back(llama_token tok); + + // will create a copy of the chunk if it contains non-text data + void push_back(const mtmd_input_chunk * chunk); + + // appends server tokens, updates the media map. copies media chunks. + void push_back(server_tokens & tokens); + + // for compatibility with context shift and prompt truncation + void insert(const llama_tokens & inp_tokens); + + // for compatibility with speculative decoding, ctx shift, slot save/load + const llama_tokens & get_text_tokens() const; + + // for compatibility with speculative decoding + void set_token(llama_pos pos, llama_token id); + + size_t size() const { return tokens.size(); } + + bool empty() const { return tokens.empty(); } + + void clear() { + map_idx_to_media.clear(); + tokens.clear(); + } + + void keep_first(size_t n); + + std::string detokenize(const llama_context * ctx, bool special) const; + + size_t get_common_prefix(const server_tokens & b) const; + + // make sure all text tokens are within the vocab range + bool validate(const struct llama_context * ctx) const; + + // encode and decode the image chunk + int32_t process_chunk( + llama_context * ctx, + mtmd_context * mctx, + size_t idx, + llama_pos pos, + int32_t seq_id, + size_t & n_tokens_out) const; + + server_tokens clone() const; +}; + + +// +// tokenizer and input processing utils +// + +bool json_is_array_of_numbers(const json & data); + +// is array having BOTH numbers & strings? +bool json_is_array_of_mixed_numbers_strings(const json & data); + +// does array have any individual integers/tokens? +bool json_is_array_and_contains_numbers(const json & data); + +// get value by path(key1 / key2) +json json_get_nested_values(const std::vector<std::string> & paths, const json & js); + +/** + * this handles 2 cases: + * - only string, example: "string" + * - mixed string and tokens, example: [12, 34, "string", 56, 78] + */ +llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special); + +// return the last index of character that can form a valid string +// if the last character is potentially cut in half, return the index before the cut +// if validate_utf8(text) == text.size(), then the whole text is valid utf8 +size_t validate_utf8(const std::string& text); + +// process mtmd prompt, return the server_tokens containing both text tokens and media chunks +server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files); + +/** + * break the input "prompt" object into multiple prompt if needed, then tokenize them + * this supports these cases: + * - "prompt": "string" + * - "prompt": [12, 34, 56] + * - "prompt": [12, 34, "string", 56, 78] + * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] } + * and multiple prompts (multi-tasks): + * - "prompt": ["string1", "string2"] + * - "prompt": ["string1", [12, 34, 56]] + * - "prompt": [[12, 34, 56], [78, 90, 12]] + * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}] + */ +std::vector<server_tokens> tokenize_input_prompts( + const llama_vocab * vocab, + mtmd_context * mctx, + const json & json_prompt, + bool add_special, + bool parse_special); + +// +// OAI utils +// + +// global server parameters for chat formatting / parsing +struct server_chat_params { + bool use_jinja; + bool prefill_assistant; + common_reasoning_format reasoning_format; + std::map<std::string, std::string> chat_template_kwargs; // mapping key --> json value + common_chat_templates_ptr tmpls; + bool allow_image; + bool allow_audio; + bool enable_thinking = true; + std::string media_path; +}; + +// used by /completions endpoint +json oaicompat_completion_params_parse(const json & body); + +// used by /chat/completions endpoint +json oaicompat_chat_params_parse( + json & body, /* openai api json semantics */ + const server_chat_params & opt, + std::vector<raw_buffer> & out_files); + +// convert OpenAI Responses API format to OpenAI Chat Completions API format +json convert_responses_to_chatcmpl(const json & body); + +// convert Anthropic Messages API format to OpenAI Chat Completions API format +json convert_anthropic_to_oai(const json & body); + +// TODO: move it to server-task.cpp +json format_embeddings_response_oaicompat( + const json & request, + const std::string & model_name, + const json & embeddings, + bool use_base64 = false); + +// TODO: move it to server-task.cpp +json format_response_rerank( + const json & request, + const std::string & model_name, + const json & ranks, + bool is_tei_format, + std::vector<std::string> & texts, + int top_n); + +// +// other utils +// + +std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx); + +std::string safe_json_to_str(const json & data); + +std::string tokens_to_str(llama_context * ctx, const llama_tokens & tokens); +std::string tokens_to_str(const llama_vocab * vocab, const llama_tokens & tokens); + +// format incomplete utf-8 multibyte character for output +std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token); + +// format server-sent event (SSE), return the formatted string to send +// note: if data is a json array, it will be sent as multiple events, one per item +std::string format_oai_sse(const json & data); + +std::string format_oai_resp_sse(const json & data); + +// format Anthropic-style SSE with event types +std::string format_anthropic_sse(const json & data); + +bool is_valid_utf8(const std::string & str); + +// +// formatting output responses +// TODO: move these to server-task.cpp +// + +llama_tokens format_prompt_infill( + const llama_vocab * vocab, + const json & input_prefix, + const json & input_suffix, + const json & input_extra, + const int n_batch, + const int n_predict, + const int n_ctx, + const bool spm_infill, + const llama_tokens & tokens_prompt); + +// format rerank task: [BOS]query[EOS][SEP]doc[EOS]. +server_tokens format_prompt_rerank( + const struct llama_model * model, + const struct llama_vocab * vocab, + mtmd_context * mctx, + const std::string & query, + const std::string & doc); |
