diff options
Diffstat (limited to 'llama.cpp/tools/server/server-context.h')
| -rw-r--r-- | llama.cpp/tools/server/server-context.h | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/llama.cpp/tools/server/server-context.h b/llama.cpp/tools/server/server-context.h new file mode 100644 index 0000000..c0b5d37 --- /dev/null +++ b/llama.cpp/tools/server/server-context.h @@ -0,0 +1,131 @@ +#include "server-http.h" +#include "server-task.h" +#include "server-queue.h" + +#include <nlohmann/json_fwd.hpp> + +#include <cstddef> +#include <memory> + +struct server_context_impl; // private implementation + +struct server_context_meta { + std::string build_info; + std::string model_name; + std::string model_path; + bool has_mtmd; + bool has_inp_image; + bool has_inp_audio; + json json_webui_settings; + int slot_n_ctx; + enum llama_pooling_type pooling_type; + + // chat params + server_chat_params & chat_params; + std::map<std::string, bool> chat_template_caps; + + // tokens + std::string bos_token_str; + std::string eos_token_str; + llama_token fim_pre_token; + llama_token fim_sub_token; + llama_token fim_mid_token; + + // model meta + enum llama_vocab_type model_vocab_type; + int32_t model_vocab_n_tokens; + int32_t model_n_ctx_train; + int32_t model_n_embd_inp; + uint64_t model_n_params; + uint64_t model_size; +}; + +struct server_context { + std::unique_ptr<server_context_impl> impl; + + server_context(); + ~server_context(); + + // load the model and initialize llama_context + // returns true on success + bool load_model(const common_params & params); + + // this function will block main thread until termination + void start_loop(); + + // terminate main loop (will unblock start_loop) + void terminate(); + + // get the underlaying llama_context, can return nullptr if sleeping + // not thread-safe, should only be used from the main thread + llama_context * get_llama_context() const; + + // get a new response reader, used by CLI application + server_response_reader get_response_reader(); + + // get server metadata (read-only), can only be called after load_model() + // not thread-safe, should only be used from the main thread + server_context_meta get_meta() const; +}; + + +// forward declarations +struct server_res_generator; + +struct server_routes { + server_routes(const common_params & params, server_context & ctx_server); + + void init_routes(); + + // note: this is not thread-safe and can only when ctx_http.is_ready is false + void update_meta(const server_context & ctx_server) { + this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta()); + } + + // handlers using lambda function, so that they can capture `this` without `std::bind` + // they won't be called until ctx_http.is_ready is set to true + server_http_context::handler_t get_health; + server_http_context::handler_t get_metrics; + server_http_context::handler_t get_slots; + server_http_context::handler_t post_slots; + server_http_context::handler_t get_props; + server_http_context::handler_t post_props; + server_http_context::handler_t get_api_show; + server_http_context::handler_t post_infill; + server_http_context::handler_t post_completions; + server_http_context::handler_t post_completions_oai; + server_http_context::handler_t post_chat_completions; + server_http_context::handler_t post_responses_oai; + server_http_context::handler_t post_anthropic_messages; + server_http_context::handler_t post_anthropic_count_tokens; + server_http_context::handler_t post_apply_template; + server_http_context::handler_t get_models; + server_http_context::handler_t post_tokenize; + server_http_context::handler_t post_detokenize; + server_http_context::handler_t post_embeddings; + server_http_context::handler_t post_embeddings_oai; + server_http_context::handler_t post_rerank; + server_http_context::handler_t get_lora_adapters; + server_http_context::handler_t post_lora_adapters; +private: + std::unique_ptr<server_res_generator> handle_completions_impl( + const server_http_req & req, + server_task_type type, + const json & data, + const std::vector<raw_buffer> & files, + task_response_type res_type); + std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot); + std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot); + std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot); + std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type); + + // using unique_ptr to allow late initialization of const + std::unique_ptr<const server_context_meta> meta; + + const common_params & params; + const server_context_impl & ctx_server; + + server_queue & queue_tasks; + server_response & queue_results; + std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false); +}; |
