#pragma once

#include "server-http.h"
#include "server-task.h"
#include "server-queue.h"

#include <nlohmann/json_fwd.hpp>

#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <vector>
  9
 10struct server_context_impl; // private implementation
 11
 12struct server_context_meta {
 13    std::string build_info;
 14    std::string model_name;
 15    std::string model_path;
 16    bool has_mtmd;
 17    bool has_inp_image;
 18    bool has_inp_audio;
 19    json json_webui_settings;
 20    int slot_n_ctx;
 21    enum llama_pooling_type pooling_type;
 22
 23    // chat params
 24    server_chat_params & chat_params;
 25    std::map<std::string, bool> chat_template_caps;
 26
 27    // tokens
 28    std::string bos_token_str;
 29    std::string eos_token_str;
 30    llama_token fim_pre_token;
 31    llama_token fim_sub_token;
 32    llama_token fim_mid_token;
 33
 34    // model meta
 35    enum llama_vocab_type model_vocab_type;
 36    int32_t model_vocab_n_tokens;
 37    int32_t model_n_ctx_train;
 38    int32_t model_n_embd_inp;
 39    uint64_t model_n_params;
 40    uint64_t model_size;
 41};
 42
 43struct server_context {
 44    std::unique_ptr<server_context_impl> impl;
 45
 46    server_context();
 47    ~server_context();
 48
 49    // load the model and initialize llama_context
 50    // returns true on success
 51    bool load_model(const common_params & params);
 52
 53    // this function will block main thread until termination
 54    void start_loop();
 55
 56    // terminate main loop (will unblock start_loop)
 57    void terminate();
 58
 59    // get the underlaying llama_context, can return nullptr if sleeping
 60    // not thread-safe, should only be used from the main thread
 61    llama_context * get_llama_context() const;
 62
 63    // get a new response reader, used by CLI application
 64    server_response_reader get_response_reader();
 65
 66    // get server metadata (read-only), can only be called after load_model()
 67    // not thread-safe, should only be used from the main thread
 68    server_context_meta get_meta() const;
 69};
 70
 71
 72// forward declarations
 73struct server_res_generator;
 74
 75struct server_routes {
 76    server_routes(const common_params & params, server_context & ctx_server);
 77
 78    void init_routes();
 79
 80    // note: this is not thread-safe and can only when ctx_http.is_ready is false
 81    void update_meta(const server_context & ctx_server) {
 82        this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
 83    }
 84
 85    // handlers using lambda function, so that they can capture `this` without `std::bind`
 86    // they won't be called until ctx_http.is_ready is set to true
 87    server_http_context::handler_t get_health;
 88    server_http_context::handler_t get_metrics;
 89    server_http_context::handler_t get_slots;
 90    server_http_context::handler_t post_slots;
 91    server_http_context::handler_t get_props;
 92    server_http_context::handler_t post_props;
 93    server_http_context::handler_t get_api_show;
 94    server_http_context::handler_t post_infill;
 95    server_http_context::handler_t post_completions;
 96    server_http_context::handler_t post_completions_oai;
 97    server_http_context::handler_t post_chat_completions;
 98    server_http_context::handler_t post_responses_oai;
 99    server_http_context::handler_t post_anthropic_messages;
100    server_http_context::handler_t post_anthropic_count_tokens;
101    server_http_context::handler_t post_apply_template;
102    server_http_context::handler_t get_models;
103    server_http_context::handler_t post_tokenize;
104    server_http_context::handler_t post_detokenize;
105    server_http_context::handler_t post_embeddings;
106    server_http_context::handler_t post_embeddings_oai;
107    server_http_context::handler_t post_rerank;
108    server_http_context::handler_t get_lora_adapters;
109    server_http_context::handler_t post_lora_adapters;
110private:
111    std::unique_ptr<server_res_generator> handle_completions_impl(
112            const server_http_req & req,
113            server_task_type type,
114            const json & data,
115            const std::vector<raw_buffer> & files,
116            task_response_type res_type);
117    std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
118    std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
119    std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
120    std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
121
122    // using unique_ptr to allow late initialization of const
123    std::unique_ptr<const server_context_meta> meta;
124
125    const common_params & params;
126    const server_context_impl & ctx_server;
127
128    server_queue & queue_tasks;
129    server_response & queue_results;
130    std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
131};