1#include "server-http.h"
2#include "server-task.h"
3#include "server-queue.h"
4
5#include <nlohmann/json_fwd.hpp>
6
7#include <cstddef>
8#include <memory>
9
// private implementation type: only forward-declared here, held by server_context through a std::unique_ptr
struct server_context_impl;
11
12struct server_context_meta {
13 std::string build_info;
14 std::string model_name;
15 std::string model_path;
16 bool has_mtmd;
17 bool has_inp_image;
18 bool has_inp_audio;
19 json json_webui_settings;
20 int slot_n_ctx;
21 enum llama_pooling_type pooling_type;
22
23 // chat params
24 server_chat_params & chat_params;
25 std::map<std::string, bool> chat_template_caps;
26
27 // tokens
28 std::string bos_token_str;
29 std::string eos_token_str;
30 llama_token fim_pre_token;
31 llama_token fim_sub_token;
32 llama_token fim_mid_token;
33
34 // model meta
35 enum llama_vocab_type model_vocab_type;
36 int32_t model_vocab_n_tokens;
37 int32_t model_n_ctx_train;
38 int32_t model_n_embd_inp;
39 uint64_t model_n_params;
40 uint64_t model_size;
41};
42
43struct server_context {
44 std::unique_ptr<server_context_impl> impl;
45
46 server_context();
47 ~server_context();
48
49 // load the model and initialize llama_context
50 // returns true on success
51 bool load_model(const common_params & params);
52
53 // this function will block main thread until termination
54 void start_loop();
55
56 // terminate main loop (will unblock start_loop)
57 void terminate();
58
59 // get the underlaying llama_context, can return nullptr if sleeping
60 // not thread-safe, should only be used from the main thread
61 llama_context * get_llama_context() const;
62
63 // get a new response reader, used by CLI application
64 server_response_reader get_response_reader();
65
66 // get server metadata (read-only), can only be called after load_model()
67 // not thread-safe, should only be used from the main thread
68 server_context_meta get_meta() const;
69};
70
71
// forward declaration: server_routes only refers to this type through std::unique_ptr
struct server_res_generator;
74
75struct server_routes {
76 server_routes(const common_params & params, server_context & ctx_server);
77
78 void init_routes();
79
80 // note: this is not thread-safe and can only when ctx_http.is_ready is false
81 void update_meta(const server_context & ctx_server) {
82 this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
83 }
84
85 // handlers using lambda function, so that they can capture `this` without `std::bind`
86 // they won't be called until ctx_http.is_ready is set to true
87 server_http_context::handler_t get_health;
88 server_http_context::handler_t get_metrics;
89 server_http_context::handler_t get_slots;
90 server_http_context::handler_t post_slots;
91 server_http_context::handler_t get_props;
92 server_http_context::handler_t post_props;
93 server_http_context::handler_t get_api_show;
94 server_http_context::handler_t post_infill;
95 server_http_context::handler_t post_completions;
96 server_http_context::handler_t post_completions_oai;
97 server_http_context::handler_t post_chat_completions;
98 server_http_context::handler_t post_responses_oai;
99 server_http_context::handler_t post_anthropic_messages;
100 server_http_context::handler_t post_anthropic_count_tokens;
101 server_http_context::handler_t post_apply_template;
102 server_http_context::handler_t get_models;
103 server_http_context::handler_t post_tokenize;
104 server_http_context::handler_t post_detokenize;
105 server_http_context::handler_t post_embeddings;
106 server_http_context::handler_t post_embeddings_oai;
107 server_http_context::handler_t post_rerank;
108 server_http_context::handler_t get_lora_adapters;
109 server_http_context::handler_t post_lora_adapters;
110private:
111 std::unique_ptr<server_res_generator> handle_completions_impl(
112 const server_http_req & req,
113 server_task_type type,
114 const json & data,
115 const std::vector<raw_buffer> & files,
116 task_response_type res_type);
117 std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
118 std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
119 std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
120 std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
121
122 // using unique_ptr to allow late initialization of const
123 std::unique_ptr<const server_context_meta> meta;
124
125 const common_params & params;
126 const server_context_impl & ctx_server;
127
128 server_queue & queue_tasks;
129 server_response & queue_results;
130 std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
131};