author    Mitja Felicijan <mitja.felicijan@gmail.com>    2026-02-12 20:57:17 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com>    2026-02-12 20:57:17 +0100
commit    b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree      211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tools/server/server-context.h
download  llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/tools/server/server-context.h')
-rw-r--r--    llama.cpp/tools/server/server-context.h    136
1 file changed, 136 insertions, 0 deletions
diff --git a/llama.cpp/tools/server/server-context.h b/llama.cpp/tools/server/server-context.h
new file mode 100644
index 0000000..c0b5d37
--- /dev/null
+++ b/llama.cpp/tools/server/server-context.h
@@ -0,0 +1,136 @@
+#pragma once
+
+#include "server-http.h"
+#include "server-task.h"
+#include "server-queue.h"
+
+#include <nlohmann/json_fwd.hpp>
+
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+struct server_context_impl; // private implementation
+
+struct server_context_meta {
+ std::string build_info;
+ std::string model_name;
+ std::string model_path;
+ bool has_mtmd;
+ bool has_inp_image;
+ bool has_inp_audio;
+ json json_webui_settings;
+ int slot_n_ctx;
+ enum llama_pooling_type pooling_type;
+
+ // chat params
+ server_chat_params & chat_params;
+ std::map<std::string, bool> chat_template_caps;
+
+ // tokens
+ std::string bos_token_str;
+ std::string eos_token_str;
+ llama_token fim_pre_token;
+ llama_token fim_sub_token;
+ llama_token fim_mid_token;
+
+ // model meta
+ enum llama_vocab_type model_vocab_type;
+ int32_t model_vocab_n_tokens;
+ int32_t model_n_ctx_train;
+ int32_t model_n_embd_inp;
+ uint64_t model_n_params;
+ uint64_t model_size;
+};
+
+struct server_context {
+ std::unique_ptr<server_context_impl> impl;
+
+ server_context();
+ ~server_context();
+
+ // load the model and initialize llama_context
+ // returns true on success
+ bool load_model(const common_params & params);
+
+ // this function will block the main thread until termination
+ void start_loop();
+
+ // terminate the main loop (will unblock start_loop)
+ void terminate();
+
+ // get the underlying llama_context, can return nullptr if sleeping
+ // not thread-safe, should only be used from the main thread
+ llama_context * get_llama_context() const;
+
+ // get a new response reader, used by CLI application
+ server_response_reader get_response_reader();
+
+ // get server metadata (read-only), can only be called after load_model()
+ // not thread-safe, should only be used from the main thread
+ server_context_meta get_meta() const;
+};
+
+
+// forward declarations
+struct server_res_generator;
+
+struct server_routes {
+ server_routes(const common_params & params, server_context & ctx_server);
+
+ void init_routes();
+
+ // note: this is not thread-safe and can only be called when ctx_http.is_ready is false
+ void update_meta(const server_context & ctx_server) {
+ this->meta = std::make_unique<server_context_meta>(ctx_server.get_meta());
+ }
+
+ // handlers are lambda functions, so that they can capture `this` without `std::bind`
+ // they won't be called until ctx_http.is_ready is set to true
+ server_http_context::handler_t get_health;
+ server_http_context::handler_t get_metrics;
+ server_http_context::handler_t get_slots;
+ server_http_context::handler_t post_slots;
+ server_http_context::handler_t get_props;
+ server_http_context::handler_t post_props;
+ server_http_context::handler_t get_api_show;
+ server_http_context::handler_t post_infill;
+ server_http_context::handler_t post_completions;
+ server_http_context::handler_t post_completions_oai;
+ server_http_context::handler_t post_chat_completions;
+ server_http_context::handler_t post_responses_oai;
+ server_http_context::handler_t post_anthropic_messages;
+ server_http_context::handler_t post_anthropic_count_tokens;
+ server_http_context::handler_t post_apply_template;
+ server_http_context::handler_t get_models;
+ server_http_context::handler_t post_tokenize;
+ server_http_context::handler_t post_detokenize;
+ server_http_context::handler_t post_embeddings;
+ server_http_context::handler_t post_embeddings_oai;
+ server_http_context::handler_t post_rerank;
+ server_http_context::handler_t get_lora_adapters;
+ server_http_context::handler_t post_lora_adapters;
+private:
+ std::unique_ptr<server_res_generator> handle_completions_impl(
+ const server_http_req & req,
+ server_task_type type,
+ const json & data,
+ const std::vector<raw_buffer> & files,
+ task_response_type res_type);
+ std::unique_ptr<server_res_generator> handle_slots_save(const server_http_req & req, int id_slot);
+ std::unique_ptr<server_res_generator> handle_slots_restore(const server_http_req & req, int id_slot);
+ std::unique_ptr<server_res_generator> handle_slots_erase(const server_http_req &, int id_slot);
+ std::unique_ptr<server_res_generator> handle_embeddings_impl(const server_http_req & req, task_response_type res_type);
+
+ // using unique_ptr to allow late initialization of a const member
+ std::unique_ptr<const server_context_meta> meta;
+
+ const common_params & params;
+ const server_context_impl & ctx_server;
+
+ server_queue & queue_tasks;
+ server_response & queue_results;
+ std::unique_ptr<server_res_generator> create_response(bool bypass_sleep = false);
+};
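
As a usage note: the lifecycle implied by the header comments is load_model(), then start_loop() on the main thread, with terminate() called from elsewhere to unblock it. Below is a minimal sketch of that flow, assuming the caller populates common_params itself; this is an illustration inferred from the comments, not code from this commit.

#include <thread>

int main() {
    common_params params;
    // ... populate params (model path, context size, ...) ...

    server_context ctx;
    if (!ctx.load_model(params)) { // returns true on success
        return 1;
    }

    // terminate() may be called from another thread; it unblocks start_loop()
    std::thread stopper([&ctx]() {
        // ... wait for a shutdown condition, then:
        ctx.terminate();
    });

    ctx.start_loop(); // blocks the calling thread until terminate() is called
    stopper.join();
    return 0;
}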
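
The server_routes comments likewise imply a strict bring-up order: update_meta() is only valid while ctx_http.is_ready is false, and the handler_t lambdas are not invoked until it becomes true. A sketch of that order, under the assumption that handler registration with the HTTP context happens inside init_routes():

// illustrative bring-up order, inferred from the comments in the header
common_params params;
server_context ctx;
if (!ctx.load_model(params)) {
    // handle load failure
}

server_routes routes(params, ctx);
routes.update_meta(ctx); // only valid while ctx_http.is_ready is false
routes.init_routes();    // installs the handler_t lambdas

// ... mark the HTTP context ready, then run the main loop ...
ctx.start_loop();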