Diffstat (limited to 'llama.cpp/src/llama-graph.h')
-rw-r--r--  llama.cpp/src/llama-graph.h | 1021
1 file changed, 1021 insertions, 0 deletions
diff --git a/llama.cpp/src/llama-graph.h b/llama.cpp/src/llama-graph.h
new file mode 100644
index 0000000..1d69ff1
--- /dev/null
+++ b/llama.cpp/src/llama-graph.h
@@ -0,0 +1,1021 @@
+#pragma once
+
+#include "llama-arch.h"
+#include "llama-batch.h"
+#include "llama-hparams.h"
+#include "llama-adapter.h"
+
+#include <cstdint>
+#include <cstdlib> // getenv, atoi
+#include <vector>
+#include <memory>
+#include <set>
+#include <functional>
+#include <map>
+
+struct ggml_cgraph;
+struct ggml_context;
+struct ggml_tensor;
+
+struct llama_cparams;
+
+struct llama_memory_context_i;
+
+class llama_kv_cache_context;
+class llama_kv_cache_iswa_context;
+class llama_memory_recurrent_context;
+class llama_memory_hybrid_context;
+class llama_memory_hybrid_iswa_context;
+
+// certain models (typically multi-modal) can produce different types of graphs
+enum llm_graph_type {
+ LLM_GRAPH_TYPE_DEFAULT,
+ LLM_GRAPH_TYPE_ENCODER,
+ LLM_GRAPH_TYPE_DECODER,
+};
+
+enum llm_ffn_op_type {
+ LLM_FFN_SILU,
+ LLM_FFN_GELU,
+ LLM_FFN_RELU,
+ LLM_FFN_RELU_SQR,
+ LLM_FFN_SWIGLU,
+ LLM_FFN_GEGLU,
+ LLM_FFN_REGLU,
+ LLM_FFN_SWIGLU_OAI_MOE,
+};
+
+enum llm_ffn_gate_type {
+ LLM_FFN_SEQ,
+ LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
+};
+
+enum llm_norm_type {
+ LLM_NORM,
+ LLM_NORM_RMS,
+ LLM_NORM_GROUP,
+};
+
+// TODO: tmp - need something better to pass the data from the encoder to the decoder
+struct llama_cross {
+ // the output embeddings from the encoder as a ggml tensor
+ // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
+ // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
+ //ggml_tensor * t_embd = nullptr;
+
+ int64_t n_embd = 0;
+ int64_t n_enc = 0;
+
+ // embeddings data copied to host memory (tmp)
+ std::vector<float> v_embd;
+
+ // needed to construct the cross-attention mask in the decoder
+ std::vector<std::set<llama_seq_id>> seq_ids_enc;
+};
+
+struct llm_graph_params;
+
+//
+// llm_graph_input
+//
+
+class llm_graph_input_i {
+public:
+ llm_graph_input_i() {
+ const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+ debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+ }
+
+ virtual ~llm_graph_input_i() = default;
+
+ virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+ // return true if the input tensors that would result from the provided graph parameters
+ // are the same as the input tensors currently stored in the object
+ virtual bool can_reuse(const llm_graph_params & params) {
+ // returning false by default prevents reusing the graph when the reuse check
+ // for this input type has not been implemented yet
+ GGML_UNUSED(params);
+ return false;
+ }
+protected:
+ // env: LLAMA_GRAPH_INPUT_DEBUG
+ int debug = 0;
+};
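+
+// usage sketch (illustrative, not part of this header): the debug level is read from the
+// environment, so per-input logging can be enabled without recompiling, e.g.:
+//
+//   LLAMA_GRAPH_INPUT_DEBUG=1 ./llama-cli -m model.gguf -p "hello"
+//
+// derived inputs that participate in graph reuse override can_reuse(), typically by
+// comparing their stored shape parameters against the new params, roughly:
+//
+//   bool can_reuse(const llm_graph_params & params) override {
+//       return n_embd == params.hparams.n_embd &&       // same embedding size
+//              tokens->ne[0] == params.ubatch.n_tokens; // same batch size
+//   }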
+
+using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
+
+class llm_graph_input_embd : public llm_graph_input_i {
+public:
+ llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
+ virtual ~llm_graph_input_embd() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * tokens = nullptr; // I32 [n_batch]
+ ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
+
+ const int64_t n_embd = 0;
+};
+
+class llm_graph_input_pos : public llm_graph_input_i {
+public:
+ llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
+ virtual ~llm_graph_input_pos() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * pos = nullptr; // I32 [n_batch]
+
+ const uint32_t n_pos_per_embd = 1;
+};
+
+// temperature tuning, used by llama4
+class llm_graph_input_attn_temp : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+ : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
+ virtual ~llm_graph_input_attn_temp() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
+
+ const uint32_t n_attn_temp_floor_scale;
+ const float f_attn_temp_scale;
+ const float f_attn_temp_offset;
+};
+
+class llm_graph_input_pos_bucket : public llm_graph_input_i {
+public:
+ llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
+ virtual ~llm_graph_input_pos_bucket() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
+
+ const llama_hparams hparams;
+};
+
+class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
+public:
+ llm_graph_input_pos_bucket_kv(
+ const llama_hparams & hparams,
+ const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
+ virtual ~llm_graph_input_pos_bucket_kv() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
+
+ const llama_hparams hparams;
+
+ const llama_kv_cache_context * mctx;
+};
+
+class llm_graph_input_out_ids : public llm_graph_input_i {
+public:
+ llm_graph_input_out_ids(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+ virtual ~llm_graph_input_out_ids() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * out_ids = nullptr; // I32 [n_outputs]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const uint32_t n_outputs;
+};
+
+class llm_graph_input_mean : public llm_graph_input_i {
+public:
+ llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
+ virtual ~llm_graph_input_mean() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * mean = nullptr; // F32 [n_batch, n_batch]
+
+ const llama_cparams cparams;
+};
+
+class llm_graph_input_cls : public llm_graph_input_i {
+public:
+ llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
+ virtual ~llm_graph_input_cls() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * cls = nullptr; // I32 [n_batch]
+
+ const llama_cparams cparams;
+ const llm_arch arch;
+};
+
+class llm_graph_input_rs : public llm_graph_input_i {
+public:
+ llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
+ virtual ~llm_graph_input_rs() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * s_copy = nullptr; // I32 [n_rs]
+
+ // views of s_copy, computed once per graph
+ // and shared across layers which use build_rs
+ ggml_tensor * s_copy_main = nullptr; // I32 [n_seqs]
+ ggml_tensor * s_copy_extra = nullptr; // I32 [n_rs - n_seqs]
+
+ const llama_memory_recurrent_context * mctx;
+
+ // used in view offsets, need to match for valid graph reuse
+ uint32_t head;
+ int32_t rs_z;
+};
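+
+// the two views above are expected to be created once per graph, along the lines of
+// the following sketch (the offset argument of ggml_view_1d is in bytes):
+//
+//   s_copy_main  = ggml_view_1d(ctx, s_copy, n_seqs, 0);
+//   s_copy_extra = ggml_view_1d(ctx, s_copy, n_rs - n_seqs, n_seqs * ggml_element_size(s_copy));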
+
+class llm_graph_input_cross_embd : public llm_graph_input_i {
+public:
+ llm_graph_input_cross_embd(
+ const llama_cross * cross) : cross(cross) {}
+ virtual ~llm_graph_input_cross_embd() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * cross_embd = nullptr; // F32 [n_embd, n_outputs_enc]
+
+ const llama_cross * cross;
+};
+
+class llm_graph_input_attn_no_cache : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
+ hparams(hparams),
+ cparams(cparams) {
+ }
+ ~llm_graph_input_attn_no_cache() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+ ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
+
+ // n_tokens == n_batch
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+};
+
+class llm_graph_input_attn_kv : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_kv(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ ~llm_graph_input_attn_kv() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+ ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ // note: these have to be copies because, in order to reuse a graph, its inputs
+ // need to carry these parameters with them. otherwise they could point to freed
+ // llm_graph_params from a previous batch, causing a stack-use-after-return
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const llama_kv_cache_context * mctx;
+};
+
+// V-less input for the KV cache
+// ref: https://github.com/ggml-org/llama.cpp/pull/19067
+class llm_graph_input_attn_k : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_k(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ ~llm_graph_input_attn_k() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const llama_kv_cache_context * mctx;
+};
+
+class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_kv_iswa(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_kv_cache_iswa_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ ~llm_graph_input_attn_kv_iswa() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+ ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+ ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
+ ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }
+
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+ ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+ ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
+ ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ const llama_hparams hparams;
+ const llama_cparams cparams;
+
+ const llama_kv_cache_iswa_context * mctx;
+};
+
+class llm_graph_input_attn_cross : public llm_graph_input_i {
+public:
+ llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
+ ~llm_graph_input_attn_cross() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
+
+ ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+ ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+
+ const llama_cross * cross = nullptr;
+};
+
+class llm_graph_input_mem_hybrid : public llm_graph_input_i {
+public:
+ llm_graph_input_mem_hybrid(
+ const llama_cparams & cparams,
+ std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ cparams(cparams),
+ mctx(mctx) { }
+ virtual ~llm_graph_input_mem_hybrid() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;
+
+ llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
+
+ const llama_cparams cparams;
+
+ const llama_memory_hybrid_context * mctx;
+};
+
+class llm_graph_input_mem_hybrid_k : public llm_graph_input_i {
+public:
+ llm_graph_input_mem_hybrid_k(
+ const llama_cparams & cparams,
+ std::unique_ptr<llm_graph_input_attn_k> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ cparams(cparams),
+ mctx(mctx) { }
+ virtual ~llm_graph_input_mem_hybrid_k() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::unique_ptr<llm_graph_input_attn_k> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;
+
+ llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
+
+ const llama_cparams cparams;
+
+ const llama_memory_hybrid_context * mctx;
+};
+
+class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
+public:
+ llm_graph_input_mem_hybrid_iswa(
+ const llama_cparams & cparams,
+ std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
+ std::unique_ptr<llm_graph_input_rs> inp_rs,
+ const llama_memory_hybrid_iswa_context * mctx) :
+ inp_attn(std::move(inp_attn)),
+ inp_rs(std::move(inp_rs)),
+ cparams(cparams),
+ mctx(mctx) { }
+ virtual ~llm_graph_input_mem_hybrid_iswa() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
+ std::unique_ptr<llm_graph_input_rs> inp_rs;
+
+ llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
+ llm_graph_input_rs * get_recr() const { return inp_rs.get(); }
+
+ const llama_cparams cparams;
+
+ const llama_memory_hybrid_iswa_context * mctx;
+};
+
+class llm_graph_input_sampling : public llm_graph_input_i {
+public:
+ llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
+ samplers(std::move(samplers)) { }
+ virtual ~llm_graph_input_sampling() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+ bool can_reuse(const llm_graph_params & params) override;
+
+ std::map<llama_seq_id, llama_sampler *> samplers;
+};
+
+//
+// llm_graph_result
+//
+
+// these objects deliver the result from the graph build process back to the llama_context
+// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
+// specific data by calling the set_inputs() method
+// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
+// these are used by the llama_context to extract the relevant data, based on the compute parameters
+
+// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
+
+class llm_graph_result;
+
+struct llm_graph_params {
+ llm_arch arch = LLM_ARCH_UNKNOWN;
+
+ llama_hparams hparams;
+ llama_cparams cparams;
+
+ llama_ubatch ubatch; // note: intentionally make a copy
+
+ llm_graph_type gtype;
+
+ ggml_backend_sched_t sched;
+ ggml_backend_t backend_cpu;
+
+ const llama_adapter_cvec * cvec;
+ const llama_adapter_loras * loras;
+ const llama_memory_context_i * mctx;
+ const llama_cross * cross;
+
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
+ static bool samplers_equal(
+ const std::map<llama_seq_id, llama_sampler *> & lhs,
+ const std::map<llama_seq_id, llama_sampler *> & rhs) {
+ if (lhs.size() != rhs.size()) {
+ return false;
+ }
+ for (const auto & [seq_id, sampler] : lhs) {
+ auto it = rhs.find(seq_id);
+ if (it == rhs.end() || it->second != sampler) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ uint32_t n_outputs;
+
+ llm_graph_cb cb;
+
+ llm_graph_result * res;
+
+ // return true if the "other" params would result in a graph with the same topology as with the current params
+ // having the same topology allows us to reuse the graph in some cases
+ bool allow_reuse(const llm_graph_params & other) const {
+ // first check the ubatch
+ bool can_reuse_ubatch =
+ ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
+ ubatch.n_tokens == other.ubatch.n_tokens &&
+ ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
+ ubatch.n_seqs == other.ubatch.n_seqs &&
+ ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
+ (
+ (!ubatch.token && !other.ubatch.token) ||
+ (!ubatch.embd && !other.ubatch.embd)
+ );
+
+ // when we split the batch using "equal_seqs", we have to verify that the participating sequences are the same,
+ // because the set of attention streams would be different for different sequences
+ if (can_reuse_ubatch && ubatch.equal_seqs()) {
+ if (!ubatch.data) {
+ // if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
+ // therefore we cannot perform the sequence id check. this should normally never happen
+ can_reuse_ubatch = false;
+ } else {
+ for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+ can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
+ }
+ }
+ }
+
+ if (!can_reuse_ubatch) {
+ return false;
+ }
+
+ if (n_outputs != other.n_outputs) {
+ return false;
+ }
+
+ if (!samplers_equal(samplers, other.samplers)) {
+ return false;
+ }
+
+ if (samplers.size() > 0) {
+ if (!ubatch.data || !other.ubatch.data) {
+ return false;
+ }
+
+ // check that the outputs are the same for all samplers
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ if (ubatch.output[i] != other.ubatch.output[i] ||
+ ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
+ return false;
+ }
+ }
+ }
+
+ return
+ cparams.embeddings == other.cparams.embeddings &&
+ cparams.causal_attn == other.cparams.causal_attn &&
+ arch == other.arch &&
+ gtype == other.gtype &&
+ cvec == other.cvec &&
+ loras == other.loras &&
+ cross == other.cross;
+ }
+};
+
+class llm_graph_result {
+public:
+ llm_graph_result(int64_t max_nodes);
+
+ virtual ~llm_graph_result() = default;
+
+ ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
+ ggml_tensor * get_logits() const { return t_logits; }
+ ggml_tensor * get_embd() const { return t_embd; }
+ ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
+
+ ggml_cgraph * get_gf() const { return gf; }
+ ggml_context * get_ctx() const { return ctx_compute.get(); }
+
+ int64_t get_max_nodes() const;
+
+ void reset();
+
+ void set_inputs(const llama_ubatch * ubatch);
+ void set_outputs();
+
+ // try to update the existing graph result using the new graph parameters, in order to reuse it
+ // this can only be done if the graph resulting from the new parameters would be identical to
+ // the existing graph. in that case, we simply update the memory contexts of the graph's input
+ // tensors and can reuse it for another computation
+ // returns true if the graph was updated and can be reused
+ bool can_reuse(const llm_graph_params & params);
+
+ llm_graph_input_i * add_input(llm_graph_input_ptr input);
+
+ void set_params(const llm_graph_params & params);
+
+ // important graph nodes
+ ggml_tensor * t_inp_tokens = nullptr;
+ ggml_tensor * t_inp_embd = nullptr; // [n_embd_inp, n_tokens]
+ ggml_tensor * t_logits = nullptr;
+ ggml_tensor * t_embd = nullptr;
+ ggml_tensor * t_embd_pooled = nullptr;
+
+ std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
+ std::map<llama_seq_id, ggml_tensor *> t_candidates;
+ std::map<llama_seq_id, ggml_tensor *> t_sampled;
+ std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
+
+ std::vector<llm_graph_input_ptr> inputs;
+
+ ggml_context_ptr ctx_compute;
+
+ // memory buffers used to evaluate the model
+ std::vector<uint8_t> buf_compute_meta;
+
+ ggml_cgraph * gf;
+
+ int64_t max_nodes;
+
+private:
+ // keep a copy of the previous graph parameters
+ // we will use this to determine whether the graph can be reused by comparing them with the new parameters
+ // note: these are updated after constructing the new graph
+ llm_graph_params params;
+
+ // env: LLAMA_GRAPH_RESULT_DEBUG
+ int debug = 0;
+};
+
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
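+
+// reuse flow sketch (what a caller such as llama_context is expected to do; the
+// surrounding logic is illustrative):
+//
+//   if (res->can_reuse(params)) {
+//       res->set_inputs(&ubatch); // same topology - just bind the new batch data
+//   } else {
+//       res->reset();
+//       // ... build a new graph with llm_graph_context using the new params ...
+//   }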
+
+//
+// llm_graph_context
+//
+
+// used in build_rs to properly order writes and avoid unnecessary copies
+using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
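+
+// by default, build_rs gathers states with ggml_get_rows; a model can pass a custom
+// callback when the gathered states need extra processing, e.g. (sketch):
+//
+//   const llm_graph_get_rows_fn rows_fn = [](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+//       return ggml_get_rows(ctx, states, ids); // model-specific ops would go here
+//   };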
+
+struct llm_graph_context {
+ const llm_arch arch;
+
+ const llama_hparams & hparams;
+ const llama_cparams & cparams;
+ const llama_ubatch & ubatch;
+
+ const int64_t n_embd;
+ const int64_t n_layer;
+ const int64_t n_rot;
+ const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
+ const int64_t n_head;
+ const int64_t n_head_kv;
+ const int64_t n_embd_head_k;
+ const int64_t n_embd_k_gqa;
+ const int64_t n_embd_head_v;
+ const int64_t n_embd_v_gqa;
+ const int64_t n_expert;
+ const int64_t n_expert_used;
+
+ const float freq_base;
+ const float freq_scale;
+ const float ext_factor;
+ const float attn_factor;
+ const float beta_fast;
+ const float beta_slow;
+ const float norm_eps;
+ const float norm_rms_eps;
+
+ const int64_t n_tokens;
+ const int64_t n_outputs;
+ const int32_t n_ctx_orig; // yarn
+
+ const enum llama_pooling_type pooling_type;
+ const enum llama_rope_type rope_type;
+
+ ggml_backend_sched_t sched;
+
+ ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+
+ const llama_adapter_cvec * cvec;
+ const llama_adapter_loras * loras;
+ const llama_memory_context_i * mctx;
+ const llama_cross * cross;
+
+ std::map<llama_seq_id, llama_sampler *> samplers;
+
+ const llm_graph_cb & cb_func;
+
+ llm_graph_result * res;
+
+ ggml_context * ctx0 = nullptr;
+ ggml_cgraph * gf = nullptr;
+
+ llm_graph_context(const llm_graph_params & params);
+ virtual ~llm_graph_context() = default;
+
+ void cb(ggml_tensor * cur, const char * name, int il) const;
+
+ //
+ // common
+ //
+
+ ggml_tensor * build_cvec(
+ ggml_tensor * cur,
+ int il) const;
+
+ // do mat_mul, optionally applying lora
+ ggml_tensor * build_lora_mm(
+ ggml_tensor * w,
+ ggml_tensor * cur) const;
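+
+ // typical call from a model graph (sketch; tensor names are illustrative):
+ //
+ //   ggml_tensor * q_cur = build_lora_mm(model.layers[il].wq, cur);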
+
+ // do mat_mul_id, optionally applying lora
+ ggml_tensor * build_lora_mm_id(
+ ggml_tensor * w, // ggml_tensor * as
+ ggml_tensor * cur, // ggml_tensor * b
+ ggml_tensor * ids) const;
+
+ ggml_tensor * build_norm(
+ ggml_tensor * cur,
+ ggml_tensor * mw,
+ ggml_tensor * mb,
+ llm_norm_type type,
+ int il) const;
+
+ ggml_tensor * build_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * up,
+ ggml_tensor * up_b,
+ ggml_tensor * up_s,
+ ggml_tensor * gate,
+ ggml_tensor * gate_b,
+ ggml_tensor * gate_s,
+ ggml_tensor * down,
+ ggml_tensor * down_b,
+ ggml_tensor * down_s,
+ ggml_tensor * act_scales,
+ llm_ffn_op_type type_op,
+ llm_ffn_gate_type type_gate,
+ int il) const;
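+
+ // typical call for a SwiGLU FFN with a parallel gate and no biases/scales (sketch):
+ //
+ //   cur = build_ffn(cur,
+ //           model.layers[il].ffn_up,   NULL, NULL,
+ //           model.layers[il].ffn_gate, NULL, NULL,
+ //           model.layers[il].ffn_down, NULL, NULL,
+ //           NULL, // act_scales
+ //           LLM_FFN_SILU, LLM_FFN_PAR, il);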
+
+ // build MoE FFN without bias tensors
+ ggml_tensor * build_moe_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * gate_inp,
+ ggml_tensor * up_exps,
+ ggml_tensor * gate_exps,
+ ggml_tensor * down_exps,
+ ggml_tensor * exp_probs_b,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ bool scale_w,
+ float w_scale,
+ llama_expert_gating_func_type gating_op,
+ int il,
+ ggml_tensor * probs_in = nullptr) const;
+
+ ggml_tensor * build_moe_ffn(
+ ggml_tensor * cur,
+ ggml_tensor * gate_inp,
+ ggml_tensor * gate_inp_b,
+ ggml_tensor * up_exps,
+ ggml_tensor * up_exps_b,
+ ggml_tensor * gate_exps,
+ ggml_tensor * gate_exps_b,
+ ggml_tensor * down_exps,
+ ggml_tensor * down_exps_b,
+ ggml_tensor * exp_probs_b,
+ int64_t n_expert,
+ int64_t n_expert_used,
+ llm_ffn_op_type type_op,
+ bool norm_w,
+ bool scale_w,
+ float w_scale,
+ llama_expert_gating_func_type gating_op,
+ int il,
+ ggml_tensor * probs_in = nullptr) const;
+
+ //
+ // inputs
+ //
+
+ ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
+ ggml_tensor * build_inp_pos() const;
+ ggml_tensor * build_inp_attn_scale() const;
+ ggml_tensor * build_inp_out_ids() const;
+ ggml_tensor * build_inp_mean() const;
+ ggml_tensor * build_inp_cls() const;
+
+ ggml_tensor * build_inp_cross_embd() const;
+ ggml_tensor * build_inp_pos_bucket_enc() const;
+ ggml_tensor * build_inp_pos_bucket_dec() const;
+ ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
+
+ //
+ // attention
+ //
+
+ ggml_tensor * build_attn_mha(
+ ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
+ ggml_tensor * kq_b,
+ ggml_tensor * kq_mask,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_no_cache * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_kv * build_attn_inp_kv() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_kv * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
+ float kq_scale,
+ int il) const;
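+
+ // typical self-attention call (sketch; tensor names are illustrative, and
+ // kq_b, sinks and v_mla are often nullptr):
+ //
+ //   cur = build_attn(inp_attn,
+ //           model.layers[il].wo, model.layers[il].bo,
+ //           Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+ //           1.0f/sqrtf(float(n_embd_head)), il);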
+
+ llm_graph_input_attn_k * build_attn_inp_k() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_k * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
+
+ // note: if k_cur or v_cur are not provided, they will not be stored in the memory
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_kv_iswa * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ llm_graph_input_attn_cross * build_attn_inp_cross() const;
+
+ ggml_tensor * build_attn(
+ llm_graph_input_attn_cross * inp,
+ ggml_tensor * wo,
+ ggml_tensor * wo_b,
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ ggml_tensor * kq_b,
+ ggml_tensor * sinks, // [n_head_q]
+ ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
+
+ //
+ // recurrent
+ //
+
+ // TODO: move this implementation to llama_memory_recurrent.
+ // this is analogous to llama_kv_cache::cpy_k / cpy_v
+ // when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. this would likely require splitting
+ // the implementation into 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
+ // `llama_memory_recurrent`
+ ggml_tensor * build_rs(
+ ggml_tensor * s,
+ ggml_tensor * state_copy_main,
+ ggml_tensor * state_copy_extra,
+ int32_t state_size,
+ int32_t n_seqs,
+ uint32_t n_rs,
+ uint32_t rs_head,
+ uint32_t rs_size,
+ int32_t rs_zero,
+ const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
+
+ llm_graph_input_rs * build_rs_inp() const;
+
+ ggml_tensor * build_rs(
+ llm_graph_input_rs * inp,
+ ggml_tensor * s,
+ int32_t state_size,
+ int32_t n_seqs,
+ const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
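+
+ // typical use in a recurrent layer (sketch; s would be the layer's full state tensor):
+ //
+ //   auto * inp = build_rs_inp();
+ //   ggml_tensor * state = build_rs(inp, s, state_size, n_seqs);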
+
+ ggml_tensor * build_rwkv_token_shift_load(
+ llm_graph_input_rs * inp,
+ const llama_ubatch & ubatch,
+ int il) const;
+
+ ggml_tensor * build_rwkv_token_shift_store(
+ ggml_tensor * token_shift,
+ const llama_ubatch & ubatch,
+ int il) const;
+
+ //
+ // hybrid
+ //
+
+ llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
+ llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const;
+
+ llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
+
+ //
+ // pooling
+ //
+
+ void build_pooling(
+ ggml_tensor * cls,
+ ggml_tensor * cls_b,
+ ggml_tensor * cls_out,
+ ggml_tensor * cls_out_b) const;
+
+ //
+ // sampling (backend sampling)
+ //
+
+ void build_sampling() const;
+
+ //
+ // dense (out)
+ //
+
+ void build_dense_out(
+ ggml_tensor * dense_2,
+ ggml_tensor * dense_3) const;
+};
+
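+// maps the relative position between two tokens to a bucket index for T5-style relative
+// position bias: nearby offsets get individual buckets, larger offsets are binned
+// logarithmically; with bidirectional=true (encoder), negative and positive offsets use
+// separate bucket ranges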
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);