Diffstat (limited to 'llama.cpp/src/llama-graph.h')
| -rw-r--r-- | llama.cpp/src/llama-graph.h | 1021 |
1 file changed, 1021 insertions, 0 deletions
diff --git a/llama.cpp/src/llama-graph.h b/llama.cpp/src/llama-graph.h
new file mode 100644
index 0000000..1d69ff1
--- /dev/null
+++ b/llama.cpp/src/llama-graph.h
@@ -0,0 +1,1021 @@
+#pragma once
+
+#include "llama-arch.h"
+#include "llama-batch.h"
+#include "llama-hparams.h"
+#include "llama-adapter.h"
+
+#include <cstdint>
+#include <vector>
+#include <memory>
+#include <set>
+#include <functional>
+#include <map>
+
+struct ggml_cgraph;
+struct ggml_context;
+struct ggml_tensor;
+
+struct llama_cparams;
+
+struct llama_memory_context_i;
+
+class llama_kv_cache_context;
+class llama_kv_cache_iswa_context;
+class llama_memory_recurrent_context;
+class llama_memory_hybrid_context;
+class llama_memory_hybrid_iswa_context;
+
+// certain models (typically multi-modal) can produce different types of graphs
+enum llm_graph_type {
+    LLM_GRAPH_TYPE_DEFAULT,
+    LLM_GRAPH_TYPE_ENCODER,
+    LLM_GRAPH_TYPE_DECODER,
+};
+
+enum llm_ffn_op_type {
+    LLM_FFN_SILU,
+    LLM_FFN_GELU,
+    LLM_FFN_RELU,
+    LLM_FFN_RELU_SQR,
+    LLM_FFN_SWIGLU,
+    LLM_FFN_GEGLU,
+    LLM_FFN_REGLU,
+    LLM_FFN_SWIGLU_OAI_MOE,
+};
+
+enum llm_ffn_gate_type {
+    LLM_FFN_SEQ,
+    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
+};
+
+enum llm_norm_type {
+    LLM_NORM,
+    LLM_NORM_RMS,
+    LLM_NORM_GROUP,
+};
+
+// TODO: tmp - need something better to pass the data from the encoder to the decoder
+struct llama_cross {
+    // the output embeddings from the encoder as a ggml tensor
+    // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
+    // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
+    //ggml_tensor * t_embd = nullptr;
+
+    int64_t n_embd = 0;
+    int64_t n_enc  = 0;
+
+    // embeddings data copied to host memory (tmp)
+    std::vector<float> v_embd;
+
+    // needed to construct the cross-attention mask in the decoder
+    std::vector<std::set<llama_seq_id>> seq_ids_enc;
+};
+
+struct llm_graph_params;
+
+//
+// llm_graph_input
+//
+
+class llm_graph_input_i {
+public:
+    llm_graph_input_i() {
+        const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+        debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+    }
+
+    virtual ~llm_graph_input_i() = default;
+
+    virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+    // return true if the resulting input tensors using the provided graph parameters would be
+    // the same as the previous input tensors that we have currently stored in the object
+    virtual bool can_reuse(const llm_graph_params & params) {
+        // returning false here by default will prevent reusing the graph if the check
+        // for the input type has not been implemented yet
+        GGML_UNUSED(params);
+        return false;
+    }
+protected:
+    // env: LLAMA_GRAPH_INPUT_DEBUG
+    int debug = 0;
+};
+
+using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
+
+class llm_graph_input_embd : public llm_graph_input_i {
+public:
+    llm_graph_input_embd(int64_t n_embd) : n_embd(n_embd) {}
+    virtual ~llm_graph_input_embd() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * tokens = nullptr; // I32 [n_batch]
+    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
+
+    const int64_t n_embd = 0;
+};
+
+class llm_graph_input_pos : public llm_graph_input_i {
+public:
+    llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
+    virtual ~llm_graph_input_pos() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * pos = nullptr; // I32 [n_batch]
+
+    const uint32_t n_pos_per_embd = 1;
+};
+
+// temperature tuning, used by llama4
+class llm_graph_input_attn_temp : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
+    virtual ~llm_graph_input_attn_temp() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
+
+    const uint32_t n_attn_temp_floor_scale;
+    const float    f_attn_temp_scale;
+    const float    f_attn_temp_offset;
+};
+
+class llm_graph_input_pos_bucket : public llm_graph_input_i {
+public:
+    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
+    virtual ~llm_graph_input_pos_bucket() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
+
+    const llama_hparams hparams;
+};
+
+class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
+public:
+    llm_graph_input_pos_bucket_kv(
+            const llama_hparams & hparams,
+            const llama_kv_cache_context * mctx) : hparams(hparams), mctx(mctx) {}
+    virtual ~llm_graph_input_pos_bucket_kv() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
+
+    const llama_hparams hparams;
+
+    const llama_kv_cache_context * mctx;
+};
+
+class llm_graph_input_out_ids : public llm_graph_input_i {
+public:
+    llm_graph_input_out_ids(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
+    virtual ~llm_graph_input_out_ids() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * out_ids; // I32 [n_outputs]
+
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+
+    const uint32_t n_outputs;
+};
+
+class llm_graph_input_mean : public llm_graph_input_i {
+public:
+    llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
+    virtual ~llm_graph_input_mean() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * mean; // F32 [n_batch, n_batch]
+
+    const llama_cparams cparams;
+};
+
+class llm_graph_input_cls : public llm_graph_input_i {
+public:
+    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
+    virtual ~llm_graph_input_cls() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cls; // I32 [n_batch]
+
+    const llama_cparams cparams;
+    const llm_arch arch;
+};
+
+class llm_graph_input_rs : public llm_graph_input_i {
+public:
+    llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
+    virtual ~llm_graph_input_rs() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * s_copy; // I32 [n_rs]
+
+    // views of s_copy, computed once per graph
+    // and shared across layers which use build_rs
+    ggml_tensor * s_copy_main;  // I32 [n_seqs]
+    ggml_tensor * s_copy_extra; // I32 [n_rs - n_seqs]
+
+    const llama_memory_recurrent_context * mctx;
+
+    // used in view offsets, need to match for valid graph reuse
+    uint32_t head;
+    int32_t  rs_z;
+};
+
+class llm_graph_input_cross_embd : public llm_graph_input_i {
+public:
+    llm_graph_input_cross_embd(
+            const llama_cross * cross) : cross(cross) {}
+    virtual ~llm_graph_input_cross_embd() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
+
+    const llama_cross * cross;
+};
+
+class llm_graph_input_attn_no_cache : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
+        hparams(hparams),
+        cparams(cparams) {
+    }
+    ~llm_graph_input_attn_no_cache() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
+
+    // n_tokens == n_batch
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+};
+
+class llm_graph_input_attn_kv : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_context * mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        mctx(mctx) {
+    }
+    ~llm_graph_input_attn_kv() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+    ggml_tensor * get_v_idxs() const { return self_v_idxs; }
+
+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    //       need to carry these parameters with them. otherwise, they can point to freed
+    //       llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+
+    const llama_kv_cache_context * mctx;
+};
+
+// V-less input for the KV cache
+// ref: https://github.com/ggml-org/llama.cpp/pull/19067
+class llm_graph_input_attn_k : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_k(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_context * mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        mctx(mctx) {
+    }
+    ~llm_graph_input_attn_k() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+
+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+
+    const llama_kv_cache_context * mctx;
+};
+
+class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_iswa_context * mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        mctx(mctx) {
+    }
+    ~llm_graph_input_attn_kv_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs()     const { return self_k_idxs; }
+    ggml_tensor * get_v_idxs()     const { return self_v_idxs; }
+    ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
+    ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }
+
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
+
+    ggml_tensor * self_k_idxs     = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs     = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+    ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
+    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
+
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+
+    const llama_kv_cache_iswa_context * mctx;
+};
+
+class llm_graph_input_attn_cross : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
+    ~llm_graph_input_attn_cross() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }
+
+    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
+
+    const llama_cross * cross = nullptr;
+};
+
+class llm_graph_input_mem_hybrid : public llm_graph_input_i {
+public:
+    llm_graph_input_mem_hybrid(
+            const llama_cparams & cparams,
+            std::unique_ptr<llm_graph_input_attn_kv> inp_attn,
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
+        inp_attn(std::move(inp_attn)),
+        inp_rs(std::move(inp_rs)),
+        cparams(cparams),
+        mctx(mctx) { }
+    virtual ~llm_graph_input_mem_hybrid() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    std::unique_ptr<llm_graph_input_attn_kv> inp_attn;
+    std::unique_ptr<llm_graph_input_rs>      inp_rs;
+
+    llm_graph_input_attn_kv * get_attn() const { return inp_attn.get(); }
+    llm_graph_input_rs      * get_recr() const { return inp_rs.get(); }
+
+    const llama_cparams cparams;
+
+    const llama_memory_hybrid_context * mctx;
+};
+
+class llm_graph_input_mem_hybrid_k : public llm_graph_input_i {
+public:
+    llm_graph_input_mem_hybrid_k(
+            const llama_cparams & cparams,
+            std::unique_ptr<llm_graph_input_attn_k> inp_attn,
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_context * mctx) :
+        inp_attn(std::move(inp_attn)),
+        inp_rs(std::move(inp_rs)),
+        cparams(cparams),
+        mctx(mctx) { }
+    virtual ~llm_graph_input_mem_hybrid_k() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    std::unique_ptr<llm_graph_input_attn_k> inp_attn;
+    std::unique_ptr<llm_graph_input_rs>     inp_rs;
+
+    llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); }
+    llm_graph_input_rs     * get_recr() const { return inp_rs.get(); }
+
+    const llama_cparams cparams;
+
+    const llama_memory_hybrid_context * mctx;
+};
+
+class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_mem_hybrid_iswa(
+            const llama_cparams & cparams,
+            std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn,
+            std::unique_ptr<llm_graph_input_rs> inp_rs,
+            const llama_memory_hybrid_iswa_context * mctx) :
+        inp_attn(std::move(inp_attn)),
+        inp_rs(std::move(inp_rs)),
+        cparams(cparams),
+        mctx(mctx) { }
+    virtual ~llm_graph_input_mem_hybrid_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    std::unique_ptr<llm_graph_input_attn_kv_iswa> inp_attn;
+    std::unique_ptr<llm_graph_input_rs>           inp_rs;
+
+    llm_graph_input_attn_kv_iswa * get_attn() const { return inp_attn.get(); }
+    llm_graph_input_rs           * get_recr() const { return inp_rs.get(); }
+
+    const llama_cparams cparams;
+
+    const llama_memory_hybrid_iswa_context * mctx;
+};
+
+class llm_graph_input_sampling : public llm_graph_input_i {
+public:
+    llm_graph_input_sampling(std::map<llama_seq_id, llama_sampler *> samplers) :
+        samplers(std::move(samplers)) { }
+    virtual ~llm_graph_input_sampling() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+    bool can_reuse(const llm_graph_params & params) override;
+
+    std::map<llama_seq_id, llama_sampler *> samplers;
+};
+
+//
+// llm_graph_result
+//
+
+// these objects deliver the result from the graph build process back to the llama_context
+// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
+// specific data, by calling the set_inputs() method
+// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
+// these are used by the llama_context to extract the relevant data, based on the compute parameters
+
+// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
+
+class llm_graph_result;
+
+struct llm_graph_params {
+    llm_arch arch = LLM_ARCH_UNKNOWN;
+
+    llama_hparams hparams;
+    llama_cparams cparams;
+
+    llama_ubatch ubatch; // note: intentionally make a copy
+
+    llm_graph_type gtype;
+
+    ggml_backend_sched_t sched;
+    ggml_backend_t backend_cpu;
+
+    const llama_adapter_cvec     * cvec;
+    const llama_adapter_loras    * loras;
+    const llama_memory_context_i * mctx;
+    const llama_cross            * cross;
+
+    std::map<llama_seq_id, llama_sampler *> samplers;
+
+    static bool samplers_equal(
+            const std::map<llama_seq_id, llama_sampler *> & lhs,
+            const std::map<llama_seq_id, llama_sampler *> & rhs) {
+        if (lhs.size() != rhs.size()) {
+            return false;
+        }
+        for (const auto & [seq_id, sampler] : lhs) {
+            auto it = rhs.find(seq_id);
+            if (it == rhs.end() || it->second != sampler) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    uint32_t n_outputs;
+
+    llm_graph_cb cb;
+
+    llm_graph_result * res;
+
+    // return true if the "other" params would result in a graph with the same topology as with the current params
+    // having the same topology allows us to reuse the graph in some cases
+    bool allow_reuse(const llm_graph_params & other) const {
+        // first check the ubatch
+        bool can_reuse_ubatch =
+            ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
+            ubatch.n_tokens     == other.ubatch.n_tokens     &&
+            ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
+            ubatch.n_seqs       == other.ubatch.n_seqs       &&
+            ubatch.n_seqs_unq   == other.ubatch.n_seqs_unq   &&
+            (
+                (!ubatch.token && !other.ubatch.token) ||
+                (!ubatch.embd  && !other.ubatch.embd)
+            );
+
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is that the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
+            if (!ubatch.data) {
+                // if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
+                // therefore we cannot perform the sequence id check - normally this should never happen
+                can_reuse_ubatch = false;
+            } else {
+                for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+                    can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
+                }
+            }
+        }
+
+        if (!can_reuse_ubatch) {
+            return false;
+        }
+
+        if (n_outputs != other.n_outputs) {
+            return false;
+        }
+
+        if (!samplers_equal(samplers, other.samplers)) {
+            return false;
+        }
+
+        if (samplers.size() > 0) {
+            if (!ubatch.data || !other.ubatch.data) {
+                return false;
+            }
+
+            // check that the outputs are the same for all samplers
+            for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+                if (ubatch.output[i]    != other.ubatch.output[i] ||
+                    ubatch.seq_id[i][0] != other.ubatch.seq_id[i][0]) {
+                    return false;
+                }
+            }
+        }
+
+        return
+            cparams.embeddings  == other.cparams.embeddings  &&
+            cparams.causal_attn == other.cparams.causal_attn &&
+            arch  == other.arch  &&
+            gtype == other.gtype &&
+            cvec  == other.cvec  &&
+            loras == other.loras &&
+            cross == other.cross;
+    }
+};
+
+class llm_graph_result {
+public:
+    llm_graph_result(int64_t max_nodes);
+
+    virtual ~llm_graph_result() = default;
+
+    ggml_tensor * get_inp_tokens()  const { return t_inp_tokens; }
+    ggml_tensor * get_logits()      const { return t_logits; }
+    ggml_tensor * get_embd()        const { return t_embd; }
+    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
+
+    ggml_cgraph  * get_gf()  const { return gf; }
+    ggml_context * get_ctx() const { return ctx_compute.get(); }
+
+    int64_t get_max_nodes() const;
+
+    void reset();
+
+    void set_inputs(const llama_ubatch * ubatch);
+    void set_outputs();
+
+    // try to update the existing graph result using the new graph parameters in order to reuse it
+    // this can only be done if we determine that the resulting graph using the new graph parameters
+    // would be identical to the existing graph. in that case, we simply have to update the memory
+    // contexts of the input tensors of the graph and we can reuse it for another computation
+    // return true if the graph was updated and can be reused
+    bool can_reuse(const llm_graph_params & params);
+
+    llm_graph_input_i * add_input(llm_graph_input_ptr input);
+
+    void set_params(const llm_graph_params & params);
+
+    // important graph nodes
+    ggml_tensor * t_inp_tokens  = nullptr;
+    ggml_tensor * t_inp_embd    = nullptr; // [n_embd_inp, n_tokens]
+    ggml_tensor * t_logits      = nullptr;
+    ggml_tensor * t_embd        = nullptr;
+    ggml_tensor * t_embd_pooled = nullptr;
+
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
+    std::map<llama_seq_id, ggml_tensor *> t_candidates;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
+
+    std::vector<llm_graph_input_ptr> inputs;
+
+    ggml_context_ptr ctx_compute;
+
+    // memory buffers used to evaluate the model
+    std::vector<uint8_t> buf_compute_meta;
+
+    ggml_cgraph * gf;
+
+    int64_t max_nodes;
+
+private:
+    // keep a copy of the previous graph parameters
+    // we will use this to determine whether the graph can be reused by comparing them with the new parameters
+    // note: these are updated after constructing the new graph
+    llm_graph_params params;
+
+    // env: LLAMA_GRAPH_RESULT_DEBUG
+    int debug = 0;
+};
+
+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
+
+//
+// llm_graph_context
+//
+
+// used in build_rs to properly order writes and avoid unnecessary copies
+using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
+
+struct llm_graph_context {
+    const llm_arch arch;
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+    const llama_ubatch  & ubatch;
+
+    const int64_t n_embd;
+    const int64_t n_layer;
+    const int64_t n_rot;
+    const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
+    const int64_t n_head;
+    const int64_t n_head_kv;
+    const int64_t n_embd_head_k;
+    const int64_t n_embd_k_gqa;
+    const int64_t n_embd_head_v;
+    const int64_t n_embd_v_gqa;
+    const int64_t n_expert;
+    const int64_t n_expert_used;
+
+    const float freq_base;
+    const float freq_scale;
+    const float ext_factor;
+    const float attn_factor;
+    const float beta_fast;
+    const float beta_slow;
+    const float norm_eps;
+    const float norm_rms_eps;
+
+    const int64_t n_tokens;
+    const int64_t n_outputs;
+    const int32_t n_ctx_orig; // yarn
+
+    const enum llama_pooling_type pooling_type;
+    const enum llama_rope_type    rope_type;
+
+    ggml_backend_sched_t sched;
+
+    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+
+    const llama_adapter_cvec     * cvec;
+    const llama_adapter_loras    * loras;
+    const llama_memory_context_i * mctx;
+    const llama_cross            * cross;
+
+    std::map<llama_seq_id, llama_sampler *> samplers;
+
+    const llm_graph_cb & cb_func;
+
+    llm_graph_result * res;
+
+    ggml_context * ctx0 = nullptr;
+    ggml_cgraph  * gf   = nullptr;
+
+    llm_graph_context(const llm_graph_params & params);
+    virtual ~llm_graph_context() = default;
+
+    void cb(ggml_tensor * cur, const char * name, int il) const;
+
+    //
+    // common
+    //
+
+    ggml_tensor * build_cvec(
+            ggml_tensor * cur,
+            int il) const;
+
+    // do mat_mul, while optionally applying lora
+    ggml_tensor * build_lora_mm(
+            ggml_tensor * w,
+            ggml_tensor * cur) const;
+
+    // do mat_mul_id, while optionally applying lora
+    ggml_tensor * build_lora_mm_id(
+            ggml_tensor * w,   // ggml_tensor * as
+            ggml_tensor * cur, // ggml_tensor * b
+            ggml_tensor * ids) const;
+
+    ggml_tensor * build_norm(
+            ggml_tensor * cur,
+            ggml_tensor * mw,
+            ggml_tensor * mb,
+            llm_norm_type type,
+            int il) const;
+
+    ggml_tensor * build_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * up_s,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * gate_s,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ggml_tensor * down_s,
+            ggml_tensor * act_scales,
+            llm_ffn_op_type type_op,
+            llm_ffn_gate_type type_gate,
+            int il) const;
+
+    // build MoE FFN without bias tensors
+    ggml_tensor * build_moe_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * gate_inp,
+            ggml_tensor * up_exps,
+            ggml_tensor * gate_exps,
+            ggml_tensor * down_exps,
+            ggml_tensor * exp_probs_b,
+            int64_t n_expert,
+            int64_t n_expert_used,
+            llm_ffn_op_type type_op,
+            bool norm_w,
+            bool scale_w,
+            float w_scale,
+            llama_expert_gating_func_type gating_op,
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
+
+    ggml_tensor * build_moe_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * gate_inp,
+            ggml_tensor * gate_inp_b,
+            ggml_tensor * up_exps,
+            ggml_tensor * up_exps_b,
+            ggml_tensor * gate_exps,
+            ggml_tensor * gate_exps_b,
+            ggml_tensor * down_exps,
+            ggml_tensor * down_exps_b,
+            ggml_tensor * exp_probs_b,
+            int64_t n_expert,
+            int64_t n_expert_used,
+            llm_ffn_op_type type_op,
+            bool norm_w,
+            bool scale_w,
+            float w_scale,
+            llama_expert_gating_func_type gating_op,
+            int il,
+            ggml_tensor * probs_in = nullptr) const;
+
+    //
+    // inputs
+    //
+
+    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
+    ggml_tensor * build_inp_pos() const;
+    ggml_tensor * build_inp_attn_scale() const;
+    ggml_tensor * build_inp_out_ids() const;
+    ggml_tensor * build_inp_mean() const;
+    ggml_tensor * build_inp_cls() const;
+
+    ggml_tensor * build_inp_cross_embd() const;
+    ggml_tensor * build_inp_pos_bucket_enc() const;
+    ggml_tensor * build_inp_pos_bucket_dec() const;
+    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
+
+    //
+    // attention
+    //
+
+    ggml_tensor * build_attn_mha(
+            ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
+            ggml_tensor * kq_b,
+            ggml_tensor * kq_mask,
+            ggml_tensor * sinks,   // [n_head_q]
+            ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            float kq_scale,
+            int il) const;
+
+    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_no_cache * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            float kq_scale,
+            int il) const;
+
+    llm_graph_input_attn_kv * build_attn_inp_kv() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_kv * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] // TODO: remove
+            float kq_scale,
+            int il) const;
+
+    llm_graph_input_attn_k * build_attn_inp_k() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_k * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            float kq_scale,
+            int il) const;
+
+    llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
+
+    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_kv_iswa * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            float kq_scale,
+            int il) const;
+
+    llm_graph_input_attn_cross * build_attn_inp_cross() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_cross * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            float kq_scale,
+            int il) const;
+
+    //
+    // recurrent
+    //
+
+    // TODO: move this implementation to llama_memory_recurrent.
+    //       this is analogous to llama_kv_cache::cpy_k / cpy_v
+    //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
+    //       implementation into 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
+    //       `llama_memory_recurrent`
+    ggml_tensor * build_rs(
+            ggml_tensor * s,
+            ggml_tensor * state_copy_main,
+            ggml_tensor * state_copy_extra,
+            int32_t state_size,
+            int32_t n_seqs,
+            uint32_t n_rs,
+            uint32_t rs_head,
+            uint32_t rs_size,
+            int32_t rs_zero,
+            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
+
+    llm_graph_input_rs * build_rs_inp() const;
+
+    ggml_tensor * build_rs(
+            llm_graph_input_rs * inp,
+            ggml_tensor * s,
+            int32_t state_size,
+            int32_t n_seqs,
+            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;
+
+    ggml_tensor * build_rwkv_token_shift_load(
+            llm_graph_input_rs * inp,
+            const llama_ubatch & ubatch,
+            int il) const;
+
+    ggml_tensor * build_rwkv_token_shift_store(
+            ggml_tensor * token_shift,
+            const llama_ubatch & ubatch,
+            int il) const;
+
+    //
+    // hybrid
+    //
+
+    llm_graph_input_mem_hybrid   * build_inp_mem_hybrid() const;
+    llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const;
+
+    llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const;
+
+    //
+    // pooling
+    //
+
+    void build_pooling(
+            ggml_tensor * cls,
+            ggml_tensor * cls_b,
+            ggml_tensor * cls_out,
+            ggml_tensor * cls_out_b) const;
+
+    //
+    // sampling (backend sampling)
+    //
+
+    void build_sampling() const;
+
+    //
+    // dense (out)
+    //
+
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
+};
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
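Editor's note: for readers new to this header, the following minimal sketch illustrates the intended call pattern of the llm_graph_context builder API above. It is not part of the commit; the my_model struct, its weight names, and the elided reshape/RoPE step are hypothetical, and only the build_* signatures come from this header.

    // hypothetical usage sketch (not in this commit) of llm_graph_context
    struct llm_build_example : public llm_graph_context {
        llm_build_example(const llm_graph_params & params, const my_model & model) : llm_graph_context(params) {
            // token/embedding input: [n_embd, n_tokens]
            ggml_tensor * cur = build_inp_embd(model.tok_embd);

            // KV-cache attention input (cell indices + KQ mask), shared by all layers
            auto * inp_attn = build_attn_inp_kv();

            for (int il = 0; il < n_layer; ++il) {
                ggml_tensor * x = build_norm(cur, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);

                // Q/K/V projections (LoRA is applied if adapters are loaded)
                ggml_tensor * q = build_lora_mm(model.layers[il].wq, x);
                ggml_tensor * k = build_lora_mm(model.layers[il].wk, x);
                ggml_tensor * v = build_lora_mm(model.layers[il].wv, x);
                // ... reshape q/k/v to [n_embd_head, n_head, n_tokens] and apply RoPE here ...

                // attention over the KV cache; kq_b/sinks/v_mla are unused in this simple case
                cur = build_attn(inp_attn, model.layers[il].wo, nullptr,
                        q, k, v, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);

                // SwiGLU FFN with a parallel gate and no bias/scale tensors
                cur = build_ffn(cur,
                        model.layers[il].ffn_up,   nullptr, nullptr,
                        model.layers[il].ffn_gate, nullptr, nullptr,
                        model.layers[il].ffn_down, nullptr, nullptr,
                        nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il);
            }
            // final norm, output head, build_pooling(), etc. would follow
        }
    };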
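The graph-reuse machinery also deserves a usage note. Judging from the comments above, llm_graph_params::allow_reuse() compares the topology-determining parameters, while llm_graph_result::can_reuse() additionally asks every stored llm_graph_input_i whether its tensors would come out identical. A hedged caller-side sketch (the function below is illustrative, not the actual llama_context code):

    // hypothetical caller: reuse the previous graph when possible, otherwise rebuild
    ggml_cgraph * update_graph(llm_graph_result * res, const llm_graph_params & params) {
        if (res->can_reuse(params)) {
            // same topology - keep the nodes; only the input data needs refreshing,
            // e.g. via a later res->set_inputs(&ubatch) call
            return res->get_gf();
        }

        res->reset();            // discard the old nodes and input objects
        // ... rebuild the graph via the model's llm_graph_context using `params` ...
        res->set_params(params); // remember the parameters for the next reuse check
        return res->get_gf();
    }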
