summaryrefslogtreecommitdiff
path: root/llama.cpp/tools/mtmd/clip-graph.h
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
commitb333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tools/mtmd/clip-graph.h
downloadllmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/tools/mtmd/clip-graph.h')
-rw-r--r--llama.cpp/tools/mtmd/clip-graph.h117
1 files changed, 117 insertions, 0 deletions
diff --git a/llama.cpp/tools/mtmd/clip-graph.h b/llama.cpp/tools/mtmd/clip-graph.h
new file mode 100644
index 0000000..4c7f750
--- /dev/null
+++ b/llama.cpp/tools/mtmd/clip-graph.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpp.h"
+#include "clip.h"
+#include "clip-impl.h"
+#include "clip-model.h"
+
+#include <vector>
+#include <functional>
+
+#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
+
+// Base helper for constructing a ggml compute graph (cgraph) for a
+// CLIP-style vision (or audio) encoder. Holds references to the loaded
+// model/hyperparameters and the single input image, plus the ggml
+// context the graph's tensors are allocated in. Concrete model
+// architectures subclass this and implement build().
+struct clip_graph {
+    const clip_model & model;       // loaded weights (not owned)
+    const clip_hparams & hparams;   // model hyperparameters (not owned)
+    projector_type proj_type;       // which multimodal projector variant to build
+
+    // we only support single image per batch
+    const clip_image_f32 & img;
+
+    // geometry of the patchified input and transformer dimensions,
+    // derived from hparams/img at construction time
+    const int patch_size;
+    const int n_patches_x;
+    const int n_patches_y;
+    const int n_patches;
+    const int n_embd;
+    const int n_head;
+    const int d_head;
+    const int n_layer;
+    const int n_mmproj_embd;        // output embedding size of the mm projector
+    const float eps;                // normalization epsilon
+    const float kq_scale;           // attention score scale (typically 1/sqrt(d_head))
+    const clip_flash_attn_type flash_attn_type;
+
+    // ctx0_ptr owns the ggml context; ctx0 is the raw pointer handed to
+    // the ggml_* graph-building calls. gf is the graph under construction.
+    ggml_context_ptr ctx0_ptr;
+    ggml_context * ctx0;
+    ggml_cgraph * gf;
+
+    clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
+
+    // virtual dtor: instances are deleted through the base class
+    virtual ~clip_graph() = default;
+    // build and return the full cgraph for this model architecture
+    virtual ggml_cgraph * build() = 0;
+
+    //
+    // utility functions
+    //
+    // debug/naming callback for an intermediate tensor
+    // (il presumably identifies the layer the tensor belongs to — see impl)
+    void cb(ggml_tensor * cur0, const char * name, int il) const;
+
+    // siglip2 naflex
+    ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE);
+
+    // build vision transformer (ViT) cgraph
+    // this function should cover most of the models
+    // if your model has specific features, you should probably duplicate this function
+    // add_pos: hook that applies (rotary/learned) position encoding per layer
+    ggml_tensor * build_vit(
+            ggml_tensor * inp,
+            int64_t n_pos,
+            norm_type norm_t,
+            ffn_op_type ffn_t,
+            ggml_tensor * learned_pos_embd,
+            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
+
+    // build the input after conv2d (inp_raw --> patches)
+    // returns tensor with shape [n_embd, n_patches]
+    ggml_tensor * build_inp();
+
+    // raw input tensor before patchification; channels defaults to RGB
+    ggml_tensor * build_inp_raw(int channels = 3);
+
+    // apply a normalization layer (weight mw, optional bias mb) of the
+    // given type/epsilon to cur; il is the layer index for naming
+    ggml_tensor * build_norm(
+            ggml_tensor * cur,
+            ggml_tensor * mw,
+            ggml_tensor * mb,
+            norm_type type,
+            float norm_eps,
+            int il) const;
+
+    // feed-forward block: up/gate/down projections (weights + optional
+    // biases) with the activation selected by type_op; gate may be null
+    // for non-gated FFNs — confirm against the implementation
+    ggml_tensor * build_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ffn_op_type type_op,
+            int il) const;
+
+    // attention block over precomputed q/k/v, followed by the output
+    // projection wo (+ optional bias wo_b); kq_mask may mask attention
+    ggml_tensor * build_attn(
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur,
+            ggml_tensor * k_cur,
+            ggml_tensor * v_cur,
+            ggml_tensor * kq_mask,
+            float kq_scale,
+            int il) const;
+
+    // implementation of the 2D RoPE without adding a new op in ggml
+    // this is not efficient (use double the memory), but works on all backends
+    // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+    // NOTE(review): the ctx0 parameter shadows the member of the same name
+    ggml_tensor * build_rope_2d(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        ggml_tensor * pos_a, // first half
+        ggml_tensor * pos_b, // second half
+        const float freq_base,
+        const bool interleave_freq
+    );
+
+    // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
+    // support dynamic resolution
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
+
+    // Generic function to stack frames for audio processing
+    // Abstracts out the StackAudioFrames logic used by ultravox
+    // (n_embed here is spelled differently from the n_embd member)
+    ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
+};