1#pragma once
2
3#include "../clip-graph.h"
4
5/*
6 * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
7 * We encourage human contributors to ensure the quality and reliability of the codebase.
8 */
9
10struct clip_graph_siglip : clip_graph {
11 clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
12 ggml_cgraph * build() override;
13};
14
15struct clip_graph_pixtral : clip_graph {
16 clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
17 ggml_cgraph * build() override;
18};
19
20struct clip_graph_qwen2vl : clip_graph {
21 clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
22 ggml_cgraph * build() override;
23};
24
25struct clip_graph_qwen3vl : clip_graph {
26 clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
27 ggml_cgraph * build() override;
28};
29
30struct clip_graph_youtuvl : clip_graph {
31 clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
32 ggml_cgraph * build() override;
33};
34
35struct clip_graph_minicpmv : clip_graph {
36 clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
37 ggml_cgraph * build() override;
38};
39
40struct clip_graph_internvl : clip_graph {
41 clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
42 ggml_cgraph * build() override;
43};
44
45struct clip_graph_llama4 : clip_graph {
46 clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
47 ggml_cgraph * build() override;
48};
49
50struct clip_graph_kimivl : clip_graph {
51 clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
52 ggml_cgraph * build() override;
53};
54
55struct clip_graph_cogvlm : clip_graph {
56 clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
57 ggml_cgraph * build() override;
58};
59
60struct clip_graph_llava : clip_graph {
61 clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
62 ggml_cgraph * build() override;
63};
64
65struct clip_graph_whisper_enc : clip_graph {
66 clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
67 ggml_cgraph * build() override;
68};
69
70struct clip_graph_conformer : clip_graph {
71 clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
72 ggml_cgraph * build() override;
73};
74
75struct clip_graph_glm4v : clip_graph {
76 clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
77 ggml_cgraph * build() override;
78};
79
80struct clip_graph_mobilenetv5 : clip_graph {
81 clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
82 ggml_cgraph * build() override;
83
84 ggml_tensor * rms_norm_2d(
85 ggml_tensor * inp,
86 ggml_tensor * weight,
87 float eps = 1e-6f);
88
89 ggml_tensor* pad_same_2d(
90 ggml_tensor* inp,
91 int kernel_h,
92 int kernel_w,
93 int stride_h,
94 int stride_w,
95 int dilation_h = 1,
96 int dilation_w = 1);
97
98 ggml_tensor * build_edge_residual(
99 ggml_tensor * inp,
100 const mobilenetv5_block & block,
101 int stride);
102
103 ggml_tensor * build_inverted_residual(
104 ggml_tensor * inp,
105 const mobilenetv5_block & block,
106 int stride);
107
108 ggml_tensor * build_mobilenet_attn(
109 ggml_tensor * inp,
110 const mobilenetv5_block & block);
111};
112
113struct clip_graph_kimik25 : clip_graph {
114 clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
115 ggml_cgraph * build() override;
116
117 ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
118};