1#pragma once
  2
  3#include "../clip-graph.h"
  4
  5/*
  6 * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
  7 * We encourage human contributors to ensure the quality and reliability of the codebase.
  8 */
  9
 10struct clip_graph_siglip : clip_graph {
 11    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 12    ggml_cgraph * build() override;
 13};
 14
 15struct clip_graph_pixtral : clip_graph {
 16    clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 17    ggml_cgraph * build() override;
 18};
 19
 20struct clip_graph_qwen2vl : clip_graph {
 21    clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 22    ggml_cgraph * build() override;
 23};
 24
 25struct clip_graph_qwen3vl : clip_graph {
 26    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 27    ggml_cgraph * build() override;
 28};
 29
 30struct clip_graph_youtuvl : clip_graph {
 31    clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 32    ggml_cgraph * build() override;
 33};
 34
 35struct clip_graph_minicpmv : clip_graph {
 36    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 37    ggml_cgraph * build() override;
 38};
 39
 40struct clip_graph_internvl : clip_graph {
 41    clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 42    ggml_cgraph * build() override;
 43};
 44
 45struct clip_graph_llama4 : clip_graph {
 46    clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 47    ggml_cgraph * build() override;
 48};
 49
 50struct clip_graph_kimivl : clip_graph {
 51    clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 52    ggml_cgraph * build() override;
 53};
 54
 55struct clip_graph_cogvlm : clip_graph {
 56    clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 57    ggml_cgraph * build() override;
 58};
 59
 60struct clip_graph_llava : clip_graph {
 61    clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 62    ggml_cgraph * build() override;
 63};
 64
 65struct clip_graph_whisper_enc : clip_graph {
 66    clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 67    ggml_cgraph * build() override;
 68};
 69
 70struct clip_graph_conformer : clip_graph {
 71    clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 72    ggml_cgraph * build() override;
 73};
 74
 75struct clip_graph_glm4v : clip_graph {
 76    clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 77    ggml_cgraph * build() override;
 78};
 79
 80struct clip_graph_mobilenetv5 : clip_graph {
 81    clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
 82    ggml_cgraph * build() override;
 83
 84    ggml_tensor * rms_norm_2d(
 85        ggml_tensor * inp,
 86        ggml_tensor * weight,
 87        float eps = 1e-6f);
 88
 89    ggml_tensor* pad_same_2d(
 90        ggml_tensor* inp,
 91        int kernel_h,
 92        int kernel_w,
 93        int stride_h,
 94        int stride_w,
 95        int dilation_h = 1,
 96        int dilation_w = 1);
 97
 98    ggml_tensor * build_edge_residual(
 99        ggml_tensor * inp,
100        const mobilenetv5_block & block,
101        int stride);
102
103    ggml_tensor * build_inverted_residual(
104        ggml_tensor * inp,
105        const mobilenetv5_block & block,
106        int stride);
107
108    ggml_tensor * build_mobilenet_attn(
109        ggml_tensor * inp,
110        const mobilenetv5_block & block);
111};
112
113struct clip_graph_kimik25 : clip_graph {
114    clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
115    ggml_cgraph * build() override;
116
117    ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode);
118};