1#include "clip.h"
2#include "clip-impl.h"
3#include "clip-model.h"
4#include "clip-graph.h"
5#include "models/models.h"
6
7#include "ggml.h"
8#include "ggml-cpp.h"
9#include "ggml-alloc.h"
10#include "ggml-backend.h"
11#include "gguf.h"
12
13#include <algorithm>
14#include <cassert>
15#include <cmath>
16#include <cstdlib>
17#include <cstring>
18#include <fstream>
19#include <map>
20#include <stdexcept>
21#include <unordered_set>
22#include <vector>
23#include <cinttypes>
24#include <limits>
25#include <array>
26#include <functional>
27
28struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
29
30//#define CLIP_DEBUG_FUNCTIONS
31
32#ifdef CLIP_DEBUG_FUNCTIONS
33static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
34 std::ofstream file(filename, std::ios::binary);
35 if (!file.is_open()) {
36 LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
37 return;
38 }
39
40 // PPM header: P6 format, width, height, and max color value
41 file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
42
43 // Write pixel data
44 for (size_t i = 0; i < img.buf.size(); i += 3) {
45 // PPM expects binary data in RGB format, which matches our image buffer
46 file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
47 }
48
49 file.close();
50}
51
52static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
53 std::ofstream file(filename, std::ios::binary);
54 if (!file.is_open()) {
55 LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
56 return;
57 }
58
    int bytesPerPixel = 3;
    int widthInBytes = img.nx * bytesPerPixel;
    int paddingAmount = (4 - (widthInBytes % 4)) % 4;
    int stride = widthInBytes + paddingAmount;
    int fileSize = 54 + stride * img.ny; // file header + info header + padded pixel data
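    // each BMP row is padded up to a multiple of 4 bytes,
    // e.g. nx = 3 -> widthInBytes = 9 -> paddingAmount = 3 -> stride = 12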
64
65 // Bitmap file header
66 unsigned char fileHeader[14] = {
67 'B','M', // Signature
68 0,0,0,0, // Image file size in bytes
69 0,0,0,0, // Reserved
70 54,0,0,0 // Start of pixel array
71 };
72
    // encode the total file size into the header (little-endian)
75 fileHeader[2] = (unsigned char)(fileSize);
76 fileHeader[3] = (unsigned char)(fileSize >> 8);
77 fileHeader[4] = (unsigned char)(fileSize >> 16);
78 fileHeader[5] = (unsigned char)(fileSize >> 24);
79
80 // Bitmap information header (BITMAPINFOHEADER)
81 unsigned char infoHeader[40] = {
82 40,0,0,0, // Size of this header (40 bytes)
83 0,0,0,0, // Image width
84 0,0,0,0, // Image height
85 1,0, // Number of color planes
86 24,0, // Bits per pixel
87 0,0,0,0, // No compression
88 0,0,0,0, // Image size (can be 0 for no compression)
89 0,0,0,0, // X pixels per meter (not specified)
90 0,0,0,0, // Y pixels per meter (not specified)
91 0,0,0,0, // Total colors (color table not used)
92 0,0,0,0 // Important colors (all are important)
93 };
94
95 // Width and height in the information header
96 infoHeader[4] = (unsigned char)(img.nx);
97 infoHeader[5] = (unsigned char)(img.nx >> 8);
98 infoHeader[6] = (unsigned char)(img.nx >> 16);
99 infoHeader[7] = (unsigned char)(img.nx >> 24);
100 infoHeader[8] = (unsigned char)(img.ny);
101 infoHeader[9] = (unsigned char)(img.ny >> 8);
102 infoHeader[10] = (unsigned char)(img.ny >> 16);
103 infoHeader[11] = (unsigned char)(img.ny >> 24);
104
105 // Write file headers
106 file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
107 file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
108
109 // Pixel data
110 std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
111 for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
112 for (int x = 0; x < img.nx; ++x) {
113 // Each pixel
114 size_t pixelIndex = (y * img.nx + x) * 3;
115 unsigned char pixel[3] = {
116 img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
117 img.buf[pixelIndex + 1],
118 img.buf[pixelIndex]
119 };
120 file.write(reinterpret_cast<char*>(pixel), 3);
121 }
122 // Write padding for the row
123 file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
124 }
125
126 file.close();
127}
128
129// debug function to convert f32 to u8
130static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
131 dst.nx = src.nx;
132 dst.ny = src.ny;
133 dst.buf.resize(3 * src.nx * src.ny);
134 for (size_t i = 0; i < src.buf.size(); ++i) {
135 dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
136 }
137}
138#endif
141struct clip_ctx {
142 clip_model model;
143
144 gguf_context_ptr ctx_gguf;
145 ggml_context_ptr ctx_data;
146
147 std::vector<uint8_t> buf_compute_meta;
148
149 std::vector<ggml_backend_t> backend_ptrs;
150 std::vector<ggml_backend_buffer_type_t> backend_buft;
151
152 ggml_backend_t backend = nullptr;
153 ggml_backend_t backend_cpu = nullptr;
154 ggml_backend_buffer_ptr buf;
157 int max_nodes = 8192;
158 ggml_backend_sched_ptr sched;
159 clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
160 bool is_allocated = false;
161
162 clip_ctx(clip_context_params & ctx_params) {
163 flash_attn_type = ctx_params.flash_attn_type;
164 backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
165 if (!backend_cpu) {
166 throw std::runtime_error("failed to initialize CPU backend");
167 }
168 if (ctx_params.use_gpu) {
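            // the target device can be overridden via the MTMD_BACKEND_DEVICE env var;
            // device names follow the ggml backend registry (e.g. "CUDA0", "Vulkan0" -- exact names vary by build)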
169 auto backend_name = std::getenv("MTMD_BACKEND_DEVICE");
170 if (backend_name != nullptr) {
171 backend = ggml_backend_init_by_name(backend_name, nullptr);
172 if (!backend) {
173 LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name);
174 }
175 }
176 if (!backend) {
177 backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
178 backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
179 }
180 }
181
182 if (backend) {
183 LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
184 backend_ptrs.push_back(backend);
185 backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
186 } else {
187 backend = backend_cpu;
188 LOG_INF("%s: CLIP using CPU backend\n", __func__);
189 }
190
191 if (ctx_params.image_min_tokens > 0) {
192 model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens;
193 }
194 if (ctx_params.image_max_tokens > 0) {
195 model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
196 }
197
198 backend_ptrs.push_back(backend_cpu);
199 backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
200
201 sched.reset(
            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, true)
203 );
204
205 if (ctx_params.cb_eval != nullptr) {
206 ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
207 }
208 }
209
210 ~clip_ctx() {
211 ggml_backend_free(backend);
212 if (backend != backend_cpu) {
213 ggml_backend_free(backend_cpu);
214 }
215 }
216
    // convenience accessor, added so that we don't have to change too much of the existing code
218 projector_type proj_type() const {
219 return model.proj_type;
220 }
221};
222
223//
224// clip_graph
225//
226
227clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
228 model(ctx->model),
229 hparams(model.hparams),
230 proj_type(ctx->proj_type()),
231 img(img),
232 patch_size(hparams.patch_size),
233 n_patches_x(img.nx / patch_size),
234 n_patches_y(img.ny / patch_size),
235 n_patches(n_patches_x * n_patches_y),
236 n_embd(hparams.n_embd),
237 n_head(hparams.n_head),
238 d_head(n_embd / n_head),
239 n_layer(hparams.n_layer),
240 n_mmproj_embd(clip_n_mmproj_embd(ctx)),
241 eps(hparams.eps),
242 kq_scale(1.0f / sqrtf((float)d_head)),
243 flash_attn_type(ctx->flash_attn_type) {
244 struct ggml_init_params params = {
245 /*.mem_size =*/ ctx->buf_compute_meta.size(),
246 /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
247 /*.no_alloc =*/ true,
248 };
249 ctx0_ptr.reset(ggml_init(params));
250 ctx0 = ctx0_ptr.get();
251 gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
252}
253
254void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
255 if (il >= 0) {
256 ggml_format_name(cur, "%s-%d", name, il);
257 } else {
258 ggml_set_name(cur, name);
259 }
260}
261
262// siglip2 naflex
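// interpolate the learned (n_per_side x n_per_side) position-embedding grid to the
// image's actual (width x height) patch grid; the mode is passed through to ggml_interpolate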
263ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) {
    ggml_tensor * pos_embd = model.position_embeddings;
    GGML_ASSERT(pos_embd);

    const int height = img.ny / patch_size;
    const int width  = img.nx / patch_size;
    const uint32_t mode = interpolation_mode;
    const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
271
272 if (height == n_per_side && width == n_per_side) {
273 return pos_embd;
274 }
275
276 pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side)
277 pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd)
278 pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd)
279 pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height)
280 pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height)
281
282 return pos_embd;
283}
284
285// build vision transformer (ViT) cgraph
286// this function should cover most of the models
287// if your model has specific features, you should probably duplicate this function
288ggml_tensor * clip_graph::build_vit(
289 ggml_tensor * inp,
290 int64_t n_pos,
291 norm_type norm_t,
292 ffn_op_type ffn_t,
293 ggml_tensor * learned_pos_embd,
294 std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
295 ) {
296 if (learned_pos_embd) {
297 inp = ggml_add(ctx0, inp, learned_pos_embd);
298 cb(inp, "pos_embed", -1);
299 }
300
301 ggml_tensor * inpL = inp;
302
303 // pre-layernorm
304 if (model.pre_ln_w) {
305 inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
306 cb(inpL, "pre_ln", -1);
307 }
308
309 // loop over layers
310 for (int il = 0; il < n_layer; il++) {
311 auto & layer = model.layers[il];
312 ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
313
314 // layernorm1
315 cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
316 cb(cur, "layer_inp_normed", il);
317
318 // self-attention
319 {
320 ggml_tensor * Qcur = nullptr;
321 ggml_tensor * Kcur = nullptr;
322 ggml_tensor * Vcur = nullptr;
323 if (layer.qkv_w != nullptr) {
324 // fused qkv
325 cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
326 if (layer.qkv_b != nullptr) {
327 cur = ggml_add(ctx0, cur, layer.qkv_b);
328 }
329
330 Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
331 /* nb1 */ ggml_row_size(cur->type, d_head),
332 /* nb2 */ cur->nb[1],
333 /* offset */ 0);
334
335 Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
336 /* nb1 */ ggml_row_size(cur->type, d_head),
337 /* nb2 */ cur->nb[1],
338 /* offset */ ggml_row_size(cur->type, n_embd));
339
340 Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
341 /* nb1 */ ggml_row_size(cur->type, d_head),
342 /* nb2 */ cur->nb[1],
343 /* offset */ ggml_row_size(cur->type, 2 * n_embd));
344
345 // TODO: q/k norm requires row size == n_embd, while here it's d_head
346 // we can add support in the future if needed
347 GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
348
349 } else {
350 // separate q, k, v
351 Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
352 if (layer.q_b) {
353 Qcur = ggml_add(ctx0, Qcur, layer.q_b);
354 }
355
356 Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
357 if (layer.k_b) {
358 Kcur = ggml_add(ctx0, Kcur, layer.k_b);
359 }
360
361 Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
362 if (layer.v_b) {
363 Vcur = ggml_add(ctx0, Vcur, layer.v_b);
364 }
365
366 if (layer.q_norm) {
367 Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
368 cb(Qcur, "Qcur_norm", il);
369 }
370
371 if (layer.k_norm) {
372 Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
373 cb(Kcur, "Kcur_norm", il);
374 }
375
376 Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
377 Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
378 Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
379 }
380
381 cb(Qcur, "Qcur", il);
382 cb(Kcur, "Kcur", il);
383 cb(Vcur, "Vcur", il);
384
385 if (add_pos) {
386 Qcur = add_pos(Qcur, layer);
387 Kcur = add_pos(Kcur, layer);
388 cb(Qcur, "Qcur_pos", il);
389 cb(Kcur, "Kcur_pos", il);
390 }
391
392 cur = build_attn(layer.o_w, layer.o_b,
393 Qcur, Kcur, Vcur, nullptr, kq_scale, il);
394 cb(cur, "attn_out", il);
395 }
396
397 if (layer.ls_1_w) {
398 cur = ggml_mul(ctx0, cur, layer.ls_1_w);
399 cb(cur, "attn_out_scaled", il);
400 }
401
        // re-add the layer input, i.e. the residual connection
403 cur = ggml_add(ctx0, cur, inpL);
404
405 inpL = cur; // inpL = residual, cur = hidden_states
406
407 cb(cur, "ffn_inp", il);
408
409 // layernorm2
410 cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
411 cb(cur, "ffn_inp_normed", il);
412
413 // ffn
414 cur = build_ffn(cur,
415 layer.ff_up_w, layer.ff_up_b,
416 layer.ff_gate_w, layer.ff_gate_b,
417 layer.ff_down_w, layer.ff_down_b,
418 ffn_t, il);
419
420 cb(cur, "ffn_out", il);
421
422 if (layer.ls_2_w) {
423 cur = ggml_mul(ctx0, cur, layer.ls_2_w);
424 cb(cur, "ffn_out_scaled", il);
425 }
426
427 // residual 2
428 cur = ggml_add(ctx0, inpL, cur);
429 cb(cur, "layer_out", il);
430
431 inpL = cur;
432 }
433
434 if (model.audio_has_avgpool()) {
435 ggml_tensor * cur = inpL;
436 cur = ggml_transpose(ctx0, cur);
437 cur = ggml_cont(ctx0, cur);
438 cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0);
439 cur = ggml_transpose(ctx0, cur);
440 cur = ggml_cont(ctx0, cur);
441 inpL = cur;
442 }
443
444 // post-layernorm
445 if (model.post_ln_w) {
446 inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1);
447 }
448 return inpL;
449}
450
451// build the input after conv2d (inp_raw --> patches)
452// returns tensor with shape [n_embd, n_patches]
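// note: the conv2d with stride == kernel == patch_size is equivalent to a linear
// projection of each (patch_size x patch_size x channels) patch into an n_embd vector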
453ggml_tensor * clip_graph::build_inp() {
454 ggml_tensor * inp_raw = build_inp_raw();
455 ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
456 inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd);
457 inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
458 if (model.patch_bias) {
459 inp = ggml_add(ctx0, inp, model.patch_bias);
460 cb(inp, "patch_bias", -1);
461 }
462 return inp;
463}
464
465ggml_tensor * clip_graph::build_inp_raw(int channels) {
466 ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
467 ggml_set_name(inp_raw, "inp_raw");
468 ggml_set_input(inp_raw);
469 return inp_raw;
470}
471
472ggml_tensor * clip_graph::build_norm(
473 ggml_tensor * cur,
474 ggml_tensor * mw,
475 ggml_tensor * mb,
476 norm_type type,
477 float norm_eps,
478 int il) const {
479
480 cur = type == NORM_TYPE_RMS
481 ? ggml_rms_norm(ctx0, cur, norm_eps)
482 : ggml_norm(ctx0, cur, norm_eps);
483
484 if (mw) {
485 cur = ggml_mul(ctx0, cur, mw);
486 cb(cur, "norm_w", il);
487 }
488
489 if (mb) {
490 cur = ggml_add(ctx0, cur, mb);
491 cb(cur, "norm_b", il);
492 }
493
494 return cur;
495}
496
497ggml_tensor * clip_graph::build_ffn(
498 ggml_tensor * cur,
499 ggml_tensor * up,
500 ggml_tensor * up_b,
501 ggml_tensor * gate,
502 ggml_tensor * gate_b,
503 ggml_tensor * down,
504 ggml_tensor * down_b,
505 ffn_op_type type_op,
506 int il) const {
507
508 ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
509 cb(tmp, "ffn_up", il);
510
511 if (up_b) {
512 tmp = ggml_add(ctx0, tmp, up_b);
513 cb(tmp, "ffn_up_b", il);
514 }
515
516 if (gate) {
517 cur = ggml_mul_mat(ctx0, gate, cur);
518 cb(cur, "ffn_gate", il);
519
520 if (gate_b) {
521 cur = ggml_add(ctx0, cur, gate_b);
522 cb(cur, "ffn_gate_b", il);
523 }
524 } else {
525 cur = tmp;
526 }
527
528 // we only support parallel ffn for now
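    // i.e. gate and up are both computed from the same input and then combined,
    // e.g. SwiGLU: silu(gate(x)) * up(x), rather than chaining gate after up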
529 switch (type_op) {
530 case FFN_SILU:
531 if (gate) {
532 cur = ggml_swiglu_split(ctx0, cur, tmp);
533 cb(cur, "ffn_swiglu", il);
534 } else {
535 cur = ggml_silu(ctx0, cur);
536 cb(cur, "ffn_silu", il);
537 } break;
538 case FFN_GELU:
539 if (gate) {
540 cur = ggml_geglu_split(ctx0, cur, tmp);
541 cb(cur, "ffn_geglu", il);
542 } else {
543 cur = ggml_gelu(ctx0, cur);
544 cb(cur, "ffn_gelu", il);
545 } break;
546 case FFN_GELU_ERF:
547 if (gate) {
548 cur = ggml_geglu_erf_split(ctx0, cur, tmp);
549 cb(cur, "ffn_geglu_erf", il);
550 } else {
551 cur = ggml_gelu_erf(ctx0, cur);
552 cb(cur, "ffn_gelu_erf", il);
553 } break;
554 case FFN_GELU_QUICK:
555 if (gate) {
556 cur = ggml_geglu_quick_split(ctx0, cur, tmp);
557 cb(cur, "ffn_geglu_quick", il);
558 } else {
559 cur = ggml_gelu_quick(ctx0, cur);
560 cb(cur, "ffn_gelu_quick", il);
561 } break;
562 }
563
564 if (down) {
565 cur = ggml_mul_mat(ctx0, down, cur);
566 }
567
    if (down_b) {
        cb(cur, "ffn_down", il);
        cur = ggml_add(ctx0, cur, down_b);
    }
575
576 return cur;
577}
578
579ggml_tensor * clip_graph::build_attn(
580 ggml_tensor * wo,
581 ggml_tensor * wo_b,
582 ggml_tensor * q_cur,
583 ggml_tensor * k_cur,
584 ggml_tensor * v_cur,
585 ggml_tensor * kq_mask,
586 float kq_scale,
587 int il) const {
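    // q_cur/k_cur/v_cur are expected as [d_head, n_head, n_pos] (see build_vit);
    // the result is [n_embd, n_pos] after the optional output projection wo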
588 // these nodes are added to the graph together so that they are not reordered
589 // by doing so, the number of splits in the graph is reduced
590 ggml_build_forward_expand(gf, q_cur);
591 ggml_build_forward_expand(gf, k_cur);
592 ggml_build_forward_expand(gf, v_cur);
593
594 ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
595 //cb(q, "q", il);
596
597 ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
598 //cb(k, "k", il);
599
600 ggml_tensor * cur;
601
602 if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
603 ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
604
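        // most flash-attn backends expect K/V in f16, hence the explicit cast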
605 k = ggml_cast(ctx0, k, GGML_TYPE_F16);
606 v = ggml_cast(ctx0, v, GGML_TYPE_F16);
607
608 cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f);
609 ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
610
611 cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
612
613 } else {
614 ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
615 v = ggml_cont(ctx0, v);
616
617 const auto n_tokens = q->ne[1];
618 const auto n_head = q->ne[2];
619
620 ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
        // F32 precision may not be needed for vision encoders?
622 // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
623
624 kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);
625
626 ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
627 cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
628 cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
629 }
630
631 cb(cur, "kqv_out", il);
632
633 if (wo) {
634 cur = ggml_mul_mat(ctx0, wo, cur);
635 }
636
637 if (wo_b) {
638 cur = ggml_add(ctx0, cur, wo_b);
639 }
640
641 return cur;
642}
643
644// implementation of the 2D RoPE without adding a new op in ggml
// this is not efficient (it uses double the memory), but it works on all backends
// TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but the in-place rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
647ggml_tensor * clip_graph::build_rope_2d(
648 ggml_context * ctx0,
649 ggml_tensor * cur,
650 ggml_tensor * pos_a, // first half
651 ggml_tensor * pos_b, // second half
652 const float freq_base,
653 const bool interleave_freq
654) {
655 const int64_t n_dim = cur->ne[0];
656 const int64_t n_head = cur->ne[1];
657 const int64_t n_pos = cur->ne[2];
658
659 // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
660 // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
661 // first half of cur will use 1e-0, 1e-2 (even)
662 // second half of cur will use 1e-1, 1e-3 (odd)
663 // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even
664 // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
665 // then for the second half, we use freq_scale to shift the inv_freq
666 // ^ why? replace (2i) with (2i+1) in the above equation
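// worked example with freq_base = 10000 and n_dim = 8:
//   inv_freq_i = 10000^(-2i/8) = 10^(-i) for i = 0..3, i.e. 1e-0, 1e-1, 1e-2, 1e-3
//   the first half rotates with the even subset {1e-0, 1e-2}; multiplying by
//   freq_scale_odd = 10000^(-2/8) = 0.1 shifts it to the odd subset {1e-1, 1e-3}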
667 const float freq_scale_odd = interleave_freq
668 ? std::pow(freq_base, (float)-2/n_dim)
669 : 1.0;
670
671 // first half
672 ggml_tensor * first;
673 {
674 first = ggml_view_3d(ctx0, cur,
675 n_dim/2, n_head, n_pos,
676 cur->nb[1],
677 cur->nb[2],
678 0);
679 first = ggml_rope_ext(
680 ctx0,
681 first,
682 pos_a, // positions
683 nullptr, // freq factors
684 n_dim/2, // n_dims
685 0, 0, freq_base,
686 1.0f, 0.0f, 1.0f, 0.0f, 0.0f
687 );
688 }
689
690 // second half
691 ggml_tensor * second;
692 {
693 second = ggml_view_3d(ctx0, cur,
694 n_dim/2, n_head, n_pos,
695 cur->nb[1],
696 cur->nb[2],
697 n_dim/2 * ggml_element_size(cur));
698 second = ggml_rope_ext(
699 ctx0,
700 second,
701 pos_b, // positions
702 nullptr, // freq factors
703 n_dim/2, // n_dims
704 0, 0, freq_base,
705 freq_scale_odd,
706 0.0f, 1.0f, 0.0f, 0.0f
707 );
708 }
709
710 cur = ggml_concat(ctx0, first, second, 0);
711 return cur;
712}
713
714// Generic function to stack frames for audio processing
715// Abstracts out the StackAudioFrames logic used by ultravox
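// e.g. (hypothetical numbers) with n_embed = 1280 and stack_factor = 8:
// stride = 10240; a sequence of 1500 frames (1,920,000 elements) is zero-padded
// to 1,925,120 elements and viewed as [10240, 188], i.e. 188 stacked frames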
716ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
717 if (stack_factor <= 1) {
718 return cur;
719 }
720
721 int64_t total_elements = ggml_nelements(cur);
722 int64_t stride = n_embed * stack_factor;
723
724 // Calculate padded length
725 int64_t padded_len = GGML_PAD(total_elements, stride);
726 int64_t pad = padded_len - total_elements;
727
728 if (pad > 0) {
729 // Pad the tensor to make it divisible by stride
730 cur = ggml_view_1d(ctx0, cur, total_elements, 0);
731 cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
732 }
733
734 // Reshape to [stride, padded_len / stride]
735 cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
736 ggml_row_size(cur->type, stride), 0);
737 return cur;
738}
739
740// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
741// support dynamic resolution
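// e.g. with scale_factor = 2, every 2x2 block of patches is merged into a single
// token: a [n_embd, W*H] input becomes [n_embd*4, (W/2)*(H/2)] (after padding W, H to even)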
742ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
743 GGML_ASSERT(scale_factor > 1);
744
745 const int n_embd = cur->ne[0];
746 int width = img.nx / patch_size;
747 int height = img.ny / patch_size;
748
749 // pad width and height to factor
750 const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
751 const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
752 cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
753 if (pad_width || pad_height) {
754 cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
755 width += pad_width;
756 height += pad_height;
757 }
758
759 // unshuffle h
760 cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
761 cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
762
763 // unshuffle w
764 cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
765 cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
766
767 cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
768 cb(cur, "pixel_shuffle", -1);
769
770 return cur;
771}
772
773static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
774 GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");
775
776 const clip_image_f32 & img = *imgs.entries[0];
777 std::unique_ptr<clip_graph> builder;
778
779 switch (ctx->proj_type()) {
780 case PROJECTOR_TYPE_GEMMA3:
781 case PROJECTOR_TYPE_IDEFICS3:
782 case PROJECTOR_TYPE_LFM2:
783 case PROJECTOR_TYPE_JANUS_PRO:
784 {
785 builder = std::make_unique<clip_graph_siglip>(ctx, img);
786 } break;
787 case PROJECTOR_TYPE_GEMMA3NV:
788 {
789 builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
790 } break;
791 case PROJECTOR_TYPE_PIXTRAL:
792 case PROJECTOR_TYPE_LIGHTONOCR:
793 {
794 builder = std::make_unique<clip_graph_pixtral>(ctx, img);
795 } break;
796 case PROJECTOR_TYPE_QWEN2VL:
797 case PROJECTOR_TYPE_QWEN25VL:
798 {
799 builder = std::make_unique<clip_graph_qwen2vl>(ctx, img);
800 } break;
801 case PROJECTOR_TYPE_QWEN3VL:
802 {
803 builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
804 } break;
805 case PROJECTOR_TYPE_MINICPMV:
806 {
807 builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
808 } break;
809 case PROJECTOR_TYPE_INTERNVL:
810 {
811 builder = std::make_unique<clip_graph_internvl>(ctx, img);
812 } break;
813 case PROJECTOR_TYPE_LLAMA4:
814 {
815 builder = std::make_unique<clip_graph_llama4>(ctx, img);
816 } break;
817 case PROJECTOR_TYPE_ULTRAVOX:
818 case PROJECTOR_TYPE_VOXTRAL:
819 case PROJECTOR_TYPE_QWEN2A:
820 case PROJECTOR_TYPE_GLMA:
821 case PROJECTOR_TYPE_MUSIC_FLAMINGO:
822 {
823 builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
824 } break;
825 case PROJECTOR_TYPE_KIMIVL:
826 {
827 builder = std::make_unique<clip_graph_kimivl>(ctx, img);
828 } break;
829 case PROJECTOR_TYPE_KIMIK25:
830 {
831 builder = std::make_unique<clip_graph_kimik25>(ctx, img);
832 } break;
833 case PROJECTOR_TYPE_COGVLM:
834 {
835 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
836 } break;
837 case PROJECTOR_TYPE_MLP:
838 case PROJECTOR_TYPE_MLP_NORM:
839 case PROJECTOR_TYPE_LDP:
840 case PROJECTOR_TYPE_LDPV2:
841 case PROJECTOR_TYPE_GLM_EDGE:
842 {
843 builder = std::make_unique<clip_graph_llava>(ctx, img);
844 } break;
845 case PROJECTOR_TYPE_LFM2A:
846 {
847 builder = std::make_unique<clip_graph_conformer>(ctx, img);
848 } break;
849 case PROJECTOR_TYPE_GLM4V:
850 {
851 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
852 } break;
853 case PROJECTOR_TYPE_YOUTUVL:
854 {
855 builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
856 } break;
857 default:
858 GGML_ABORT("missing cgraph builder");
859 }
860
861 return builder->build();
862}
863
864//
865// clip_model_loader
866//
867
868struct clip_model_loader {
869 ggml_context_ptr ctx_meta;
870 gguf_context_ptr ctx_gguf;
871
872 std::string fname;
873
874 size_t model_size = 0; // in bytes
875
876 bool has_vision = false;
877 bool has_audio = false;
878
879 // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
880 clip_model_loader(const char * fname) : fname(fname) {
881 struct ggml_context * meta = nullptr;
882
883 struct gguf_init_params params = {
884 /*.no_alloc = */ true,
885 /*.ctx = */ &meta,
886 };
887
888 ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params));
889 if (!ctx_gguf.get()) {
890 throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
891 }
892
893 ctx_meta.reset(meta);
894
895 const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
896
897 // print gguf info
898 {
899 std::string name;
900 get_string(KEY_NAME, name, false);
901 std::string description;
902 get_string(KEY_DESCRIPTION, description, false);
903 LOG_INF("%s: model name: %s\n", __func__, name.c_str());
904 LOG_INF("%s: description: %s\n", __func__, description.c_str());
905 LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get()));
906 LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get()));
907 LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
908 LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get()));
909 LOG_INF("\n");
910 }
911
912 // modalities
913 {
914 get_bool(KEY_HAS_VISION_ENC, has_vision, false);
915 get_bool(KEY_HAS_AUDIO_ENC, has_audio, false);
916
917 if (has_vision) {
918 LOG_INF("%s: has vision encoder\n", __func__);
919 }
920 if (has_audio) {
921 LOG_INF("%s: has audio encoder\n", __func__);
922 }
923 }
924
925 // tensors
926 {
927 for (int i = 0; i < n_tensors; ++i) {
928 const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
929 const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i);
930 enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i);
931 ggml_tensor * cur = ggml_get_tensor(meta, name);
932 size_t tensor_size = ggml_nbytes(cur);
933 model_size += tensor_size;
934 LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
935 __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
936 }
937 }
938 }
939
940 void load_hparams(clip_model & model, clip_modality modality) {
941 auto & hparams = model.hparams;
942 std::string log_ffn_op; // for logging
943
944 // sanity check
945 if (modality == CLIP_MODALITY_VISION) {
946 GGML_ASSERT(has_vision);
947 } else if (modality == CLIP_MODALITY_AUDIO) {
948 GGML_ASSERT(has_audio);
949 }
950 model.modality = modality;
953 // projector type
954 std::string proj_type;
955 {
956 // default key
957 get_string(KEY_PROJ_TYPE, proj_type, false);
958
959 // for models with mixed modalities
960 if (proj_type.empty()) {
961 if (modality == CLIP_MODALITY_VISION) {
962 get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
963 } else if (modality == CLIP_MODALITY_AUDIO) {
964 get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
965 } else {
966 GGML_ABORT("unknown modality");
967 }
968 }
969
970 model.proj_type = clip_projector_type_from_string(proj_type);
971
972 if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
973 throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
974 }
975
976 // correct arch for multimodal models (legacy method)
977 if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
978 model.proj_type = modality == CLIP_MODALITY_VISION
979 ? PROJECTOR_TYPE_QWEN25VL
980 : PROJECTOR_TYPE_QWEN2A;
981 }
982 }
983
984 const bool is_vision = model.modality == CLIP_MODALITY_VISION;
985 const bool is_audio = model.modality == CLIP_MODALITY_AUDIO;
986
987 // other hparams
988 {
989 const char * prefix = is_vision ? "vision" : "audio";
990 get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
991 get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
992 get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
993 get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer);
994 get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
995 get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
996
997 if (is_vision) {
998 get_u32(KEY_IMAGE_SIZE, hparams.image_size);
999 get_u32(KEY_PATCH_SIZE, hparams.patch_size);
1000 get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
1001 get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
1002 get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
                if (hparams.minicpmv_query_num == 0) {
                    // fallback to hardcoded values for legacy models
                    switch (hparams.minicpmv_version) {
                        case 3:
                        case 4:
                        case 5:
                        case 6:
                        case 100045:
                            hparams.minicpmv_query_num = 64;
                            break;
                        default:
                            hparams.minicpmv_query_num = 96;
                            break;
                    }
                }
1018 }
1019 } else if (is_audio) {
1020 get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
                // some hparams are unused, but still need to be set to avoid issues
1022 hparams.image_size = 0;
1023 hparams.patch_size = 1;
1024
1025 } else {
1026 GGML_ASSERT(false && "unknown modality");
1027 }
1028
1029 // for pinpoints, we need to convert it into a list of resolution candidates
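            // e.g. pinpoints [336, 672, 672, 336] -> candidates {336x672, 672x336}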
1030 {
1031 std::vector<int> pinpoints;
1032 get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
1033 if (!pinpoints.empty()) {
1034 for (size_t i = 0; i < pinpoints.size(); i += 2) {
1035 hparams.image_res_candidates.push_back({
1036 pinpoints[i],
1037 pinpoints[i+1],
1038 });
1039 }
1040 }
1041 }
1042
1043 // default warmup value
1044 hparams.warmup_image_size = hparams.image_size;
1045
1046 hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
1047 || model.proj_type == PROJECTOR_TYPE_MLP_NORM
1048 || model.proj_type == PROJECTOR_TYPE_LDP
1049 || model.proj_type == PROJECTOR_TYPE_LDPV2;
1050
1051 {
1052 bool use_gelu = false;
1053 bool use_silu = false;
1054 get_bool(KEY_USE_GELU, use_gelu, false);
1055 get_bool(KEY_USE_SILU, use_silu, false);
1056 if (use_gelu && use_silu) {
1057 throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__));
1058 }
1059 if (use_gelu) {
1060 hparams.ffn_op = FFN_GELU;
1061 log_ffn_op = "gelu";
1062 } else if (use_silu) {
1063 hparams.ffn_op = FFN_SILU;
1064 log_ffn_op = "silu";
1065 } else {
1066 hparams.ffn_op = FFN_GELU_QUICK;
1067 log_ffn_op = "gelu_quick";
1068 }
1069 }
1070
1071 {
1072 std::string mm_patch_merge_type;
1073 get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false);
1074 if (mm_patch_merge_type == "spatial_unpad") {
1075 hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
1076 }
1077 }
1078
1079 if (is_vision) {
1080 int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
1081 int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
1082 GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
1083 GGML_ASSERT(idx_std >= 0 && "image_std not found");
1084 const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean);
1085 const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std);
1086 for (int i = 0; i < 3; ++i) {
1087 hparams.image_mean[i] = mean_data[i];
1088 hparams.image_std[i] = std_data[i];
1089 }
1090 }
1091
1092 // Load the vision feature layer indices if they are explicitly provided;
1093 // if multiple vision feature layers are present, the values will be concatenated
1094 // to form the final visual features.
1095 // NOTE: gguf conversions should standardize the values of the vision feature layer to
1096 // be non-negative, since we use -1 to mark values as unset here.
1097 std::vector<int> vision_feature_layer;
1098 get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false);
1099 // convert std::vector to std::unordered_set
1100 for (auto & layer : vision_feature_layer) {
1101 hparams.vision_feature_layer.insert(layer);
1102 }
1103
1104 // model-specific params
1105 switch (model.proj_type) {
1106 case PROJECTOR_TYPE_MINICPMV:
1107 {
1108 if (hparams.minicpmv_version == 0) {
1109 hparams.minicpmv_version = 2; // default to 2 if not set
1110 }
1111 } break;
1112 case PROJECTOR_TYPE_INTERNVL:
1113 {
1114 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1115 } break;
1116 case PROJECTOR_TYPE_IDEFICS3:
1117 {
1118 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1119 get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
1120 } break;
1121 case PROJECTOR_TYPE_LFM2:
1122 {
1123 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1124 // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
1125 hparams.set_limit_image_tokens(64, 256);
1126 } break;
1127 case PROJECTOR_TYPE_PIXTRAL:
1128 case PROJECTOR_TYPE_LIGHTONOCR:
1129 {
1130 // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
1131 // TODO: verify the image_min_tokens
1132 hparams.n_merge = 1; // the original pixtral does not use patch merging
1133 hparams.rope_theta = 10000.0f;
1134 get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
1135 hparams.set_limit_image_tokens(8, 1024);
1136 hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
1137 } break;
1138 case PROJECTOR_TYPE_KIMIVL:
1139 {
1140 hparams.rope_theta = 10000.0f;
1141 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1142 // TODO: check kimivl preprocessor for exact values
1143 hparams.set_limit_image_tokens(8, 1024);
1144 hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
1145 } break;
1146 case PROJECTOR_TYPE_KIMIK25:
1147 {
1148 hparams.rope_theta = 10000.0f;
1149 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1150
1151 int min_pixels = 0, max_pixels = 0;
1152 get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
1153 get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
1154 if (min_pixels > 0 && max_pixels > 0) {
1155 hparams.image_min_pixels = min_pixels;
1156 hparams.image_max_pixels = max_pixels;
1157 hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
1158 } else {
1159 hparams.set_limit_image_tokens(2, 4096);
1160 }
1161 } break;
1162 case PROJECTOR_TYPE_GEMMA3:
1163 {
1164 // default value (used by all model sizes in gemma 3 family)
1165 // number of patches for each **side** is reduced by a factor of 4
1166 hparams.n_merge = 4;
1167 // test model (tinygemma3) has a different value, we optionally read it
1168 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1169 } break;
1171 case PROJECTOR_TYPE_GEMMA3NV:
1172 {
1173 // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
1174 // Similar configuration to Gemma3
1175 hparams.n_merge = 1; // MobileNetV5 handles resizing internally
1176 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1177 } break;
1178 case PROJECTOR_TYPE_QWEN2VL:
1179 case PROJECTOR_TYPE_QWEN25VL:
1180 case PROJECTOR_TYPE_QWEN3VL:
1181 {
1182 hparams.n_merge = 2; // default value for Qwen 2 and 2.5
1183 get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
1184 get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
1185 // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
1186 hparams.set_limit_image_tokens(8, 4096);
1187 hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
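                    // i.e. 1024 tokens, each token covering (n_merge * patch_size)^2 pixels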
1188 const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
1189 if (hparams.image_min_pixels < warn_min_pixels) {
1190 LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
1191 LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
1192 LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
1193 }
1194 } break;
1195 case PROJECTOR_TYPE_YOUTUVL:
1196 {
1197 hparams.n_merge = 2;
1198 get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
1199 get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
1200 std::vector<int> wa_layer_indexes_vec;
1201 get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true);
1202 for (auto & layer : wa_layer_indexes_vec) {
1203 hparams.wa_layer_indexes.insert(layer);
1204 }
                    // support up to max_height * max_width = 8000 * 8000: 8000/16/2 = 250 tokens per side -> 250 * 250 = 62500 image tokens
1206 hparams.set_limit_image_tokens(1, 62500);
1207 hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
1208 } break;
1209 case PROJECTOR_TYPE_GLM4V:
1210 {
1211 hparams.rope_theta = 10000.0f;
1212 hparams.n_merge = 2; // default value for GLM4-V
1213 get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
1214 hparams.set_limit_image_tokens(8, 4096);
1215 hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
1216 } break;
1217 case PROJECTOR_TYPE_LLAMA4:
1218 {
1219 hparams.rope_theta = 10000.0f;
1220 get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
1221 set_llava_uhd_res_candidates(model, 3);
1222 } break;
1223 case PROJECTOR_TYPE_ULTRAVOX:
1224 case PROJECTOR_TYPE_QWEN2A:
1225 case PROJECTOR_TYPE_GLMA:
1226 case PROJECTOR_TYPE_VOXTRAL:
1227 case PROJECTOR_TYPE_MUSIC_FLAMINGO:
1228 {
1229 bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
1230 model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
1231 model.proj_type == PROJECTOR_TYPE_GLMA;
1232 get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
1233 hparams.ffn_op = FFN_GELU_ERF;
1234 log_ffn_op = "gelu_erf"; // temporary solution for logging
1235
1236 // audio preprocessing params
1237 hparams.audio_chunk_len = 30; // in seconds
1238 hparams.audio_sample_rate = 16000;
1239 hparams.audio_n_fft = 400;
1240 hparams.audio_window_len = 400;
1241 hparams.audio_hop_len = 160;
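                    // e.g. 30 s * 16000 Hz / 160 hop = 3000 mel frames per chunk (whisper-style log-mel)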
1242 } break;
1243 case PROJECTOR_TYPE_LFM2A:
1244 {
1245 // audio preprocessing params
1246 hparams.audio_chunk_len = 1; // in seconds
1247 hparams.audio_sample_rate = 16000;
1248 hparams.audio_n_fft = 512;
1249 hparams.audio_window_len = 400;
1250 hparams.audio_hop_len = 160;
1251 } break;
1252 default:
1253 break;
1254 }
1255
1256 // sanity check
1257 {
1258 if (hparams.image_max_pixels < hparams.image_min_pixels) {
1259 throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
1260 }
1261 }
1262
1263 LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
1264 LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
1265 LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
1266 LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
1267 LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
1268 LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
1269 LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
1270 if (is_vision) {
1271 LOG_INF("\n--- vision hparams ---\n");
1272 LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
1273 LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
1274 LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
1275 LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
1276 LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
1277 LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
1278 if (!hparams.wa_layer_indexes.empty()) {
1279 LOG_INF("%s: wa_layer_indexes: ", __func__);
1280 for (auto & layer : hparams.wa_layer_indexes) {
1281 LOG_INF("%d ", layer);
1282 }
1283 LOG_INF("\n");
1284 }
1285 if (hparams.image_min_pixels > 0) {
1286 LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
1287 }
1288 if (hparams.image_max_pixels > 0) {
1289 LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
1290 }
1291 } else if (is_audio) {
1292 LOG_INF("\n--- audio hparams ---\n");
1293 LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
1294 LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
1295 LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len);
1296 LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate);
1297 LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
1298 LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
1299 LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
1300 }
1301 LOG_INF("\n");
1302 LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
1303 LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
1304 }
1305 }
1306
1307 void load_tensors(clip_ctx & ctx_clip) {
1308 auto & model = ctx_clip.model;
1309 auto & hparams = model.hparams;
1310 std::map<std::string, size_t> tensor_offset;
1311 std::vector<ggml_tensor *> tensors_to_load;
1312
1313 // TODO @ngxson : support both audio and video in the future
1314 const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? "a" : "v";
1315
1316 // get offsets
1317 for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
1318 const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
1319 tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i);
1320 }
1321
1322 // create data context
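        // meta-only context: reserve just enough memory for tensor metadata (no_alloc = true);
        // the actual weight data is read into backend buffers afterwards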
1323 struct ggml_init_params params = {
1324 /*.mem_size =*/ static_cast<size_t>(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(),
1325 /*.mem_buffer =*/ NULL,
1326 /*.no_alloc =*/ true,
1327 };
1328 ctx_clip.ctx_data.reset(ggml_init(params));
1329 if (!ctx_clip.ctx_data) {
1330 throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
1331 }
1332
1333 // helper function
1334 auto get_tensor = [&](const std::string & name, bool required = true) {
1335 ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str());
1336 if (!cur && required) {
1337 throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str()));
1338 }
1339 if (cur) {
1340 tensors_to_load.push_back(cur);
1341 // add tensors to context
1342 ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
1343 ggml_set_name(data_tensor, cur->name);
1344 cur = data_tensor;
1345 }
1346 return cur;
1347 };
1348
1349 model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
1350
1351 model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
1352 model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
1353
1354 model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
1355 model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
1356
1357 model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
1358 model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
1359 model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
1360
1361 model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false);
1362 model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false);
1363
1364 model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
1365
1366 if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
1367 hparams.n_layer = 0; // gemma3n does not use normal layer structure
1368 }
1369
1370 // layers
1371 model.layers.resize(hparams.n_layer);
1372 for (int il = 0; il < hparams.n_layer; ++il) {
1373 auto & layer = model.layers[il];
1374 layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
1375 layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
1376 layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false);
1377 layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
1378 layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false);
1379 layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
1380 layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
1381 layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
1382 layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
1383 layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
1384 layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
1385
1386 layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
1387 layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
1388 layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
1389 layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
1390 layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false);
1391 layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
1392 layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
1393
1394 // ffn
1395 layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
1396 layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
1397 layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
1398 layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
1399 layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
1400 layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
1403 // qwen3vl deepstack layer
1404 layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
1405 layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
1406 layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
1407 layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
1408 layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
1409 layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
1410 if (layer.has_deepstack()) {
1411 model.n_deepstack_layers++;
1412 }
1413
            // some models were exported with legacy (incorrect) tensor naming, which is quite messy; we fix it here
            // note: Qwen models converted from the old surgery script have n_ff = 0, so we cannot use n_ff for this check!
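            // heuristic: a correct ff_down weight takes an n_ff-sized input, so if
            // ff_down_w->ne[0] == n_embd, the tensor is actually the up projection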
1416 bool is_ffn_swapped = (
1417 // only old models need this fix
1418 model.proj_type == PROJECTOR_TYPE_MLP
1419 || model.proj_type == PROJECTOR_TYPE_MLP_NORM
1420 || model.proj_type == PROJECTOR_TYPE_LDP
1421 || model.proj_type == PROJECTOR_TYPE_LDPV2
1422 || model.proj_type == PROJECTOR_TYPE_QWEN2VL
1423 || model.proj_type == PROJECTOR_TYPE_QWEN25VL
1424 || model.proj_type == PROJECTOR_TYPE_GLM_EDGE
1425 || model.proj_type == PROJECTOR_TYPE_GEMMA3
1426 || model.proj_type == PROJECTOR_TYPE_IDEFICS3
1427 || model.proj_type == PROJECTOR_TYPE_MINICPMV
1428 ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
1429 if (is_ffn_swapped) {
1430 // swap up and down weights
1431 ggml_tensor * tmp = layer.ff_up_w;
1432 layer.ff_up_w = layer.ff_down_w;
1433 layer.ff_down_w = tmp;
1434 // swap up and down biases
1435 tmp = layer.ff_up_b;
1436 layer.ff_up_b = layer.ff_down_b;
1437 layer.ff_down_b = tmp;
1438 if (il == 0) {
1439 LOG_WRN("%s: ffn up/down are swapped\n", __func__);
1440 }
1441 }
1442 }
1445 switch (model.proj_type) {
1446 case PROJECTOR_TYPE_MLP:
1447 case PROJECTOR_TYPE_MLP_NORM:
1448 {
1449 // LLaVA projection
1450 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false);
1451 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
1452 // Yi-type llava
1453 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false);
1454 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
1455 // missing in Yi-type llava
1456 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false);
1457 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
1458 // Yi-type llava
1459 model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false);
1460 model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false);
1461 model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false);
1462 model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false);
1463 if (model.mm_3_w) {
1464 // TODO: this is a hack to support Yi-type llava
1465 model.proj_type = PROJECTOR_TYPE_MLP_NORM;
1466 }
1467 model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
1468 } break;
1469 case PROJECTOR_TYPE_LDP:
1470 {
1471 // MobileVLM projection
1472 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
1473 model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
1474 model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
1475 model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
1476 model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
1477 model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
1478 model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
1479 model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
1480 model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
1481 model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
1482 model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
1483 model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
1484 model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
1485 model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
1486 model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
1487 model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
1488 model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
1489 model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
1490 model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
1491 model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
1492 model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
1493 model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
1494 model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
1495 model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
1496 } break;
1497 case PROJECTOR_TYPE_LDPV2:
1498 {
                    // MobileVLM v2 projection
1500 model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
1501 model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
1502 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
1503 model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias"));
1504 model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight"));
1505 model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias"));
1506 } break;
1507 case PROJECTOR_TYPE_MINICPMV:
1508 {
1509 // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
1510 model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K);
1511 model.mm_model_query = get_tensor(TN_MINICPMV_QUERY);
1512 model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ);
1513 model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ);
1514 model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight"));
1515 model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight"));
1516 model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight"));
1517 model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias"));
1518 model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias"));
1519 model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias"));
1520 model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight"));
1521 model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias"));
1522 model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight"));
1523 model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias"));
1524 model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight"));
1525 model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias"));
1526 model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight"));
1527 model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias"));
1528 } break;
1529 case PROJECTOR_TYPE_GLM_EDGE:
1530 {
1531 model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight"));
1532 model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias"));
1533 model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight"));
1534 model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight"));
1535 model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias"));
1536 model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight"));
1537 model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight"));
1538 model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight"));
1539 model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI));
1540 model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI));
1541 } break;
1542 case PROJECTOR_TYPE_QWEN2VL:
1543 case PROJECTOR_TYPE_QWEN25VL:
1544 {
1545 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
1546 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
1547 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1548 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1549 } break;
1550 case PROJECTOR_TYPE_QWEN3VL:
1551 {
1552 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
1553 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
1554 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1555 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1556 } break;
1557 case PROJECTOR_TYPE_YOUTUVL:
1558 {
1559 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
1560 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0
1561 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
1562 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
1563 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1564 } break;
1565 case PROJECTOR_TYPE_GLM4V:
1566 {
1567 model.projection = get_tensor(TN_MM_PROJECTOR);
1568 model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight"));
1569 model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false);
1570 model.mm_ffn_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
1571 model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false);
1572 model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight"));
1573 model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false);
1574 model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
1575 model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false);
1576 model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"));
1577 model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias"));
1578 } break;
1579 case PROJECTOR_TYPE_GEMMA3:
1580 {
1581 model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
1582 model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
1583 } break;
1584 case PROJECTOR_TYPE_GEMMA3NV:
1585 {
1586 model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
1587 model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
1588 model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
1589
1590 model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
1591 model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // consume the BN tensor if present (it is usually folded into the conv weights)
1592 model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
1593 model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
1594
1595 model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
1596
1597 // Dynamically load blocks stage by stage
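// Block types are detected by probing for type-specific tensors (all loaded with required=false):
// Edge Residual (S0) blocks expose a conv expansion weight, UIR blocks expose dw_start and/or
// pw_exp weights, and attention (MQA) blocks expose a Q projection weight. Probing a stage
// stops at the first block index for which none of these tensors exist.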
1598 for (int stage = 0; stage < 4; ++stage) {
1599 int blocks_found_in_stage = 0;
1600
1601 for (int blk_idx = 0; ; ++blk_idx) {
1602 bool found_block = false;
1603 mobilenetv5_block block;
1604
1605 // 1. Check for Edge Residual (S0)
1606 block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
1607 if (block.s0_conv_exp_w) {
1608 found_block = true;
1609 block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
1610 block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
1611 block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
1612 }
1613 // 2. Check for UIR (Universal Inverted Residual)
1614 else {
1615 // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
1616 block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
1617 block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
1618
1619 if (block.dw_start_w || block.pw_exp_w) {
1620 found_block = true;
1621 if (block.dw_start_w) {
1622 block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
1623 }
1624 if (block.pw_exp_w) {
1625 block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
1626 }
1627 block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
1628 if (block.dw_mid_w) {
1629 block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
1630 }
1631 block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
1632 if (block.pw_proj_w) {
1633 block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
1634 }
1635 block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
1636 }
1637 }
1638
1639 // 3. Check for Attention (MQA)
1640 // Even if UIR/Edge check failed, this might be a pure attention block
1641 ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
1642 if (attn_q_check) {
1643 found_block = true;
1644 block.attn_q_w = attn_q_check;
1645 block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
1646 block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
1647 block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
1648 block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
1649 block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
1650 block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
1651 block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
1652 block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
1653 // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
1654 if (!block.layer_scale_w) {
1655 block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
1656 }
1657 }
1658
1659 if (found_block) {
1660 model.mobilenet_blocks.push_back(block);
1661 blocks_found_in_stage++;
1662 } else {
1663 // End of blocks for this stage
1664 break;
1665 }
1666 }
1667
1668 // Track where this stage ends in the flat vector
1669 if (blocks_found_in_stage > 0) {
1670 model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
1671 LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
1672 }
1673 }
1674 model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
1675 model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
1676 } break;
1677 case PROJECTOR_TYPE_IDEFICS3:
1678 {
1679 model.projection = get_tensor(TN_MM_PROJECTOR);
1680 } break;
1681 case PROJECTOR_TYPE_LFM2:
1682 {
1683 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
1684 model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false);
1685 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
1686 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
1687 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1688 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1689 } break;
1690 case PROJECTOR_TYPE_KIMIVL:
1691 case PROJECTOR_TYPE_KIMIK25:
1692 {
1693 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
1694 model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
1695 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
1696 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
1697 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1698 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
1699 } break;
1700 case PROJECTOR_TYPE_PIXTRAL:
1701 {
1702 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
1703 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
1704 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1705 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
1706 // [IMG_BREAK] token embedding
1707 model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK);
1708 // for mistral small 3.1
1709 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
1710 model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
1711 } break;
1712 case PROJECTOR_TYPE_LIGHTONOCR:
1713 {
1714 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
1715 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
1716 model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
1717 model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
1718 model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
1719 model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
1720 } break;
1721 case PROJECTOR_TYPE_ULTRAVOX:
1722 {
1723 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
1724 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
1725 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
1726 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
1727 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
1728 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
1729 model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
1730 model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
1731 } break;
1732 case PROJECTOR_TYPE_QWEN2A:
1733 {
1734 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
1735 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
1736 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
1737 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
1738 model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight"));
1739 model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias"));
1740 } break;
1741 case PROJECTOR_TYPE_VOXTRAL:
1742 {
1743 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
1744 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
1745 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
1746 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
1747 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
1748 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
1749 } break;
1750 case PROJECTOR_TYPE_MUSIC_FLAMINGO:
1751 {
1752 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
1753 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
1754 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
1755 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
1756 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
1757 model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
1758 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
1759 model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
1760 } break;
1761 case PROJECTOR_TYPE_INTERNVL:
1762 {
1763 model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
1764 model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias"));
1765 model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
1766 model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias"));
1767 model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
1768 model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
1769 } break;
1770 case PROJECTOR_TYPE_GLMA:
1771 {
1772 model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
1773 model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
1774 model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
1775 model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
1776 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
1777 model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
1778 model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
1779 model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
1780 model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
1781 model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
1782 model.mm_boi = get_tensor(string_format(TN_TOK_BOI));
1783 model.mm_eoi = get_tensor(string_format(TN_TOK_EOI));
1784 } break;
1785 case PROJECTOR_TYPE_LLAMA4:
1786 {
1787 model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
1788 model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
1789 model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
1790 } break;
1791 case PROJECTOR_TYPE_COGVLM:
1792 {
1793 model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
1794 model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight"));
1795 model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias"));
1796 model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight"));
1797 model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight"));
1798 model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight"));
1799 model.mm_boi = get_tensor(TN_TOK_BOI);
1800 model.mm_eoi = get_tensor(TN_TOK_EOI);
1801 } break;
1802 case PROJECTOR_TYPE_JANUS_PRO:
1803 {
1804 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
1805 model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
1806 model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
1807 model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
1808 } break;
1809 case PROJECTOR_TYPE_LFM2A:
1810 {
1811 for (int i : {0, 2, 3, 5, 6}) {
1812 model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight"));
1813 model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias"));
1814 }
1815 model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight"));
1816 model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias"));
1817
1818 model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
1819 model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
1820 model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
1821 model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
1822 model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
1823 model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
1824
1825 for (int il = 0; il < hparams.n_layer; ++il) {
1826 auto & layer = model.layers[il];
1827
1828 layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
1829 layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
1830 layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
1831 layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
1832 layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
1833 layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
1834 layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
1835 layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
1836
1837 layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il));
1838 layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il));
1839
1840 layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
1841 layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
1842
1843 layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, prefix, il, "weight"));
1844
1845 layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
1846 layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
1847 layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
1848 layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"));
1849 layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
1850 layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
1851 layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
1852 layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
1853 }
1854 } break;
1855 default:
1856 GGML_ASSERT(false && "unknown projector type");
1857 }
1858
1859 // load data
1860 {
1861 std::vector<uint8_t> read_buf;
1862
1863 auto fin = std::ifstream(fname, std::ios::binary);
1864 if (!fin) {
1865 throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
1866 }
1867
1868 // alloc memory and offload data
1869 ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
1870 ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
1871 ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
1872 for (auto & t : tensors_to_load) {
1873 ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
1874 const size_t offset = tensor_offset[t->name];
1875 fin.seekg(offset, std::ios::beg);
1876 if (!fin) {
1877 throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
1878 }
1879 size_t num_bytes = ggml_nbytes(cur);
1880 if (ggml_backend_buft_is_host(buft)) {
1881 // for the CPU and Metal backends, we can read directly into the tensor
1882 fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
1883 } else {
1884 // read into a temporary buffer first, then copy to device memory
1885 read_buf.resize(num_bytes);
1886 fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
1887 ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
1888 }
1889 }
1890 fin.close();
1891
1892 LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
1893 }
1894 }
1895
1896 struct support_info_op {
1897 ggml_tensor * op;
1898
1899 // true if the op runs on the accelerated ctx_clip.backend
1900 bool is_accel = true;
1901 };
1902
1903 struct support_info_graph {
1904 // whether the clip_ctx.backend supports flash attention
1905 bool fattn = true;
1906 ggml_tensor * fattn_op = nullptr; // for debugging
1907
1908 std::vector<support_info_op> ops;
1909 };
1910
1911 static void warmup(clip_ctx & ctx_clip) {
1912 // create a fake batch
1913 const auto & hparams = ctx_clip.model.hparams;
1914 clip_image_f32_batch batch;
1915 clip_image_f32_ptr img(clip_image_f32_init());
1916 if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
1917 img->nx = hparams.warmup_image_size;
1918 img->ny = hparams.warmup_image_size;
1919 LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
1920 } else {
1921 img->nx = hparams.warmup_audio_size;
1922 img->ny = hparams.n_mel_bins;
1923 LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
1924 }
1925 batch.entries.push_back(std::move(img));
1926 warmup(ctx_clip, batch);
1927 }
1928
1929 static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
1930 support_info_graph info;
1931
1932 if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
1933 // try to enable flash attention to see if it's supported
1934 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
1935 info = alloc_compute_meta(ctx_clip, batch);
1936 if (!info.fattn && info.fattn_op) {
1937 auto op = info.fattn_op;
1938 LOG_WRN("%s: *****************************************************************\n", __func__);
1939 LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend));
1940 LOG_WRN("%s: op params: \n", __func__);
1941 static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) {
1942 LOG_WRN("%s: %s: type = %s, ne = [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "], nb = [%zu %zu %zu %zu]\n", fn,
1943 name, ggml_type_name(t->type),
1944 t->ne[0], t->ne[1], t->ne[2], t->ne[3],
1945 t->nb[0], t->nb[1], t->nb[2], t->nb[3]);
1946 };
1947 print_shape(__func__, " dst", op);
1948 print_shape(__func__, "src0", op->src[0]);
1949 print_shape(__func__, "src1", op->src[1]);
1950 print_shape(__func__, "src2", op->src[2]);
1951 LOG_WRN("%s: please report this on github as an issue\n", __func__);
1952 LOG_WRN("%s: *****************************************************************\n", __func__);
1953 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
1954 alloc_compute_meta(ctx_clip, batch);
1955 }
1956 } else {
1957 info = alloc_compute_meta(ctx_clip, batch);
1958 if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
1959 LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
1960 }
1961 }
1962
1963 ctx_clip.is_allocated = true; // mark buffers as allocated
1964
1965 LOG_INF("%s: flash attention is %s\n", __func__,
1966 (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
1967
1968 // print ops that are not supported by the GPU backend (if there is one)
1969 if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) {
1970 std::vector<support_info_op> unsupported_ops;
1971 for (const auto & op : info.ops) {
1972 if (!op.is_accel) {
1973 unsupported_ops.push_back(op);
1974 }
1975 }
1976 if (!unsupported_ops.empty()) {
1977 LOG_WRN("%s: *****************************************************************\n", __func__);
1978 LOG_WRN("%s: WARNING: the CLIP graph uses operators that are not supported by the backend\n", __func__);
1979 LOG_WRN("%s: performance will be suboptimal\n", __func__);
1980 LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend));
1981 for (const auto & op : unsupported_ops) {
1982 LOG_WRN("%s: %16s: type = %s, ne = [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "]\n", __func__,
1983 ggml_op_name(op.op->op),
1984 ggml_type_name(op.op->type),
1985 op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]);
1986 }
1987 LOG_WRN("%s: flash attention is %s\n", __func__,
1988 (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");
1989 LOG_WRN("%s: please report this on github as an issue\n", __func__);
1990 LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__);
1991 LOG_WRN("%s: *****************************************************************\n", __func__);
1992 }
1993 }
1994 }
1995
1996 static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
1997 ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
1998
1999 ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
2000 ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
2001
2002 for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
2003 ggml_backend_t backend = ctx_clip.backend_ptrs[i];
2004 ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
2005 size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend);
2006 if (size > 1) {
2007 LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
2008 ggml_backend_buft_name(buft),
2009 size / 1024.0 / 1024.0);
2010 }
2011 }
2012
2013 const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
2014 const int n_nodes = ggml_graph_n_nodes(gf);
2015
2016 LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes);
2017
2018 support_info_graph res {
2019 /*.fattn = */ true,
2020 /*.fattn_op = */ nullptr,
2021 /*.ops = */ {},
2022 };
2023
2024 // check op support
2025 for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
2026 ggml_tensor * node = ggml_graph_node(gf, i);
2027 res.ops.push_back({node, true});
2028 if (!ggml_backend_supports_op(ctx_clip.backend, node)) {
2029 res.ops.back().is_accel = false;
2030 if (node->op == GGML_OP_FLASH_ATTN_EXT) {
2031 res.fattn = false;
2032 res.fattn_op = node;
2033 }
2034 }
2035 }
2036
2037 return res;
2038 }
2039
2040 void get_bool(const std::string & key, bool & output, bool required = true) const {
2041 const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2042 if (i < 0) {
2043 if (required) {
2044 throw std::runtime_error("Key not found: " + key);
2045 }
2046 return;
2047 }
2048 output = gguf_get_val_bool(ctx_gguf.get(), i);
2049 }
2050
2051 void get_i32(const std::string & key, int & output, bool required = true) const {
2052 const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2053 if (i < 0) {
2054 if (required) {
2055 throw std::runtime_error("Key not found: " + key);
2056 }
2057 return;
2058 }
2059 output = gguf_get_val_i32(ctx_gguf.get(), i);
2060 }
2061
2062 void get_u32(const std::string & key, int & output, bool required = true) const {
2063 const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2064 if (i < 0) {
2065 if (required) {
2066 throw std::runtime_error("Key not found: " + key);
2067 }
2068 return;
2069 }
2070 output = gguf_get_val_u32(ctx_gguf.get(), i);
2071 }
2072
2073 void get_f32(const std::string & key, float & output, bool required = true) const {
2074 const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2075 if (i < 0) {
2076 if (required) {
2077 throw std::runtime_error("Key not found: " + key);
2078 }
2079 return;
2080 }
2081 output = gguf_get_val_f32(ctx_gguf.get(), i);
2082 }
2083
2084 void get_string(const std::string & key, std::string & output, bool required = true) const {
2085 const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2086 if (i < 0) {
2087 if (required) {
2088 throw std::runtime_error("Key not found: " + key);
2089 }
2090 return;
2091 }
2092 output = std::string(gguf_get_val_str(ctx_gguf.get(), i));
2093 }
2094
2095 void get_arr_int(const std::string & key, std::vector<int> & output, bool required = true) const {
2096 const int i = gguf_find_key(ctx_gguf.get(), key.c_str());
2097 if (i < 0) {
2098 if (required) {
2099 throw std::runtime_error("Key not found: " + key);
2100 }
2101 return;
2102 }
2103 const int n = (int) gguf_get_arr_n(ctx_gguf.get(), i);
2104 output.resize(n);
2105 const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i);
2106 for (int j = 0; j < n; ++j) {
2107 output[j] = values[j];
2108 }
2109 }
2110
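// Enumerates every grid of base-resolution tiles up to max_patches_per_side in each dimension,
// skipping the 1x1 grid (which equals the plain overview image). With illustrative values
// image_size = 336 and max_patches_per_side = 2, the candidates are 336x672, 672x336 and 672x672.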
2111 static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
2112 auto & hparams = model.hparams;
2113 for (int x = 1; x <= max_patches_per_side; x++) {
2114 for (int y = 1; y <= max_patches_per_side; y++) {
2115 if (x == 1 && y == 1) {
2116 continue; // skip the first point
2117 }
2118 hparams.image_res_candidates.push_back(clip_image_size{
2119 x*hparams.image_size,
2120 y*hparams.image_size,
2121 });
2122 }
2123 }
2124 }
2125};
2126
2127struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
2128 clip_ctx * ctx_vision = nullptr;
2129 clip_ctx * ctx_audio = nullptr;
2130
2131 try {
2132 clip_model_loader loader(fname);
2133 bool skip_audio = false;
2134
2135 if (loader.has_vision) {
2136 ctx_vision = new clip_ctx(ctx_params);
2137 loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION);
2138 loader.load_tensors(*ctx_vision);
2139 if (ctx_params.warmup) {
2140 loader.warmup(*ctx_vision);
2141 }
2142
2143 // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
2144 // we can remove this check when we implement audio support for Gemma 3N
2145 skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
2146
2147 // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
2148 }
2149
2150 if (loader.has_audio && !skip_audio) {
2151 ctx_audio = new clip_ctx(ctx_params);
2152 loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
2153 loader.load_tensors(*ctx_audio);
2154 if (ctx_params.warmup) {
2155 loader.warmup(*ctx_audio);
2156 }
2157 }
2158
2159 } catch (const std::exception & e) {
2160 LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what());
2161
2162 delete ctx_vision;
2163 delete ctx_audio;
2164
2165 return {nullptr, nullptr};
2166 }
2167
2168 return {ctx_vision, ctx_audio};
2169}
2170
2171struct clip_image_size * clip_image_size_init() {
2172 struct clip_image_size * load_image_size = new struct clip_image_size();
2173 load_image_size->width = 448;
2174 load_image_size->height = 448;
2175 return load_image_size;
2176}
2177
2178struct clip_image_u8 * clip_image_u8_init() {
2179 return new clip_image_u8();
2180}
2181
2182struct clip_image_f32 * clip_image_f32_init() {
2183 return new clip_image_f32();
2184}
2185
2186struct clip_image_f32_batch * clip_image_f32_batch_init() {
2187 return new clip_image_f32_batch();
2188}
2189
2190unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
2191 if (nx) *nx = img->nx;
2192 if (ny) *ny = img->ny;
2193 return img->buf.data();
2194}
2195
2196void clip_image_size_free(struct clip_image_size * load_image_size) {
2197 if (load_image_size == nullptr) {
2198 return;
2199 }
2200 delete load_image_size;
2201}
2202void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
2203void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
2204void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; }
2205void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; }
2206
2207size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
2208 return batch->entries.size();
2209}
2210
2211size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
2212 if (idx < 0 || idx >= (int)batch->entries.size()) {
2213 LOG_ERR("%s: invalid index %d\n", __func__, idx);
2214 return 0;
2215 }
2216 return batch->entries[idx]->nx;
2217}
2218
2219size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
2220 if (idx < 0 || idx >= (int)batch->entries.size()) {
2221 LOG_ERR("%s: invalid index %d\n", __func__, idx);
2222 return 0;
2223 }
2224 return batch->entries[idx]->ny;
2225}
2226
2227clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
2228 if (idx < 0 || idx >= (int)batch->entries.size()) {
2229 LOG_ERR("%s: invalid index %d\n", __func__, idx);
2230 return nullptr;
2231 }
2232 return batch->entries[idx].get();
2233}
2234
2235void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
2236 img->nx = nx;
2237 img->ny = ny;
2238 img->buf.resize(3 * nx * ny);
2239 memcpy(img->buf.data(), rgb_pixels, img->buf.size());
2240}
2241
2242// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
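// e.g. with illustrative per-channel values mean = 0.5 and std = 0.5, a u8 pixel of 255 maps to
// (255/255 - 0.5) / 0.5 = 1.0 and a pixel of 0 maps to -1.0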
2243static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
2244 dst.nx = src.nx;
2245 dst.ny = src.ny;
2246 dst.buf.resize(src.buf.size());
2247
2248 // TODO @ngxson : seems like this could be done more efficiently on cgraph
2249 for (size_t i = 0; i < src.buf.size(); ++i) {
2250 int c = i % 3; // rgb
2251 dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
2252 }
2253}
2254
2255// set of tools to manipulate images
2256// in the future, we can add HW acceleration by allowing this struct to access third-party libs such as ImageMagick or OpenCV
2257struct img_tool {
2258 enum resize_algo {
2259 RESIZE_ALGO_BILINEAR,
2260 RESIZE_ALGO_BICUBIC,
2261 // RESIZE_ALGO_LANCZOS, // TODO
2262 };
2263
2264 static void resize(
2265 const clip_image_u8 & src,
2266 clip_image_u8 & dst,
2267 const clip_image_size & target_resolution,
2268 resize_algo algo,
2269 bool add_padding = true, // TODO: define the behavior for add_padding = false
2270 std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
2271 dst.nx = target_resolution.width;
2272 dst.ny = target_resolution.height;
2273 dst.buf.resize(3 * dst.nx * dst.ny);
2274
2275 if (dst.nx == src.nx && dst.ny == src.ny) {
2276 // no resize needed, simple copy
2277 dst.buf = src.buf;
2278 return;
2279 }
2280
2281 if (!add_padding) {
2282 // direct resize
2283 switch (algo) {
2284 case RESIZE_ALGO_BILINEAR:
2285 resize_bilinear(src, dst, target_resolution.width, target_resolution.height);
2286 break;
2287 case RESIZE_ALGO_BICUBIC:
2288 resize_bicubic(src, dst, target_resolution.width, target_resolution.height);
2289 break;
2290 default:
2291 throw std::runtime_error("Unsupported resize algorithm");
2292 }
2293 } else {
2294 // resize with padding
2295 clip_image_u8 resized_image;
2296 float scale_w = static_cast<float>(target_resolution.width) / src.nx;
2297 float scale_h = static_cast<float>(target_resolution.height) / src.ny;
2298 float scale = std::min(scale_w, scale_h);
2299 int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
2300 int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
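// e.g. resizing a 640x480 source into a 512x512 target (illustrative sizes):
// scale = min(512/640, 512/480) = 0.8 -> 512x384 content, centered with
// (512 - 384) / 2 = 64 rows of padding above and below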
2301
2302 switch (algo) {
2303 case RESIZE_ALGO_BILINEAR:
2304 resize_bilinear(src, resized_image, new_width, new_height);
2305 break;
2306 case RESIZE_ALGO_BICUBIC:
2307 resize_bicubic(src, resized_image, new_width, new_height);
2308 break;
2309 default:
2310 throw std::runtime_error("Unsupported resize algorithm");
2311 }
2312
2313 // fill dst with pad_color
2314 fill(dst, pad_color);
2315
2316 int offset_x = (target_resolution.width - new_width) / 2;
2317 int offset_y = (target_resolution.height - new_height) / 2;
2318
2319 composite(dst, resized_image, offset_x, offset_y);
2320 }
2321 }
2322
2323 static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
2324 dst.nx = w;
2325 dst.ny = h;
2326 dst.buf.resize(3 * w * h);
2327
2328 for (int i = 0; i < h; ++i) {
2329 for (int j = 0; j < w; ++j) {
2330 int src_idx = 3 * ((y + i)*image.nx + (x + j));
2331 int dst_idx = 3 * (i*w + j);
2332 dst.buf[dst_idx] = image.buf[src_idx];
2333 dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
2334 dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
2335 }
2336 }
2337 }
2338
2339 // calculate the size of the **resized** image, while preserving the aspect ratio
2340 // the calculated size is aligned up to the next multiple of align_size
2341 // the image is scaled (up or down) so that its longer side matches longest_edge
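// e.g. with illustrative values: a 1000x500 input, align_size = 14 and longest_edge = 448
// gives scale = min(448/1000, 448/500) = 0.448 -> 448x224; both sides are already
// multiples of 14, so the result is {448, 224}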
2342 static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
2343 GGML_ASSERT(align_size > 0);
2344 if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
2345 return {0, 0};
2346 }
2347
2348 float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
2349 static_cast<float>(longest_edge) / inp_size.height);
2350
2351 float target_width_f = static_cast<float>(inp_size.width) * scale;
2352 float target_height_f = static_cast<float>(inp_size.height) * scale;
2353
2354 auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
2355 int aligned_width = ceil_by_factor(target_width_f);
2356 int aligned_height = ceil_by_factor(target_height_f);
2357
2358 return {aligned_width, aligned_height};
2359 }
2360
2361 // calculate the size of the **resized** image, while preserving the aspect ratio
2362 // the calculated size will have min_pixels <= W*H <= max_pixels
2363 // this is referred to as "smart_resize" in the transformers code
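// e.g. with illustrative values: a 1000x500 input, align_size = 28 and max_pixels = 250000
// first rounds to 1008x504 = 508032 pixels; since this exceeds max_pixels,
// beta = sqrt(500000 / 250000) ~= 1.414 and both sides are floored to multiples of 28,
// giving 700x336 = 235200 <= max_pixels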
2364 static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
2365 GGML_ASSERT(align_size > 0);
2366 const int width = inp_size.width;
2367 const int height = inp_size.height;
2368
2369 auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
2370 auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
2371 auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
2372
2373 // round to the nearest multiple of align_size first
2374 int h_bar = std::max(align_size, round_by_factor(height));
2375 int w_bar = std::max(align_size, round_by_factor(width));
2376
2377 if (h_bar * w_bar > max_pixels) {
2378 const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
2379 h_bar = std::max(align_size, floor_by_factor(height / beta));
2380 w_bar = std::max(align_size, floor_by_factor(width / beta));
2381 } else if (h_bar * w_bar < min_pixels) {
2382 const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
2383 h_bar = ceil_by_factor(height * beta);
2384 w_bar = ceil_by_factor(width * beta);
2385 }
2386
2387 return {w_bar, h_bar};
2388 }
2389
2390 // draw src image into dst image at offset (offset_x, offset_y)
2391 static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
2392 for (int y = 0; y < src.ny; ++y) {
2393 for (int x = 0; x < src.nx; ++x) {
2394 int dx = x + offset_x;
2395 int dy = y + offset_y;
2396 // skip pixels that would be out of bounds in the destination
2397 if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
2398 continue;
2399 }
2400 size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
2401 size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
2402 dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
2403 dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
2404 dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
2405 }
2406 }
2407 }
2408
2409 // fill the image with a solid color
2410 static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
2411 for (size_t i = 0; i < img.buf.size(); i += 3) {
2412 img.buf[i] = color[0];
2413 img.buf[i + 1] = color[1];
2414 img.buf[i + 2] = color[2];
2415 }
2416 }
2417
2418private:
2419 // Bilinear resize function
2420 static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
2421 dst.nx = target_width;
2422 dst.ny = target_height;
2423 dst.buf.resize(3 * target_width * target_height);
2424
2425 float x_ratio = static_cast<float>(src.nx - 1) / target_width;
2426 float y_ratio = static_cast<float>(src.ny - 1) / target_height;
2427
2428 for (int y = 0; y < target_height; y++) {
2429 for (int x = 0; x < target_width; x++) {
2430 float px = x_ratio * x;
2431 float py = y_ratio * y;
2432 int x_floor = static_cast<int>(px);
2433 int y_floor = static_cast<int>(py);
2434 float x_lerp = px - x_floor;
2435 float y_lerp = py - y_floor;
2436
2437 for (int c = 0; c < 3; c++) {
2438 float top = lerp(
2439 static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
2440 static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
2441 x_lerp
2442 );
2443 float bottom = lerp(
2444 static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
2445 static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
2446 x_lerp
2447 );
2448 dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
2449 }
2450 }
2451 }
2452 }
2453
2454 // Bicubic resize function
2455 // note: if the target aspect ratio differs, the image is stretched (not cropped)
2456 static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
2457 const int nx = img.nx;
2458 const int ny = img.ny;
2459
2460 dst.nx = target_width;
2461 dst.ny = target_height;
2462 dst.buf.resize(3 * target_width * target_height);
2463
2464 float Cc;
2465 float C[5] = {};
2466 float d0, d2, d3, a0, a1, a2, a3;
2467 int i, j, k, jj;
2468 int x, y;
2469 float dx, dy;
2470 float tx, ty;
2471
2472 tx = (float)nx / (float)target_width;
2473 ty = (float)ny / (float)target_height;
2474
2475 // Bicubic interpolation; adapted from ViT.cpp, inspired from :
2476 // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
2477 // -> https://en.wikipedia.org/wiki/Bicubic_interpolation
2478
2479 for (i = 0; i < target_height; i++) {
2480 for (j = 0; j < target_width; j++) {
2481 x = (int)(tx * j);
2482 y = (int)(ty * i);
2483
2484 dx = tx * j - x;
2485 dy = ty * i - y;
2486
2487 for (k = 0; k < 3; k++) {
2488 for (jj = 0; jj <= 3; jj++) {
2489 d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2490 d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2491 d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2492 a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
2493
2494 a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2495 a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2496 a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2497
2498 C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
2499
2500 d0 = C[0] - C[1];
2501 d2 = C[2] - C[1];
2502 d3 = C[3] - C[1];
2503 a0 = C[1];
2504 a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
2505 a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
2506 a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
2507 Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
2508
2509 const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
2510 dst.buf[(i * target_width + j) * 3 + k] = Cc2;
2511 }
2512 }
2513 }
2514 }
2515
2516 return true;
2517 }
2518
2519 static inline int clip(int x, int lower, int upper) {
2520 return std::max(lower, std::min(x, upper));
2521 }
2522
2523 // Linear interpolation between two points
2524 static inline float lerp(float s, float e, float t) {
2525 return s + (e - s) * t;
2526 }
2527};
2528
2529/**
2530 * implementation of LLaVA-UHD:
2531 * - https://arxiv.org/pdf/2403.11703
2532 * - https://github.com/thunlp/LLaVA-UHD
2533 * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
2534 *
2535 * overview:
2536 * - an image always has a single overview (downscaled image)
2537 * - an image can have 0 or multiple slices, depending on the image size
2538 * - each slice can then be considered as a separate image
2539 *
2540 * for example:
2541 *
2542 * [overview] --> [slice 1] --> [slice 2]
2543 * | |
2544 * +--> [slice 3] --> [slice 4]
2545 */
2546struct llava_uhd {
2547 struct slice_coordinates {
2548 int x;
2549 int y;
2550 clip_image_size size;
2551 };
2552
2553 struct slice_instructions {
2554 clip_image_size overview_size; // size of downscaled image
2555 clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
2556 clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
2557 std::vector<slice_coordinates> slices;
2558
2559 img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
2560 bool padding_overview = false; // if true, the overview image will be padded (letterboxed) to overview_size
2561 std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};
2562
2563 img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
2564 bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6)
2565 std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
2566 };
2567
2568 static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
2569 slice_instructions res;
2570 const int patch_size = clip_get_patch_size(ctx);
2571 const int slice_size = clip_get_image_size(ctx);
2572 const int original_width = original_size.width;
2573 const int original_height = original_size.height;
2574
2575 const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
2576 const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
2577
2578 if (!has_slices) {
2579 // skip slicing logic
2580 res.overview_size = clip_image_size{slice_size, slice_size};
2581 res.refined_size = clip_image_size{0, 0};
2582 res.grid_size = clip_image_size{0, 0};
2583
2584 return res;
2585 }
2586
2587 if (has_pinpoints) {
2588 // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
2589 auto refine_size = llava_uhd::select_best_resolution(
2590 original_size,
2591 ctx->model.hparams.image_res_candidates);
2592 res.overview_size = clip_image_size{slice_size, slice_size};
2593 res.refined_size = refine_size;
2594 res.grid_size = clip_image_size{0, 0};
2595 res.padding_refined = true;
2596 res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding
2597
2598 LOG_DBG("%s: using pinpoints for slicing\n", __func__);
2599 LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
2600 __func__, original_width, original_height,
2601 res.overview_size.width, res.overview_size.height,
2602 res.refined_size.width, res.refined_size.height);
2603
2604 for (int y = 0; y < refine_size.height; y += slice_size) {
2605 for (int x = 0; x < refine_size.width; x += slice_size) {
2606 slice_coordinates slice;
2607 slice.x = x;
2608 slice.y = y;
2609 slice.size.width = std::min(slice_size, refine_size.width - x);
2610 slice.size.height = std::min(slice_size, refine_size.height - y);
2611 res.slices.push_back(slice);
2612 LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
2613 __func__, (int)res.slices.size() - 1,
2614 slice.x, slice.y, slice.size.width, slice.size.height);
2615 }
2616 }
2617
2618 res.grid_size.height = refine_size.height / slice_size;
2619 res.grid_size.width = refine_size.width / slice_size;
2620 LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
2621
2622 return res;
2623 }
2624
2625 // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
2626
2627 auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices);
2628 res.overview_size = best_size;
2629
2630 {
2631 const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
2632 const float log_ratio = log((float)original_width / original_height);
2633 const float ratio = (float)original_width * original_height / (slice_size * slice_size);
2634 const int multiple = fmin(ceil(ratio), max_slice_nums);
2635
2636 auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio);
2637 auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
2638 res.grid_size = best_grid;
2639 res.refined_size = refine_size;
2640
2641 LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
2642 __func__, original_width, original_height,
2643 res.overview_size.width, res.overview_size.height,
2644 res.refined_size.width, res.refined_size.height,
2645 res.grid_size.width, res.grid_size.height);
2646
2647 int width = refine_size.width;
2648 int height = refine_size.height;
2649 int grid_x = int(width / best_grid.width);
2650 int grid_y = int(height / best_grid.height);
2651 for (int patches_y = 0, ic = 0;
2652 patches_y < refine_size.height && ic < best_grid.height;
2653 patches_y += grid_y, ic += 1) {
2654 for (int patches_x = 0, jc = 0;
2655 patches_x < refine_size.width && jc < best_grid.width;
2656 patches_x += grid_x, jc += 1) {
2657 slice_coordinates slice;
2658 slice.x = patches_x;
2659 slice.y = patches_y;
2660 slice.size.width = grid_x;
2661 slice.size.height = grid_y;
2662 res.slices.push_back(slice);
2663 LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
2664 __func__, (int)res.slices.size() - 1,
2665 slice.x, slice.y, slice.size.width, slice.size.height);
2666 }
2667 }
2668 }
2669
2670 return res;
2671 }
2672
2673 static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
2674 std::vector<clip_image_u8_ptr> output;
2675
2676 // resize to overview size
2677 clip_image_u8_ptr resized_img(clip_image_u8_init());
2678 img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
2679 inst.padding_overview, inst.pad_color_overview);
2680 output.push_back(std::move(resized_img));
2681
2682 if (inst.slices.empty()) {
2683 // no slices, just return the resized image
2684 return output;
2685 }
2686
2687 // resize to refined size
2688 clip_image_u8_ptr refined_img(clip_image_u8_init());
2689 img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
2690 inst.padding_refined, inst.pad_color_refined);
2691
2692 // create slices
2693 for (const auto & slice : inst.slices) {
2694 int x = slice.x;
2695 int y = slice.y;
2696 int w = slice.size.width;
2697 int h = slice.size.height;
2698
2699 clip_image_u8_ptr img_slice(clip_image_u8_init());
2700 img_tool::crop(*refined_img, *img_slice, x, y, w, h);
2701 output.push_back(std::move(img_slice));
2702 }
2703
2704 return output;
2705 }
2706
2707private:
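// scale the image so that its area is roughly scale_resolution^2 while preserving the aspect
// ratio r = W/H: height = scale_resolution / sqrt(r) and width = height * r, hence
// width * height ~= scale_resolution^2; both sides are then rounded to multiples of patch_size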
2708 static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
2709 int width = original_size.width;
2710 int height = original_size.height;
2711 if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
2712 float r = static_cast<float>(width) / height;
2713 height = static_cast<int>(scale_resolution / std::sqrt(r));
2714 width = static_cast<int>(height * r);
2715 }
2716 clip_image_size res;
2717 res.width = ensure_divide(width, patch_size);
2718 res.height = ensure_divide(height, patch_size);
2719 return res;
2720 }
2721
2722 static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
2723 float scale_width = static_cast<float>(target_max.width) / orig.width;
2724 float scale_height = static_cast<float>(target_max.height) / orig.height;
2725 float scale = std::min(scale_width, scale_height);
2726 return clip_image_size{
2727 static_cast<int>(orig.width * scale),
2728 static_cast<int>(orig.height * scale),
2729 };
2730 }
2731
2732 /**
2733 * Selects the best resolution from a list of possible resolutions based on the original size.
2734 *
2735 * For example, when given a list of resolutions:
2736 * - 100x100
2737 * - 200x100
2738 * - 100x200
2739 * - 200x200
2740 *
2741 * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
2742 *
2743 * @param original_size The original size of the image
2744 * @param possible_resolutions A list of possible resolutions
2745 * @return The best fit resolution
2746 */
2747 static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
2748 clip_image_size best_fit;
2749 int min_wasted_area = std::numeric_limits<int>::max();
2750 int max_effective_resolution = 0;
2751
2752 for (const clip_image_size & candidate : possible_resolutions) {
2753 auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
2754 int effective_resolution = std::min(
2755 target_size.width * target_size.height,
2756 original_size.width * original_size.height);
2757 int wasted_area = (candidate.width * candidate.height) - effective_resolution;
2758
2759 if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
2760 max_effective_resolution = effective_resolution;
2761 min_wasted_area = wasted_area;
2762 best_fit = candidate;
2763 }
2764
2765 LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
2766 }
2767
2768 return best_fit;
2769 }
2770
2771 static int ensure_divide(int length, int patch_size) {
2772 return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
2773 }
2774
2775 static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
2776 int width = original_size.width;
2777 int height = original_size.height;
2778 int grid_x = grid.width;
2779 int grid_y = grid.height;
2780
2781 int refine_width = ensure_divide(width, grid_x);
2782 int refine_height = ensure_divide(height, grid_y);
2783
2784 clip_image_size grid_size;
2785 grid_size.width = refine_width / grid_x;
2786 grid_size.height = refine_height / grid_y;
2787
2788 auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
2789 int best_grid_width = best_grid_size.width;
2790 int best_grid_height = best_grid_size.height;
2791
2792 clip_image_size refine_size;
2793 refine_size.width = best_grid_width * grid_x;
2794 refine_size.height = best_grid_height * grid_y;
2795 return refine_size;
2796 }
2797
2798 static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
2799 std::vector<int> candidate_split_grids_nums;
2800 for (int i : {multiple - 1, multiple, multiple + 1}) {
2801 if (i == 1 || i > max_slice_nums) {
2802 continue;
2803 }
2804 candidate_split_grids_nums.push_back(i);
2805 }
2806
2807 std::vector<clip_image_size> candidate_grids;
2808 for (int split_grids_nums : candidate_split_grids_nums) {
2809 int m = 1;
2810 while (m <= split_grids_nums) {
2811 if (split_grids_nums % m == 0) {
2812 candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
2813 }
2814 ++m;
2815 }
2816 }
2817
2818 clip_image_size best_grid{1, 1};
2819 float min_error = std::numeric_limits<float>::infinity();
2820 for (const auto& grid : candidate_grids) {
2821 float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
2822 if (error < min_error) {
2823 best_grid = grid;
2824 min_error = error;
2825 }
2826 }
2827 return best_grid;
2828 }
2829};
2830
2831// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
2832// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout)
2833struct lfm2_vl_image_processor {
2834 // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
2835 static constexpr int min_tiles = 2;
2836 static constexpr int max_tiles = 10;
2837 static constexpr float max_pixels_tolerance = 2.0f;
2838 static constexpr int tile_size = 512;
2839
2840 static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
2841 llava_uhd::slice_instructions inst;
2842 const auto & params = ctx->model.hparams;
2843 const int align_size = params.patch_size * params.n_merge;
2844
2845 inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
2846 inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR;
2847 inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels);
2848
2849 // tile if either dimension exceeds tile_size with tolerance
2850 const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;
2851
2852 if (!needs_tiling) {
2853 inst.refined_size = clip_image_size{0, 0};
2854 inst.grid_size = clip_image_size{0, 0};
2855 return inst;
2856 }
2857
2858 const clip_image_size grid = get_grid_layout(original_size.height, original_size.width);
2859
2860 inst.grid_size = grid;
2861 inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height};
2862
2863 LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
2864 __func__,
2865 original_size.width, original_size.height,
2866 inst.overview_size.width, inst.overview_size.height,
2867 inst.refined_size.width, inst.refined_size.height,
2868 grid.width, grid.height);
2869
2870 for (int row = 0; row < grid.height; row++) {
2871 for (int col = 0; col < grid.width; col++) {
2872 llava_uhd::slice_coordinates slice;
2873 slice.x = col * tile_size;
2874 slice.y = row * tile_size;
2875 slice.size = clip_image_size{tile_size, tile_size};
2876 inst.slices.push_back(slice);
2877 LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n",
2878 __func__, (int)inst.slices.size() - 1,
2879 slice.x, slice.y, slice.size.width, slice.size.height);
2880 }
2881 }
2882
2883 return inst;
2884 }
2885
2886private:
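    // among the candidate grids, minimize the aspect-ratio error; on a tie, prefer the
    // larger grid when the image covers more than half of that grid's pixel budget
    // (tile_size^2 * w * h), so big images get more tiles and small ones fewer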
2887 static clip_image_size find_closest_aspect_ratio(
2888 float aspect_ratio,
2889 const std::vector<clip_image_size> & target_ratios,
2890 int width, int height) {
2891 float best_ratio_diff = std::numeric_limits<float>::max();
2892 clip_image_size best_ratio = {1, 1};
2893 const float area = static_cast<float>(width * height);
2894
2895 for (const auto & ratio : target_ratios) {
2896 const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
2897 const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
2898 if (ratio_diff < best_ratio_diff) {
2899 best_ratio_diff = ratio_diff;
2900 best_ratio = ratio;
2901 } else if (ratio_diff == best_ratio_diff) {
2902 const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
2903 if (area > 0.5f * target_area) {
2904 best_ratio = ratio;
2905 }
2906 }
2907 }
2908 return best_ratio;
2909 }
2910
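    // enumerate every w x h grid with min_tiles <= w*h <= max_tiles (deduplicated),
    // sorted by tile count; e.g. with min_tiles = 2 and max_tiles = 10 the candidates
    // range from 1x2 and 2x1 up to 10-tile grids such as 2x5 and 10x1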
2911 static std::vector<clip_image_size> get_target_ratios() {
2912 std::vector<clip_image_size> ratios;
2913 for (int n = min_tiles; n <= max_tiles; n++) {
2914 for (int w = 1; w <= n; w++) {
2915 for (int h = 1; h <= n; h++) {
2916 if (w * h >= min_tiles && w * h <= max_tiles) {
2917 bool found = false;
2918 for (const auto & r : ratios) {
2919 if (r.width == w && r.height == h) {
2920 found = true;
2921 break;
2922 }
2923 }
2924 if (!found) {
2925 ratios.push_back({w, h});
2926 }
2927 }
2928 }
2929 }
2930 }
2931 std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
2932 return a.width * a.height < b.width * b.height;
2933 });
2934 return ratios;
2935 }
2936
2937 static clip_image_size get_grid_layout(int height, int width) {
2938 const float aspect_ratio = static_cast<float>(width) / height;
2939 const auto ratios = get_target_ratios();
2940 return find_closest_aspect_ratio(aspect_ratio, ratios, width, height);
2941 }
2942};
2943
// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6), it returns the normalized image patch tensors as a vector
// res_imgs memory is allocated here; previous allocations will be freed if found
2946bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
2947 clip_image_size original_size{img->nx, img->ny};
2948 auto & params = ctx->model.hparams;
2949
2950 switch (ctx->proj_type()) {
2951 case PROJECTOR_TYPE_MINICPMV:
2952 {
2953 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
2954 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
2955
2956 for (size_t i = 0; i < imgs.size(); ++i) {
2957 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
2958 clip_image_f32_ptr res(clip_image_f32_init());
2959 normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
2960 res_imgs->entries.push_back(std::move(res));
2961 }
2962
2963 res_imgs->grid_x = inst.grid_size.width;
2964 res_imgs->grid_y = inst.grid_size.height;
2965 } break;
2966
2967 case PROJECTOR_TYPE_QWEN2VL:
2968 case PROJECTOR_TYPE_QWEN25VL:
2969 case PROJECTOR_TYPE_QWEN3VL:
2970 case PROJECTOR_TYPE_GLM4V:
2971 {
2972 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
2973 clip_image_u8 resized;
2974 const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
2975 original_size,
2976 params.patch_size * 2,
2977 params.image_min_pixels,
2978 params.image_max_pixels);
2979 img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
2980 // clip_image_save_to_bmp(resized, "preproc.bmp");
2981 clip_image_f32_ptr img_f32(clip_image_f32_init());
2982 // clip_image_f32_ptr res(clip_image_f32_init());
2983 normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
2984 // res_imgs->data[0] = *res;
2985 res_imgs->entries.push_back(std::move(img_f32));
2986 } break;
2987 case PROJECTOR_TYPE_YOUTUVL:
2988 {
2989 const int patch_size = params.patch_size; // typically 16
2990 const int merge_size = params.n_merge; // typically 2
2991 const int align_size = patch_size * merge_size; // 32
2992
2993 const int max_num_patches = params.image_max_pixels > 0 ?
2994 params.image_max_pixels / (patch_size * patch_size) : 256;
2995
2996 // Linear search for optimal scale to fit within max_num_patches
2997 float scale = 1.0f;
2998 int target_height = original_size.height;
2999 int target_width = original_size.width;
3000
3001 auto get_scaled_image_size = [align_size](float scale, int size) -> int {
3002 float scaled_size = size * scale;
3003 // Round up to nearest multiple of align_size
3004 int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
3005 // Ensure at least one patch
3006 return std::max(align_size, aligned);
3007 };
3008
3009 // Linear search with 0.02 step size
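                // e.g. a 2048 x 2048 image with patch_size 16 and max_num_patches 256:
                // scale shrinks from 1.0 until roughly 0.12, where the aligned size is
                // 256 x 256, i.e. 16 x 16 = 256 patches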
3010 while (scale > 0.0f) {
3011 target_height = get_scaled_image_size(scale, original_size.height);
3012 target_width = get_scaled_image_size(scale, original_size.width);
3013
3014 int num_patches_h = target_height / patch_size;
3015 int num_patches_w = target_width / patch_size;
3016 int num_patches = num_patches_h * num_patches_w;
3017
3018 if (num_patches > max_num_patches) {
3019 scale -= 0.02f;
3020 } else {
3021 break;
3022 }
3023 }
3024
3025 clip_image_size new_size = {target_width, target_height};
3026
3027 // Resize the image
3028 clip_image_u8 resized;
3029 img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
3030
3031 // Normalize to float32
3032 clip_image_f32_ptr img_f32(clip_image_f32_init());
3033 normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
3034
3035 // Add to results
3036 res_imgs->entries.push_back(std::move(img_f32));
3037 } break;
3038
3039 case PROJECTOR_TYPE_IDEFICS3:
3040 {
3041 // The refined size has two steps:
3042 // 1. Resize w/ aspect-ratio preserving such that the longer side is
3043 // the preprocessor longest size
3044 // 2. Resize w/out preserving aspect ratio such that both sides are
3045 // multiples of image_size (always rounding up)
3046 //
3047 // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
3048 const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
3049 original_size, params.image_size, params.image_longest_edge);
3050 // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
3051 // __func__, original_size.width, original_size.height,
3052 // refined_size.width, refined_size.height);
3053
3054 llava_uhd::slice_instructions instructions;
3055 instructions.overview_size = clip_image_size{params.image_size, params.image_size};
3056 instructions.refined_size = refined_size;
3057 instructions.grid_size = clip_image_size{
3058 static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
3059 static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
3060 };
3061 for (int y = 0; y < refined_size.height; y += params.image_size) {
3062 for (int x = 0; x < refined_size.width; x += params.image_size) {
3063 // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
3064 instructions.slices.push_back(llava_uhd::slice_coordinates{
3065 /* x */x,
3066 /* y */y,
3067 /* size */clip_image_size{
3068 std::min(params.image_size, refined_size.width - x),
3069 std::min(params.image_size, refined_size.height - y)
3070 }
3071 });
3072 }
3073 }
3074 auto imgs = llava_uhd::slice_image(img, instructions);
3075
3076 // cast and normalize to f32
3077 for (size_t i = 0; i < imgs.size(); ++i) {
3078 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3079 clip_image_f32_ptr res(clip_image_f32_init());
3080 normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3081 res_imgs->entries.push_back(std::move(res));
3082 }
3083
3084 res_imgs->grid_x = instructions.grid_size.width;
3085 res_imgs->grid_y = instructions.grid_size.height;
3086 } break;
3087
3088 case PROJECTOR_TYPE_GLM_EDGE:
3089 case PROJECTOR_TYPE_GEMMA3:
3090 case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
3091 {
3092 clip_image_u8 resized_image;
3093 int sz = params.image_size;
3094 img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
3095 clip_image_f32_ptr img_f32(clip_image_f32_init());
3096 //clip_image_save_to_bmp(resized_image, "resized.bmp");
3097 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
3098 res_imgs->entries.push_back(std::move(img_f32));
3099 } break;
3100
3101 case PROJECTOR_TYPE_GEMMA3NV:
3102 {
3103 clip_image_u8 resized_image;
3104 int sz = params.image_size;
3105 img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
3106 clip_image_f32_ptr img_f32(clip_image_f32_init());
3107 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
3108 res_imgs->entries.push_back(std::move(img_f32));
3109 } break;
3110
3111 case PROJECTOR_TYPE_JANUS_PRO:
3112 {
3113 // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
3114 const std::array<uint8_t, 3> pad_color = {127, 127, 127};
3115 clip_image_u8 resized_image;
3116 int sz = params.image_size;
3117 img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
3118 clip_image_f32_ptr img_f32(clip_image_f32_init());
3119 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
3120 res_imgs->entries.push_back(std::move(img_f32));
3121 } break;
3122
3123 case PROJECTOR_TYPE_PIXTRAL:
3124 case PROJECTOR_TYPE_LIGHTONOCR:
3125 {
3126 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
3127 clip_image_u8 resized_image;
3128 // the original pixtral model doesn't have n_merge
3129 const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
3130 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
3131 original_size,
3132 params.patch_size * cur_merge,
3133 params.image_min_pixels,
3134 params.image_max_pixels);
3135 img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
3136 clip_image_f32_ptr img_f32(clip_image_f32_init());
3137 normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
3138 res_imgs->entries.push_back(std::move(img_f32));
3139 } break;
3140
3141 case PROJECTOR_TYPE_LLAMA4:
3142 {
3143 GGML_ASSERT(!params.image_res_candidates.empty());
3144 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3145 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3146
3147 for (size_t i = 0; i < imgs.size(); ++i) {
3148 clip_image_f32_ptr res(clip_image_f32_init());
3149 normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3150 res_imgs->entries.push_back(std::move(res));
3151 }
3152
3153 res_imgs->grid_x = inst.grid_size.width;
3154 res_imgs->grid_y = inst.grid_size.height;
3155 } break;
3156
3157 case PROJECTOR_TYPE_LFM2:
3158 {
3159 auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size);
3160 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3161
3162 for (size_t i = 0; i < imgs.size(); ++i) {
3163 clip_image_f32_ptr res(clip_image_f32_init());
3164 normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3165 res_imgs->entries.push_back(std::move(res));
3166 }
3167
3168 res_imgs->grid_x = inst.grid_size.width;
3169 res_imgs->grid_y = inst.grid_size.height;
3170 } break;
3171
3172 case PROJECTOR_TYPE_KIMIVL:
3173 {
3174 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
3175 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
3176 original_size,
3177 params.patch_size * params.n_merge,
3178 params.image_min_pixels,
3179 params.image_max_pixels);
3180 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
3181
3182 clip_image_u8 resized_img;
3183 img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
3184 clip_image_f32_ptr res(clip_image_f32_init());
3185 normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
3186 res_imgs->entries.push_back(std::move(res));
3187 } break;
3188
3189 case PROJECTOR_TYPE_KIMIK25:
3190 {
3191 GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
3192 const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
3193 original_size,
3194 params.patch_size * params.n_merge,
3195 params.image_min_pixels,
3196 params.image_max_pixels);
3197 const std::array<uint8_t, 3> pad_color = {0, 0, 0};
3198
3199 clip_image_u8 resized_img;
3200 img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
3201 clip_image_f32_ptr res(clip_image_f32_init());
3202 normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
3203 res_imgs->entries.push_back(std::move(res));
3204 } break;
3205
3206 case PROJECTOR_TYPE_MLP:
3207 case PROJECTOR_TYPE_MLP_NORM:
3208 case PROJECTOR_TYPE_LDP:
3209 case PROJECTOR_TYPE_LDPV2:
3210 case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
3211 {
3212 // TODO @ngxson : refactor the code below to avoid duplicated logic
3213
3214 // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
3215 // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
3216
3217 clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
3218
                // The model config contains all we need to decide how to preprocess;
                // here we automatically switch to the new llava-1.6 preprocessing when
                // resolution candidates are present
                if (params.image_res_candidates.empty()) { // pad_to_square
                    // for llava-1.5, we resize the image to a square and pad the shorter
                    // side with the background color (see the LLaVA reference above)
3223 const int longer_side = std::max(img->nx, img->ny);
3224 temp->nx = longer_side;
3225 temp->ny = longer_side;
3226 temp->buf.resize(3 * longer_side * longer_side);
3227
3228 // background color in RGB from LLaVA (this is the mean rgb color * 255)
3229 const std::array<uint8_t, 3> pad_color = {122, 116, 104};
3230
3231 // resize the image to the target_size
3232 img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
3233
3234 clip_image_f32_ptr res(clip_image_f32_init());
3235 normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
3236 res_imgs->entries.push_back(std::move(res));
3237
3238 } else {
3239 // "spatial_unpad" with "anyres" processing for llava-1.6
3240 auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
3241 std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
3242
3243 for (size_t i = 0; i < imgs.size(); ++i) {
3244 // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
3245 clip_image_f32_ptr res(clip_image_f32_init());
3246 normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
3247 res_imgs->entries.push_back(std::move(res));
3248 }
3249 }
3250 } break;
3251
3252 default:
3253 LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
3254 return false;
3255 }
3256
3257 return true;
3258}
3259
3260ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
3261 return ctx->model.image_newline;
3262}
3263
3264void clip_free(clip_ctx * ctx) {
3265 if (ctx == nullptr) {
3266 return;
3267 }
3268 delete ctx;
3269}
3270
3271// deprecated
3272size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
3273 const int32_t nx = ctx->model.hparams.image_size;
3274 const int32_t ny = ctx->model.hparams.image_size;
3275 return clip_embd_nbytes_by_img(ctx, nx, ny);
3276}
3277
3278size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
3279 clip_image_f32 img;
3280 img.nx = img_w;
3281 img.ny = img_h;
3282 return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
3283}
3284
3285int32_t clip_get_image_size(const struct clip_ctx * ctx) {
3286 return ctx->model.hparams.image_size;
3287}
3288
3289int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
3290 return ctx->model.hparams.patch_size;
3291}
3292
3293int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
3294 return ctx->model.hparams.n_embd;
3295}
3296
3297const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
3298 return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
3299}
3300
3301int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3302 const auto & params = ctx->model.hparams;
3303 const int n_total = clip_n_output_tokens(ctx, img);
3304 const auto & proj = ctx->proj_type();
3305 switch (proj) {
3306 case PROJECTOR_TYPE_QWEN2VL:
3307 case PROJECTOR_TYPE_QWEN25VL:
3308 case PROJECTOR_TYPE_QWEN3VL:
3309 case PROJECTOR_TYPE_GLM4V:
3310 case PROJECTOR_TYPE_YOUTUVL:
3311 return (img->nx / params.patch_size) / 2;
3312 default:
3313 break;
3314 }
3315 return n_total;
3316}
3317
3318int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3319 const auto & params = ctx->model.hparams;
3320 const auto & proj = ctx->proj_type();
3321 switch (proj) {
3322 case PROJECTOR_TYPE_QWEN2VL:
3323 case PROJECTOR_TYPE_QWEN25VL:
3324 case PROJECTOR_TYPE_QWEN3VL:
3325 case PROJECTOR_TYPE_GLM4V:
3326 case PROJECTOR_TYPE_YOUTUVL:
3327 return (img->ny / params.patch_size) / 2;
3328 default:
3329 break;
3330 }
3331 return 1;
3332}
3333
3334int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3335 const auto & params = ctx->model.hparams;
3336
    // for models with a fixed image size, the input image has already been pre-processed and resized to a square
3338 int patch_size = params.patch_size;
3339 int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
3340
3341 projector_type proj = ctx->proj_type();
3342
3343 switch (proj) {
3344 case PROJECTOR_TYPE_MLP:
3345 case PROJECTOR_TYPE_MLP_NORM:
3346 case PROJECTOR_TYPE_JANUS_PRO:
3347 {
3348 // do nothing
3349 } break;
3350 case PROJECTOR_TYPE_LDP:
3351 case PROJECTOR_TYPE_LDPV2:
3352 case PROJECTOR_TYPE_GLM_EDGE:
3353 {
3354 n_patches /= 4;
3355 if (ctx->model.mm_boi) {
3356 n_patches += 2; // for BOI and EOI token embeddings
3357 }
3358 } break;
3359 case PROJECTOR_TYPE_MINICPMV:
3360 {
3361 // Use actual config value if available, otherwise fall back to hardcoded values
3362 if (params.minicpmv_query_num > 0) {
3363 n_patches = params.minicpmv_query_num;
3364 } else {
3365 // Fallback to hardcoded values for legacy models
3366 if (params.minicpmv_version == 2) {
3367 n_patches = 96;
3368 } else if (params.minicpmv_version == 3) {
3369 n_patches = 64;
3370 } else if (params.minicpmv_version == 4) {
3371 n_patches = 64;
3372 } else if (params.minicpmv_version == 5) {
3373 // MiniCPM-V 4.0
3374 n_patches = 64;
3375 } else if (params.minicpmv_version == 6) {
3376 // MiniCPM-V 4.5
3377 n_patches = 64;
3378 } else if (params.minicpmv_version == 100045) {
3379 // MiniCPM-o 4.5
3380 n_patches = 64;
3381 } else {
3382 GGML_ABORT("Unknown minicpmv version");
3383 }
3384 }
3385 } break;
3386 case PROJECTOR_TYPE_QWEN2VL:
3387 case PROJECTOR_TYPE_QWEN25VL:
3388 case PROJECTOR_TYPE_QWEN3VL:
3389 case PROJECTOR_TYPE_GLM4V:
3390 case PROJECTOR_TYPE_YOUTUVL:
3391 {
                // dynamic size (patches are merged 2x2, so the effective patch size is doubled)
3393 int x_patch = img->nx / (params.patch_size * 2);
3394 int y_patch = img->ny / (params.patch_size * 2);
3395 n_patches = x_patch * y_patch;
3396 } break;
3397 case PROJECTOR_TYPE_GEMMA3:
3398 case PROJECTOR_TYPE_IDEFICS3:
3399 case PROJECTOR_TYPE_INTERNVL:
3400 case PROJECTOR_TYPE_LLAMA4:
3401 {
3402 // both X and Y are downscaled by the scale factor
3403 int scale_factor = ctx->model.hparams.n_merge;
3404 n_patches /= (scale_factor * scale_factor);
3405 } break;
3406 case PROJECTOR_TYPE_GEMMA3NV:
3407 {
3408 // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution
3409 // regardless of input size (see architecture description)
3410 n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
3411 } break;
3412 case PROJECTOR_TYPE_LFM2:
3413 case PROJECTOR_TYPE_KIMIVL:
3414 case PROJECTOR_TYPE_KIMIK25:
3415 {
3416 // dynamic size
3417 int out_patch_size = params.patch_size * ctx->model.hparams.n_merge;
3418 int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
3419 int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
3420 n_patches = x_patch * y_patch;
3421 } break;
3422 case PROJECTOR_TYPE_PIXTRAL:
3423 case PROJECTOR_TYPE_LIGHTONOCR:
3424 {
3425 // dynamic size
3426 int n_merge = ctx->model.hparams.n_merge;
3427 int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
3428 int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
3429 if (ctx->model.token_embd_img_break) {
3430 n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
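                    // e.g. 3 rows x 4 cols -> 12 patch tokens + 2 [IMG_BREAK] tokens = 14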
3431 } else {
3432 n_patches = n_patches_y * n_patches_x;
3433 }
3434 } break;
3435 case PROJECTOR_TYPE_VOXTRAL:
3436 case PROJECTOR_TYPE_ULTRAVOX:
3437 case PROJECTOR_TYPE_QWEN2A:
3438 case PROJECTOR_TYPE_MUSIC_FLAMINGO:
3439 {
3440 n_patches = img->nx;
3441
3442 const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
3443 if (ctx->model.audio_has_stack_frames()) {
3444 GGML_ASSERT(proj_stack_factor > 0);
3445 const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
3446 n_patches = n_len / proj_stack_factor;
3447 }
3448
                // whisper downscales the input tokens by half after the conv1d
3450 n_patches /= 2;
3451
3452 if (ctx->model.audio_has_avgpool()) {
3453 // divide by 2 because of nn.AvgPool1d(2, stride=2)
3454 n_patches /= 2;
3455 }
3456 } break;
3457 case PROJECTOR_TYPE_GLMA:
3458 {
3459 n_patches = img->nx;
                // whisper downscales the input tokens by half after the conv1d
3461 n_patches /= 2;
3462 // reshape by merge_factor
3463 n_patches /= ctx->model.hparams.proj_stack_factor;
3464 // for BOI and EOI token embeddings
3465 n_patches += 2;
3466 } break;
3467 case PROJECTOR_TYPE_COGVLM:
3468 {
3469 n_patches += 2; // for BOI and EOI token embeddings
3470 } break;
3471 case PROJECTOR_TYPE_LFM2A:
3472 {
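                // three successive ceil-halvings of the number of input frames,
                // i.e. ceil(ceil(ceil(nx/2)/2)/2); e.g. nx = 3000 -> 1500 -> 750 -> 375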
3473 n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
3474 } break;
3475 default:
3476 GGML_ABORT("unsupported projector type");
3477 }
3478
3479 return n_patches;
3480}
3481
3482bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
3483 clip_image_f32_batch imgs;
3484 clip_image_f32_ptr img_copy(clip_image_f32_init());
3485 *img_copy = *img;
3486 imgs.entries.push_back(std::move(img_copy));
3487
3488 return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
3489}
3490
3491bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
3492 const clip_image_f32_batch & imgs = *imgs_c_ptr;
3493 int batch_size = imgs.entries.size();
3494
3495 // TODO @ngxson : implement batch size > 1 as a loop
    // we don't need true batching support because the cgraph is going to be big anyway
3497 if (batch_size != 1) {
3498 return false; // only support batch size of 1
3499 }
3500
3501 // if buffers are not allocated, we need to do a warmup run to allocate them
3502 if (!ctx->is_allocated) {
3503 clip_model_loader::warmup(*ctx, *imgs_c_ptr);
3504 }
3505
3506 // build the inference graph
3507 ggml_backend_sched_reset(ctx->sched.get());
3508 ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
3509 ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
3510
3511 // set inputs
3512 const auto & model = ctx->model;
3513 const auto & hparams = model.hparams;
3514
3515 const int image_size_width = imgs.entries[0]->nx;
3516 const int image_size_height = imgs.entries[0]->ny;
3517
3518 const int patch_size = hparams.patch_size;
3519 const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
3520 const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
3521 const int pos_w = image_size_width / patch_size;
3522 const int pos_h = image_size_height / patch_size;
3523
3524
3525 auto get_inp_tensor = [&gf](const char * name) {
3526 ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
3527 if (inp == nullptr) {
3528 GGML_ABORT("Failed to get tensor %s", name);
3529 }
3530 if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
3531 GGML_ABORT("Tensor %s is not an input tensor", name);
3532 }
3533 return inp;
3534 };
3535
3536 auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
3537 ggml_tensor * cur = get_inp_tensor(name);
3538 GGML_ASSERT(cur->type == GGML_TYPE_F32);
3539 GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
3540 ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
3541 };
3542
3543 auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
3544 ggml_tensor * cur = get_inp_tensor(name);
3545 GGML_ASSERT(cur->type == GGML_TYPE_I32);
3546 GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
3547 ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
3548 };
3549
3550 // set input pixel values
3551 if (!imgs.is_audio) {
3552 size_t nelem = 0;
3553 for (const auto & img : imgs.entries) {
3554 nelem += img->nx * img->ny * 3;
3555 }
3556 std::vector<float> inp_raw(nelem);
3557
3558 // layout of data (note: the channel dim is unrolled to better visualize the layout):
3559 //
        // ┌──W──┐
        // │  H  │  channel = R
        // ├─────┤    │
        // │  H  │  channel = G
        // ├─────┤    │
        // │  H  │  channel = B
        // └─────┘    │
        //  ─────────── x B
3568
3569 for (size_t i = 0; i < imgs.entries.size(); i++) {
3570 const int nx = imgs.entries[i]->nx;
3571 const int ny = imgs.entries[i]->ny;
3572 const int n = nx * ny;
3573
3574 for (int b = 0; b < batch_size; b++) {
3575 float * batch_entry = inp_raw.data() + b * (3*n);
3576 for (int y = 0; y < ny; y++) {
3577 for (int x = 0; x < nx; x++) {
3578 size_t base_src = 3*(y * nx + x); // idx of the first channel
3579 size_t base_dst = y * nx + x; // idx of the first channel
3580 batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
3581 batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
3582 batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
3583 }
3584 }
3585 }
3586 }
3587 set_input_f32("inp_raw", inp_raw);
3588
3589 } else {
3590 // audio input
3591 GGML_ASSERT(imgs.entries.size() == 1);
3592 const auto & mel_inp = imgs.entries[0];
3593 const int n_step = mel_inp->nx;
3594 const int n_mel = mel_inp->ny;
3595 std::vector<float> inp_raw(n_step * n_mel);
3596 std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
3597 set_input_f32("inp_raw", inp_raw);
3598 }
3599
3600 // set input per projector
3601 switch (ctx->model.proj_type) {
3602 case PROJECTOR_TYPE_MINICPMV:
3603 {
3604 // inspired from siglip:
3605 // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
3606 // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
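                // each patch coordinate is rescaled into a fixed 70 x 70 bucket grid
                // (the size of the pretrained position table) and flattened as h * 70 + w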
3607 std::vector<int32_t> positions(pos_h * pos_w);
3608 int bucket_coords_h[1024];
3609 int bucket_coords_w[1024];
3610 for (int i = 0; i < pos_h; i++){
3611 bucket_coords_h[i] = std::floor(70.0*i/pos_h);
3612 }
3613 for (int i = 0; i < pos_w; i++){
3614 bucket_coords_w[i] = std::floor(70.0*i/pos_w);
3615 }
3616 for (int i = 0, id = 0; i < pos_h; i++){
3617 for (int j = 0; j < pos_w; j++){
3618 positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
3619 }
3620 }
3621 set_input_i32("positions", positions);
3622
3623 // inputs for resampler projector
3624 // set the 2D positions (using float for sinusoidal embedding)
3625 int n_patches_per_col = image_size_width / patch_size;
3626 std::vector<float> pos_data(n_pos);
3627 // dimension H
3628 for (int i = 0; i < n_pos; i++) {
3629 pos_data[i] = static_cast<float>(i / n_patches_per_col);
3630 }
3631 set_input_f32("pos_h", pos_data);
3632 // dimension W
3633 for (int i = 0; i < n_pos; i++) {
3634 pos_data[i] = static_cast<float>(i % n_patches_per_col);
3635 }
3636 set_input_f32("pos_w", pos_data);
3637 // base frequency omega
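                // omega[i] = 1 / base_freq^(i / (n_embd_proj/4)): the standard sinusoidal
                // frequency schedule, with n_embd_proj/4 frequencies shared by the H and W axes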
3638 const float base_freq = 10000.0f;
3639 const int n_embd_proj = clip_n_mmproj_embd(ctx);
3640 std::vector<float> omega(n_embd_proj / 4);
3641 for (int i = 0; i < n_embd_proj / 4; ++i) {
3642 omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
3643 }
3644 set_input_f32("omega", omega);
3645 } break;
3646 case PROJECTOR_TYPE_QWEN2VL:
3647 case PROJECTOR_TYPE_QWEN3VL:
3648 case PROJECTOR_TYPE_GLM4V:
3649 {
3650 const int merge_ratio = hparams.n_merge;
3651 const int pw = image_size_width / patch_size;
3652 const int ph = image_size_height / patch_size;
3653 std::vector<int> positions(n_pos * 4);
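                // four position sections of length num_patches, laid out as [y, x, y, x]
                // per token, enumerated in 2x2 merge blocks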
3654 int ptr = 0;
3655 for (int y = 0; y < ph; y += merge_ratio) {
3656 for (int x = 0; x < pw; x += merge_ratio) {
3657 for (int dy = 0; dy < 2; dy++) {
3658 for (int dx = 0; dx < 2; dx++) {
3659 positions[ ptr] = y + dy;
3660 positions[ num_patches + ptr] = x + dx;
3661 positions[2 * num_patches + ptr] = y + dy;
3662 positions[3 * num_patches + ptr] = x + dx;
3663 ptr++;
3664 }
3665 }
3666 }
3667 }
3668
3669 set_input_i32("positions", positions);
3670 } break;
3671 case PROJECTOR_TYPE_QWEN25VL:
3672 case PROJECTOR_TYPE_YOUTUVL:
3673 {
                // pw * ph   = number of tokens output by the ViT after applying the patch merger
                // ipw * iph = number of vision tokens processed inside the ViT
3676 const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
3677 const int merge_ratio = 2;
3678 const int pw = image_size_width / patch_size / merge_ratio;
3679 const int ph = image_size_height / patch_size / merge_ratio;
3680 const int ipw = image_size_width / patch_size;
3681 const int iph = image_size_height / patch_size;
3682
3683 std::vector<int> idx (ph * pw);
3684 std::vector<int> inv_idx(ph * pw);
3685
3686 if (use_window_attn) {
3687 const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
3688 const int grid_window = attn_window_size / patch_size / merge_ratio;
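                    // e.g. attn_window_size = 112, patch_size = 14, merge_ratio = 2
                    // -> grid_window = 4 merged tokens per window side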
3689 int dst = 0;
3690 // [num_vision_tokens, num_vision_tokens] attention mask tensor
                    std::vector<float> mask((size_t)(ipw * iph) * (ipw * iph), std::numeric_limits<float>::lowest());
3692 int mask_row = 0;
3693
3694 for (int y = 0; y < ph; y += grid_window) {
3695 for (int x = 0; x < pw; x += grid_window) {
3696 const int win_h = std::min(grid_window, ph - y);
3697 const int win_w = std::min(grid_window, pw - x);
3698 const int dst_0 = dst;
                        // group all tokens belonging to the same window together (into a contiguous range)
3700 for (int dy = 0; dy < win_h; dy++) {
3701 for (int dx = 0; dx < win_w; dx++) {
3702 const int src = (y + dy) * pw + (x + dx);
3703 GGML_ASSERT(src < (int)idx.size());
3704 GGML_ASSERT(dst < (int)inv_idx.size());
3705 idx [src] = dst;
3706 inv_idx[dst] = src;
3707 dst++;
3708 }
3709 }
3710
3711 for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
3712 int row_offset = mask_row * (ipw * iph);
3713 std::fill(
3714 mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
3715 mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
3716 0.0);
3717 mask_row++;
3718 }
3719 }
3720 }
3721
3722 set_input_i32("window_idx", idx);
3723 set_input_i32("inv_window_idx", inv_idx);
3724 set_input_f32("window_mask", mask);
3725 } else {
3726 for (int i = 0; i < ph * pw; i++) {
3727 idx[i] = i;
3728 }
3729 }
3730
3731 const int mpow = merge_ratio * merge_ratio;
3732 std::vector<int> positions(n_pos * 4);
3733
3734 int ptr = 0;
3735 for (int y = 0; y < iph; y += merge_ratio) {
3736 for (int x = 0; x < ipw; x += merge_ratio) {
3737 for (int dy = 0; dy < 2; dy++) {
3738 for (int dx = 0; dx < 2; dx++) {
3739 auto remap = idx[ptr / mpow];
3740 remap = (remap * mpow) + (ptr % mpow);
3741
3742 positions[ remap] = y + dy;
3743 positions[ num_patches + remap] = x + dx;
3744 positions[2 * num_patches + remap] = y + dy;
3745 positions[3 * num_patches + remap] = x + dx;
3746 ptr++;
3747 }
3748 }
3749 }
3750 }
3751
3752 set_input_i32("positions", positions);
3753 } break;
3754 case PROJECTOR_TYPE_PIXTRAL:
3755 case PROJECTOR_TYPE_KIMIVL:
3756 case PROJECTOR_TYPE_KIMIK25:
3757 case PROJECTOR_TYPE_LIGHTONOCR:
3758 {
3759 // set the 2D positions
3760 int n_patches_per_col = image_size_width / patch_size;
3761 std::vector<int> pos_data(n_pos);
3762 // dimension H
3763 for (int i = 0; i < n_pos; i++) {
3764 pos_data[i] = i / n_patches_per_col;
3765 }
3766 set_input_i32("pos_h", pos_data);
3767 // dimension W
3768 for (int i = 0; i < n_pos; i++) {
3769 pos_data[i] = i % n_patches_per_col;
3770 }
3771 set_input_i32("pos_w", pos_data);
3772 } break;
3773 case PROJECTOR_TYPE_GLM_EDGE:
3774 {
3775 // llava and other models
3776 std::vector<int32_t> positions(n_pos);
3777 for (int i = 0; i < n_pos; i++) {
3778 positions[i] = i;
3779 }
3780 set_input_i32("positions", positions);
3781 } break;
3782 case PROJECTOR_TYPE_MLP:
3783 case PROJECTOR_TYPE_MLP_NORM:
3784 case PROJECTOR_TYPE_LDP:
3785 case PROJECTOR_TYPE_LDPV2:
3786 {
3787 // llava and other models
3788 std::vector<int32_t> positions(n_pos);
3789 for (int i = 0; i < n_pos; i++) {
3790 positions[i] = i;
3791 }
3792 set_input_i32("positions", positions);
3793
3794 // The patches vector is used to get rows to index into the embeds with;
3795 // we should skip dim 0 only if we have CLS to avoid going out of bounds
3796 // when retrieving the rows.
3797 int patch_offset = model.class_embedding ? 1 : 0;
3798 std::vector<int32_t> patches(num_patches);
3799 for (int i = 0; i < num_patches; i++) {
3800 patches[i] = i + patch_offset;
3801 }
3802 set_input_i32("patches", patches);
3803 } break;
3804 case PROJECTOR_TYPE_GEMMA3:
3805 case PROJECTOR_TYPE_GEMMA3NV:
3806 case PROJECTOR_TYPE_IDEFICS3:
3807 case PROJECTOR_TYPE_INTERNVL:
3808 case PROJECTOR_TYPE_QWEN2A:
3809 case PROJECTOR_TYPE_GLMA:
3810 case PROJECTOR_TYPE_ULTRAVOX:
3811 case PROJECTOR_TYPE_LFM2:
3812 case PROJECTOR_TYPE_VOXTRAL:
3813 case PROJECTOR_TYPE_MUSIC_FLAMINGO:
3814 case PROJECTOR_TYPE_JANUS_PRO:
3815 case PROJECTOR_TYPE_COGVLM:
3816 {
3817 // do nothing
3818 } break;
3819 case PROJECTOR_TYPE_LLAMA4:
3820 {
3821 // set the 2D positions
3822 int n_patches_per_col = image_size_width / patch_size;
3823 std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
3824 // last pos is always kept 0, it's for CLS
3825 // dimension H
3826 for (int i = 0; i < num_patches; i++) {
3827 pos_data[i] = (i / n_patches_per_col) + 1;
3828 }
3829 set_input_i32("pos_h", pos_data);
3830 // dimension W
3831 for (int i = 0; i < num_patches; i++) {
3832 pos_data[i] = (i % n_patches_per_col) + 1;
3833 }
3834 set_input_i32("pos_w", pos_data);
3835 } break;
3836 case PROJECTOR_TYPE_LFM2A:
3837 {
3838 GGML_ASSERT(imgs.entries.size() == 1);
3839 const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
3840
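                // relative sinusoidal position table: seq_len = 2*n_frames - 1 covers
                // offsets from +(n_frames-1) down to -(n_frames-1); even dims get sin,
                // odd dims get cos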
3841 auto d_model = 512;
3842 auto seq_len = n_frames * 2 - 1;
3843 std::vector<float> pos_emb(d_model*seq_len);
3844 std::vector<double> inv_freq(d_model / 2);
3845 for (size_t i = 0; i < inv_freq.size(); ++i) {
3846 inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
3847 }
3848 for (int64_t pos = 0; pos < seq_len; ++pos) {
3849 for (size_t i = 0; i < inv_freq.size(); ++i) {
3850 const float ang = (n_frames - pos - 1) * inv_freq[i];
3851 pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
3852 pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
3853 }
3854 }
3855 set_input_f32("pos_emb", pos_emb);
3856 } break;
3857 default:
3858 GGML_ABORT("Unknown projector type");
3859 }
3860
3861 // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
3862 ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
3863 ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
3864 if (reg) {
3865 auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
3866 if (ggml_backend_set_n_threads_fn) {
3867 ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads);
3868 }
3869 }
3870
3871 auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
3872 if (status != GGML_STATUS_SUCCESS) {
3873 LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status);
3874 return false;
3875 }
3876
3877 // the last node is the embedding tensor
3878 ggml_tensor * embeddings = ggml_graph_node(gf, -1);
3879
3880 // sanity check (only support batch size of 1 for now)
3881 const int n_tokens_out = embeddings->ne[1];
3882 const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get());
3883 if (n_tokens_out != expected_n_tokens_out) {
3884 LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out);
3885 GGML_ABORT("Invalid number of output tokens");
3886 }
3887
3888 // copy the embeddings to the location passed by the user
3889 if (vec != nullptr) {
3890 ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
3891 }
3892
3893 // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
3894 if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
3895 const int64_t n_embd = embeddings->ne[0];
3896 const int64_t n_tokens = embeddings->ne[1];
3897 std::vector<float> emb_data(n_embd * n_tokens);
3898 ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));
3899
3900 LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
3901 LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens);
3902
3903 // Print first few values of first token
3904 LOG_INF("Token 0 (first 16 values): ");
3905 for (int i = 0; i < std::min((int64_t)16, n_embd); i++) {
3906 LOG_INF("%.6f ", emb_data[i]);
3907 }
3908 LOG_INF("\n");
3909
3910 // Print last few values of first token
3911 if (n_embd > 16) {
3912 LOG_INF("Token 0 (last 16 values): ");
3913 for (int64_t i = n_embd - 16; i < n_embd; i++) {
3914 LOG_INF("%.6f ", emb_data[i]);
3915 }
3916 LOG_INF("\n");
3917 }
3918
3919 // Compute and print statistics
3920 float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0];
3921 for (size_t i = 0; i < emb_data.size(); i++) {
3922 sum += emb_data[i];
3923 sum_sq += emb_data[i] * emb_data[i];
3924 min_val = std::min(min_val, emb_data[i]);
3925 max_val = std::max(max_val, emb_data[i]);
3926 }
3927 float mean = sum / emb_data.size();
3928 float variance = (sum_sq / emb_data.size()) - (mean * mean);
3929 LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n",
3930 mean, sqrtf(variance), min_val, max_val, sum);
3931 LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n");
3932 }
3933
3934 return true;
3935}
3936
3937int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
3938 switch (ctx->model.proj_type) {
3939 case PROJECTOR_TYPE_LDP:
3940 return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
3941 case PROJECTOR_TYPE_LDPV2:
3942 return ctx->model.mm_model_peg_0_b->ne[0];
3943 case PROJECTOR_TYPE_MLP:
3944 case PROJECTOR_TYPE_PIXTRAL:
3945 case PROJECTOR_TYPE_LIGHTONOCR:
3946 return ctx->model.mm_2_w->ne[1];
3947 case PROJECTOR_TYPE_MLP_NORM:
3948 return ctx->model.mm_3_b->ne[0];
3949 case PROJECTOR_TYPE_MINICPMV:
3950 return ctx->model.mm_model_proj->ne[0];
3951 case PROJECTOR_TYPE_GLM_EDGE:
3952 return ctx->model.mm_model_mlp_3_w->ne[1];
3953 case PROJECTOR_TYPE_QWEN2VL:
3954 case PROJECTOR_TYPE_QWEN25VL:
3955 case PROJECTOR_TYPE_JANUS_PRO:
3956 case PROJECTOR_TYPE_YOUTUVL:
3957 return ctx->model.mm_1_b->ne[0];
3958 case PROJECTOR_TYPE_QWEN3VL:
3959 // main path + deepstack paths
3960 return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
3961 case PROJECTOR_TYPE_GEMMA3:
3962 case PROJECTOR_TYPE_GEMMA3NV:
3963 return ctx->model.mm_input_proj_w->ne[0];
3964 case PROJECTOR_TYPE_IDEFICS3:
3965 return ctx->model.projection->ne[1];
3966 case PROJECTOR_TYPE_ULTRAVOX:
3967 case PROJECTOR_TYPE_VOXTRAL:
3968 case PROJECTOR_TYPE_MUSIC_FLAMINGO:
3969 return ctx->model.mm_2_w->ne[1];
3970 case PROJECTOR_TYPE_INTERNVL:
3971 return ctx->model.mm_3_w->ne[1];
3972 case PROJECTOR_TYPE_LLAMA4:
3973 return ctx->model.mm_model_proj->ne[1];
3974 case PROJECTOR_TYPE_QWEN2A:
3975 return ctx->model.mm_fc_w->ne[1];
3976 case PROJECTOR_TYPE_GLMA:
3977 return ctx->model.mm_2_w->ne[1];
3978 case PROJECTOR_TYPE_LFM2:
3979 case PROJECTOR_TYPE_KIMIVL:
3980 case PROJECTOR_TYPE_KIMIK25:
3981 return ctx->model.mm_2_w->ne[1];
3982 case PROJECTOR_TYPE_COGVLM:
3983 return ctx->model.mm_4h_to_h_w->ne[1];
3984 case PROJECTOR_TYPE_LFM2A:
3985 return ctx->model.position_embeddings->ne[0];
3986 case PROJECTOR_TYPE_GLM4V:
3987 return ctx->model.mm_ffn_down_w->ne[1];
3988 default:
3989 GGML_ABORT("Unknown projector type");
3990 }
3991}
3992
3993int clip_is_minicpmv(const struct clip_ctx * ctx) {
3994 // TODO: remove this function
3995 if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
3996 return ctx->model.hparams.minicpmv_version;
3997 }
3998 return 0;
3999}
4000
4001bool clip_is_glm(const struct clip_ctx * ctx) {
4002 // TODO: remove this function
4003 return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
4004}
4005
4006bool clip_is_llava(const struct clip_ctx * ctx) {
4007 return ctx->model.hparams.has_llava_projector;
4008}
4009
4010bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
4011 return ctx->model.modality == CLIP_MODALITY_VISION;
4012}
4013
4014bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
4015 return ctx->model.modality == CLIP_MODALITY_AUDIO;
4016}
4017
4018bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
4019 switch (ctx->proj_type()) {
4020 case PROJECTOR_TYPE_ULTRAVOX:
4021 case PROJECTOR_TYPE_QWEN2A:
4022 case PROJECTOR_TYPE_GLMA:
4023 case PROJECTOR_TYPE_VOXTRAL:
4024 case PROJECTOR_TYPE_MUSIC_FLAMINGO:
4025 return true;
4026 default:
4027 return false;
4028 }
4029}
4030
bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
    clip_image_f32 clip_img;
    clip_img.buf.resize(h * w * 3);
    for (int i = 0; i < h * w * 3; i++) {
        clip_img.buf[i] = img[i];
    }
    clip_img.nx = w;
    clip_img.ny = h;
    return clip_image_encode(ctx, n_threads, &clip_img, vec);
}
4042}
4043
4044//
4045// API used internally with mtmd
4046//
4047
4048projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
4049 return ctx->proj_type();
4050}
4051
4052void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
4053 clip_image_f32 * audio = new clip_image_f32;
4054 audio->nx = n_frames;
4055 audio->ny = n_mel;
4056 audio->buf.resize(n_frames * n_mel);
4057 std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
4058
4059 batch->entries.push_back(clip_image_f32_ptr(audio));
4060 batch->is_audio = true;
4061}
4062
4063const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
4064 return &ctx->model.hparams;
4065}
4066
4067//
4068// API for debugging
4069//
4070void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
4071 clip_image_f32 img;
4072 img.nx = w;
4073 img.ny = h;
4074 img.buf.resize(h * w * 3);
4075 for (int i = 0; i < h * w * 3; i++) {
4076 img.buf[i] = static_cast<float>(fill_value);
4077 }
4078 clip_image_encode(ctx, 1, &img, nullptr);
4079 GGML_ASSERT(img.buf.empty() && "expected, always stop here");
4080}