# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
 15""" PyTorch Siglip model. """
 16# Copied from  HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
 17
 18
import os
import math
import warnings

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

class SiglipVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:

    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipVisionConfig()

    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip
]

# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )

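# A minimal illustration (not executed) of what `_get_unpad_data` returns for a small,
# hypothetical 2x3 padding mask; the values below were traced by hand from the code above:
#
#   >>> mask = torch.tensor([[1, 1, 0], [1, 1, 1]])
#   >>> indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
#   >>> indices      # flat positions of the non-padded tokens
#   tensor([0, 1, 3, 4, 5])
#   >>> cu_seqlens   # cumulative sequence lengths, prefixed with 0
#   tensor([0, 2, 5], dtype=torch.int32)
#   >>> max_seqlen
#   3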

def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
        og_dtype = tensor.dtype
        tensor = tensor.to(torch.float32)
        tensor.erfinv_()
        tensor = tensor.to(og_dtype)
    else:
        tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not (yet?) defined in float16+cpu
        tensor = tensor.to(torch.float32)
        tensor.clamp_(min=a, max=b)
        tensor = tensor.to(torch.float16)
    else:
        tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
):
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \\text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to the TensorFlow / JAX implementation, where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0,
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)

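# Illustrative comparison (kept as a comment so importing this module stays side-effect free):
# unlike `torch.nn.init.trunc_normal_`, the bounds [a, b] here clip the *standard* normal draw
# before it is scaled by `std` and shifted by `mean`.
#
#   >>> w = torch.empty(768, 768)
#   >>> trunc_normal_tf_(w, std=0.02)        # values land roughly in [-0.04, 0.04]
#   >>> nn.init.trunc_normal_(w, std=0.02)   # bounds [-2.0, 2.0] apply to the final values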

def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    denom = fan_in
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")

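# A small usage sketch (comment only, hypothetical shapes): these helpers reproduce JAX/Flax-style
# initializers, e.g. LeCun normal for conv/linear weights, and are applied in `_init_weights` below.
#
#   >>> w = torch.empty(1152, 588)   # hypothetical (out_features, in_features) weight
#   >>> lecun_normal_(w)             # truncated normal with variance 1 / fan_in
#   >>> default_flax_embed_init(w)   # plain normal with variance 1 / fan_in
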
class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

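# Worked example (using the values that appear later in this file): with the MiniCPM-V style
# vision config of image_size=980 and patch_size=14,
#   num_patches_per_side = 980 // 14 = 70
#   num_positions        = 70 * 70  = 4900
# which matches the (70, 70) sincos position-embedding grid built for the resampler below.
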
class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.self_attn = SiglipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

class SiglipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


SIGLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


SIGLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipVisionConfig
    """

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embedding

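# ---------------------------------------------------------------------------------------------
# Everything below this point is the MiniCPM-V image-encoder to GGUF conversion script. It
# instantiates one of the vision transformers defined above, loads the `minicpmv.clip` and
# (optionally) `minicpmv.projector` weights from the model directory, and writes a
# `{prefix}model-{f16|f32}.gguf` file for use with llama.cpp.
# ---------------------------------------------------------------------------------------------
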
import argparse
import json
import re

from gguf import *
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig

TEXT = "clip.text"
VISION = "clip.vision"


def add_key_str(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)

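# For example, assuming KEY_EMBEDDING_LENGTH re-exported by `gguf` is the usual
# "{arch}.embedding_length" template:
#   add_key_str(KEY_EMBEDDING_LENGTH, VISION) -> "clip.vision.embedding_length"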

def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if has_minicpmv and name in ["visual_projection.weight"]:
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False


def get_tensor_name(name: str) -> str:
    if "projection" in name:
        return name
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return (
        name.replace("text_model", "t")
        .replace("vision_model", "v")
        .replace("encoder.layers", "blk")
        .replace("embeddings.", "")
        .replace("_proj", "")
        .replace("self_attn.", "attn_")
        .replace("layer_norm", "ln")
        .replace("layernorm", "ln")
        .replace("mlp.fc1", "ffn_down")
        .replace("mlp.fc2", "ffn_up")
        .replace("embedding", "embd")
        .replace("final", "post")
        .replace("layrnorm", "ln")
    )

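# Hand-traced examples of the renaming chain above (not executed):
#   "vision_model.encoder.layers.0.self_attn.q_proj.weight" -> "v.blk.0.attn_q.weight"
#   "vision_model.embeddings.patch_embedding.weight"        -> "v.patch_embd.weight"
#   "vision_model.post_layernorm.weight"                    -> "v.post_ln.weight"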

def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    It also avoids mapping to whitespace/control characters that the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

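# Hand-checked examples of the mapping above (not executed):
#   bytes_to_unicode()[ord("A")] == "A"   # printable ASCII bytes map to themselves
#   bytes_to_unicode()[0] == "Ā"          # non-printable bytes are remapped to U+0100 and upward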

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image-mean 0.48145466 0.4578275 0.40821073 --image-std 0.26862954 0.26130258 0.27577711
# Example --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
default_image_mean = [0.5, 0.5, 0.5]
default_image_std = [0.5, 0.5, 0.5]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2)

args = ap.parse_args()


if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")
    exit(1)

if args.use_f32:
    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

# Read config.json to get the actual model configuration
config_path = os.path.join(dir_model, "config.json")
model_config = {}
if os.path.isfile(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        model_config = json.load(f)
    print(f"Loaded config from {config_path}")
else:
    print(f"Warning: config.json not found at {config_path}")

# If minicpmv_projector is not specified but the default path exists, use the default path
if args.minicpmv_projector is None:
    default_projector_path = os.path.join(dir_model, "minicpmv.projector")
    if os.path.isfile(default_projector_path):
        args.minicpmv_projector = default_projector_path
        print(f"Found default projector file: {default_projector_path}")

# If output_dir is not specified, use model_dir as the default value
if args.output_dir is None:
    args.output_dir = dir_model

if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
    vocab = None
    tokens = None
else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0

# if args.clip_model_is_vision or args.clip_model_is_openclip:
#     model = CLIPVisionModel.from_pretrained(dir_model)
#     processor = None
# else:
#     model = CLIPModel.from_pretrained(dir_model)
#     processor = CLIPProcessor.from_pretrained(dir_model)

minicpmv_version = args.minicpmv_version

# Use actual config values instead of hardcoded ones
if model_config:
    # For the projector/resampler, use the main model's hidden_size
    emb_dim = model_config.get("hidden_size", 1536)

    # For the vision model, use vision_config values
    vision_config_dict = model_config.get("vision_config", {})
    default_vision_config = {
        "hidden_size": vision_config_dict.get("hidden_size", 1152),
        "image_size": vision_config_dict.get("image_size", 980),
        "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
        "model_type": vision_config_dict.get("model_type", "siglip"),
        "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
        "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
        "patch_size": vision_config_dict.get("patch_size", 14),
    }

    # Use the vision model's num_hidden_layers for block_count
    block_count = vision_config_dict.get("num_hidden_layers", 27)

    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
    print(f"Vision config: {default_vision_config}")
else:
    # Fall back to the original hardcoded logic if config.json was not found
    emb_dim = 4096
    block_count = 26
    if minicpmv_version == 1:
        emb_dim = 2304
        block_count = 26
    elif minicpmv_version == 2:
        emb_dim = 4096
        block_count = 27
    elif minicpmv_version == 3:
        emb_dim = 3584
        block_count = 27
    elif minicpmv_version == 4:
        emb_dim = 3584
        block_count = 27
    elif minicpmv_version == 5:
        emb_dim = 2560
        block_count = 27
    elif minicpmv_version == 6:
        emb_dim = 4096
        block_count = 27
    elif minicpmv_version == 100045:
        emb_dim = 4096
        block_count = 27

    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "idefics2",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version in (5, 6, 100045):
    default_vision_config["model_type"] = "siglip_vision_model"
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)

processor = None
# if model.attn_pool is not None:
#     model.attn_pool = torch.nn.Identity()

# model.blocks = model.blocks[:-1]
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))

fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False

if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_minicpmv_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
else:
    fname_middle = ""

output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)
if args.text_only:
    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
    fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
    fout.add_description("image encoder for MiniCPM-V")
    # add projector type
    fout.add_string("clip.projector_type", "resampler")
    fout.add_int32("clip.minicpmv_version", minicpmv_version)
else:
    fout.add_description("two-tower CLIP model")

if has_vision_encoder:
    # vision_model hparams - use actual config values
    vision_image_size = model_config.get("image_size", 448) if model_config else 448
    vision_patch_size = default_vision_config.get("patch_size", 14)
    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)

    fout.add_uint32("clip.vision.image_size", vision_image_size)
    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    # Add MiniCPM-V specific parameters
    query_num = model_config.get("query_num", 0) if model_config else 0
    resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
    fout.add_uint32("clip.minicpmv_query_num", query_num)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)

def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb

def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
    else:
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed

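# Shape check for the helpers above (hand-derived, not executed): for a (70, 70) grid,
# `get_2d_sincos_pos_embed(emb_dim, (70, 70))` concatenates a sin/cos encoding of the row index
# with one of the column index, giving an array of shape (70 * 70, emb_dim), i.e. one
# emb_dim-dimensional position vector per key position of the resampler.
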
def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
        }
    if re.match("resampler.proj", s):
        return {
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
        return {
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
        }
    return {s: v}

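# Hand-traced example of the key rewriting above (not executed): the fused MultiheadAttention
# parameter "resampler.attn.in_proj_weight", of shape (3 * emb_dim, emb_dim), is split into
#   "resampler.attn.q.weight", "resampler.attn.k.weight", "resampler.attn.v.weight"
# each of shape (emb_dim, emb_dim), so the projections are stored as separate q/k/v tensors
# in the GGUF file.
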
if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    new_state_dict = {}
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict
    ftype_cur = 0
    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
        if ftype == 1:
            if name[-7:] == ".weight" and n_dims == 2:
                print("  Converting to float16")
                data = data.astype(np.float16)
                ftype_cur = 1
            else:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0
        else:
            if data.dtype != np.float32:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")

def _replace_name(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):
        v = v.unsqueeze(0)
        return {s: v}

    return {s: v}

state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)