Diffstat (limited to 'llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py')
-rw-r--r--  llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py  892
1 file changed, 892 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
new file mode 100644
index 0000000..944037e
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -0,0 +1,892 @@
# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Siglip model."""
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added


import os
import math
import warnings

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out

from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

class SiglipVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    Example:
    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel
    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipVisionConfig()
    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipVisionModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip
]

# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )
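# Illustrative trace (hedged, not executed): for a padded batch mask
#   [[1, 1, 1, 0],
#    [1, 1, 0, 0]]
# seqlens_in_batch == [3, 2], indices == [0, 1, 2, 4, 5] (positions of the real
# tokens in the flattened mask), cu_seqlens == [0, 3, 5], max_seqlen_in_batch == 3.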

def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
        og_dtype = tensor.dtype
        tensor = tensor.to(torch.float32)
        tensor.erfinv_()
        tensor = tensor.to(og_dtype)
    else:
        tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not (yet?) defined in float16+cpu
        tensor = tensor.to(torch.float32)
        tensor.clamp_(min=a, max=b)
        tensor = tensor.to(torch.float16)
    else:
        tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
):
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \\text{mean} \\leq b`.
    NOTE: this 'tf' variant behaves closer to the Tensorflow / JAX impl, where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0,
    and the result is subsequently scaled and shifted by the mean and std args.
    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)
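# Usage sketch (hedged; `w` is a hypothetical weight tensor, not part of this file):
#   w = torch.empty(64, 64)
#   trunc_normal_tf_(w, std=0.02)
# Every entry then lies inside [mean + a*std, mean + b*std] == [-0.04, 0.04].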


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    denom = fan_in
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")
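# Rough numbers for intuition (assuming a hypothetical nn.Linear(768, 3072) weight,
# so fan_in == 768):
#   lecun_normal_           -> truncated normal, std ~ sqrt(1/768) / 0.8796 ~ 0.041
#   default_flax_embed_init -> plain normal,     std = sqrt(1/768)          ~ 0.036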

class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.self_attn = SiglipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

class SiglipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


SIGLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.
    Parameters:
        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


SIGLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].
    Args:
        config: SiglipVisionConfig
    """

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embedding

import argparse
import json
import re

from gguf import *
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig

TEXT = "clip.text"
VISION = "clip.vision"


def add_key_str(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if has_minicpmv and name in ["visual_projection.weight"]:
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False


def get_tensor_name(name: str) -> str:
    if "projection" in name:
        return name
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return (
        name.replace("text_model", "t")
        .replace("vision_model", "v")
        .replace("encoder.layers", "blk")
        .replace("embeddings.", "")
        .replace("_proj", "")
        .replace("self_attn.", "attn_")
        .replace("layer_norm", "ln")
        .replace("layernorm", "ln")
        .replace("mlp.fc1", "ffn_down")
        .replace("mlp.fc2", "ffn_up")
        .replace("embedding", "embd")
        .replace("final", "post")
        .replace("layrnorm", "ln")
    )
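# Example mapping (hedged): the chained replacements above turn
#   "vision_model.encoder.layers.0.self_attn.q_proj.weight"
# into the GGUF-style name
#   "v.blk.0.attn_q.weight"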


def bytes_to_unicode():
    """
    Returns a mapping between utf-8 bytes and unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters that the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
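# Quick sanity check (hedged): the table is a bijection over all 256 byte values, e.g.
#   table = bytes_to_unicode()
#   assert table[ord("!")] == "!"    # printable bytes map to themselves
#   assert table[0] == chr(256)      # non-printable bytes are remapped past Latin-1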


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example: --image-mean 0.48145466 0.4578275 0.40821073 --image-std 0.26862954 0.26130258 0.27577711
# Example: --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
default_image_mean = [0.5, 0.5, 0.5]
default_image_std = [0.5, 0.5, 0.5]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2)
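
# Typical invocation (hedged example; the paths and version number are placeholders):
#   python minicpmv-convert-image-encoder-to-gguf.py -m ./MiniCPM-V-2_6 \
#       --minicpmv-projector ./MiniCPM-V-2_6/minicpmv.projector --minicpmv_version 3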

args = ap.parse_args()


if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")
    exit(1)

if args.use_f32:
    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

# Read config.json to get the actual model configuration
config_path = os.path.join(dir_model, "config.json")
model_config = {}
if os.path.isfile(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        model_config = json.load(f)
    print(f"Loaded config from {config_path}")
else:
    print(f"Warning: config.json not found at {config_path}")

# If minicpmv_projector is not specified but the default path exists, use the default path
if args.minicpmv_projector is None:
    default_projector_path = os.path.join(dir_model, "minicpmv.projector")
    if os.path.isfile(default_projector_path):
        args.minicpmv_projector = default_projector_path
        print(f"Found default projector file: {default_projector_path}")

# If output_dir is not specified, use model_dir as the default value
if args.output_dir is None:
    args.output_dir = dir_model

if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
    vocab = None
    tokens = None
else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
    tokens = [key for key in vocab]

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0
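# e.g. the default writes "<fname_middle>model-f16.gguf" and --use-f32 writes
# "<fname_middle>model-f32.gguf" (see fname_out below).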

# if args.clip_model_is_vision or args.clip_model_is_openclip:
#     model = CLIPVisionModel.from_pretrained(dir_model)
#     processor = None
# else:
#     model = CLIPModel.from_pretrained(dir_model)
#     processor = CLIPProcessor.from_pretrained(dir_model)

minicpmv_version = args.minicpmv_version

# Use actual config values instead of hardcoded ones
if model_config:
    # For the projector/resampler, use the main model's hidden_size
    emb_dim = model_config.get("hidden_size", 1536)

    # For the vision model, use vision_config values
    vision_config_dict = model_config.get("vision_config", {})
    default_vision_config = {
        "hidden_size": vision_config_dict.get("hidden_size", 1152),
        "image_size": vision_config_dict.get("image_size", 980),
        "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
        "model_type": vision_config_dict.get("model_type", "siglip"),
        "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
        "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
        "patch_size": vision_config_dict.get("patch_size", 14),
    }

    # Use the vision model's num_hidden_layers for block_count
    block_count = vision_config_dict.get("num_hidden_layers", 27)

    print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
    print(f"Vision config: {default_vision_config}")
else:
    # Fall back to the original hardcoded logic if config.json was not found
    emb_dim = 4096
    block_count = 26
    if minicpmv_version == 1:
        emb_dim = 2304
        block_count = 26
    elif minicpmv_version == 2:
        emb_dim = 4096
        block_count = 27
    elif minicpmv_version == 3:
        emb_dim = 3584
        block_count = 27
    elif minicpmv_version == 4:
        emb_dim = 3584
        block_count = 27
    elif minicpmv_version == 5:
        emb_dim = 2560
        block_count = 27
    elif minicpmv_version == 6:
        emb_dim = 4096
        block_count = 27
    elif minicpmv_version == 100045:
        emb_dim = 4096
        block_count = 27

    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "idefics2",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    }

vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 5:
    default_vision_config["model_type"] = "siglip_vision_model"
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 6:
    default_vision_config["model_type"] = "siglip_vision_model"
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 100045:
    default_vision_config["model_type"] = "siglip_vision_model"
    vision_config = SiglipVisionConfig(**default_vision_config)
    model = SiglipVisionTransformer(vision_config)

processor = None
# if model.attn_pool is not None:
#     model.attn_pool = torch.nn.Identity()

# model.blocks = model.blocks[:-1]
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))

fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False

if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
elif args.minicpmv_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_minicpmv_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
else:
    fname_middle = ""

output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)
if args.text_only:
    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
    fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
    fout.add_description("image encoder for MiniCPM-V")
    # add projector type
    fout.add_string("clip.projector_type", "resampler")
    fout.add_int32("clip.minicpmv_version", minicpmv_version)
else:
    fout.add_description("two-tower CLIP model")

if has_vision_encoder:
    # vision_model hparams - use actual config values
    vision_image_size = model_config.get("image_size", 448) if model_config else 448
    vision_patch_size = default_vision_config.get("patch_size", 14)
    vision_hidden_size = default_vision_config.get("hidden_size", 1152)
    vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
    vision_attention_heads = default_vision_config.get("num_attention_heads", 16)

    fout.add_uint32("clip.vision.image_size", vision_image_size)
    fout.add_uint32("clip.vision.patch_size", vision_patch_size)
    fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
    fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
    fout.add_uint32("clip.vision.projection_dim", 0)
    fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
    fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

    # Add MiniCPM-V specific parameters
    query_num = model_config.get("query_num", 0) if model_config else 0
    resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
    fout.add_uint32("clip.minicpmv_query_num", query_num)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)

def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb
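# Shape check (hedged): get_1d_sincos_pos_embed_from_grid(8, np.arange(3)) returns a
# (3, 8) array; the first 4 columns hold sin(pos * omega), the last 4 hold cos.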

def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
    else:
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed
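# For reference: the resampler code below calls get_2d_sincos_pos_embed(emb_dim, (70, 70)),
# a (4900, emb_dim) table; 70 patches per side matches image_size 980 / patch_size 14.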

def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
        }
    if re.match("resampler.proj", s):
        return {
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
        return {
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
        }
    return {s: v}
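# Example (hedged): "resampler.attn.in_proj_weight" of shape (3*D, D) is split into
# three (D, D) chunks stored as attn.q/attn.k/attn.v weights; the (3*D,)
# "resampler.attn.in_proj_bias" splits the same way.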

if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    new_state_dict = {}
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict
    ftype_cur = 0
    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
        if ftype == 1:
            if name[-7:] == ".weight" and n_dims == 2:
                print("  Converting to float16")
                data = data.astype(np.float16)
                ftype_cur = 1
            else:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0
        else:
            if data.dtype != np.float32:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")

def _replace_name(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):
        v = v.unsqueeze(0)
        return {s: v}

    return {s: v}

state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)