import argparse
import os
import json
import re

import torch
import numpy as np
from gguf import *
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel

TEXT = "clip.text"
VISION = "clip.vision"


def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)
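
# For example, gguf's KEY_CONTEXT_LENGTH constant is "{arch}.context_length",
# so k(KEY_CONTEXT_LENGTH, TEXT) evaluates to "clip.text.context_length".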


def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False


def get_tensor_name(name: str) -> str:
    # Standardize the transformers llava-next keys for the image newline /
    # mm projector so they match the class names in haotian-liu's LLaVA
    if name == "image_newline":
        return "model.image_newline"
    if name.startswith("multi_modal_projector"):
        name = name.replace("multi_modal_projector", "mm")
        if "linear_1" in name:
            name = name.replace("linear_1", "0")
        if "linear_2" in name:
            name = name.replace("linear_2", "2")
        return name

    if "projection" in name:
        return name
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return (
        name.replace("text_model", "t")
        .replace("vision_model", "v")
        .replace("encoder.layers", "blk")
        .replace("embeddings.", "")
        .replace("_proj", "")
        .replace("self_attn.", "attn_")
        .replace("layer_norm", "ln")
        .replace("layernorm", "ln")
        .replace("mlp.fc1", "ffn_down")
        .replace("mlp.fc2", "ffn_up")
        .replace("embedding", "embd")
        .replace("final", "post")
        .replace("layrnorm", "ln")  # handles the misspelled "layrnorm" seen in some checkpoints
    )
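
# For example:
#   get_tensor_name("vision_model.encoder.layers.0.self_attn.k_proj.weight")
#   -> "v.blk.0.attn_k.weight"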


def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings
    that avoid mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
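
# Illustration of the mapping: printable ASCII maps to itself, while bytes the
# bpe code cannot handle directly are remapped to printable stand-ins:
#   byte_encoder = bytes_to_unicode()
#   byte_encoder[ord("A")]  # -> "A"
#   byte_encoder[ord(" ")]  # -> "Ġ" (chr(0x120))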


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")

# Selectable visual encoders that are compatible with this script
encoder_group = ap.add_mutually_exclusive_group()
encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                           help="The clip model is from openclip (for ViT-SO400M type)")
encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False,
                           help="The visual encoder is Siglip")

ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example: --image-mean 0.48145466 0.4578275 0.40821073 --image-std 0.26862954 0.26130258 0.27577711
# Example: --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

args = ap.parse_args()
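
# Example invocation (hypothetical paths; assumes this script is saved as
# convert_image_encoder_to_gguf.py):
#   python convert_image_encoder_to_gguf.py -m ./llava-model \
#       --llava-projector ./llava-model/llava.projector --output-dir ./out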


if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")
    exit(1)

if args.use_f32:
    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

if (
    args.clip_model_is_vision or
    not os.path.exists(dir_model + "/vocab.json") or
    args.clip_model_is_openclip or
    args.clip_model_is_siglip
):
    vocab = None
    tokens = None
else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
    if args.clip_model_is_vision:
        v_hparams = config
        t_hparams = None
    else:
        v_hparams = config["vision_config"]
        t_hparams = config["text_config"]
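
# For reference, a two-tower CLIP config.json nests the hparams roughly like
# this (abridged; values shown are from CLIP ViT-L/14):
#   {"projection_dim": 768,
#    "text_config":   {"hidden_size": 768,  "num_hidden_layers": 12, ...},
#    "vision_config": {"hidden_size": 1024, "num_hidden_layers": 24, ...}}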

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0

if args.clip_model_is_siglip:
    model = SiglipVisionModel.from_pretrained(dir_model)
    processor = None
elif args.clip_model_is_vision or args.clip_model_is_openclip:
    model = CLIPVisionModel.from_pretrained(dir_model)
    processor = None
else:
    model = CLIPModel.from_pretrained(dir_model)
    processor = CLIPProcessor.from_pretrained(dir_model)

fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_llava_projector = False
if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
else:
    fname_middle = ""

output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_llava_projector", has_llava_projector)
fout.add_file_type(ftype)
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
fout.add_name(model_name)
if args.text_only:
    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
    # add projector type
    fout.add_string("clip.projector_type", args.projector_type)
else:
    fout.add_description("two-tower CLIP model")

if has_text_encoder:
    assert t_hparams is not None
    assert tokens is not None
    if args.clip_model_is_siglip:
        text_projection_dim = 0
    else:
        text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"])
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
    fout.add_uint32("clip.text.projection_dim", text_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)
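
# With the gguf constants above, the text-encoder keys written here come out as
# e.g. clip.text.context_length, clip.text.embedding_length,
# clip.text.feed_forward_length, clip.text.attention.head_count,
# clip.text.attention.layer_norm_epsilon and clip.text.block_count.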


def get_non_negative_vision_feature_layers(v_hparams):
    """
    Determine the vision feature layer(s) for the llava model, which are indices into the
    hidden states of the visual encoder. Note that the hidden states array generally takes
    the form:

        [<emb input>, <output of enc block 0>, ..., <output of enc block num_hidden_layers>]

    so hidden-state index n+1 holds the output of encoder block n.
    We convert all vision feature layers to non-negative so that -1 can be used in
    the model as an unset value. If no vision feature layer is found, we leave it unset.
    """
    num_hidden_layers = v_hparams["num_hidden_layers"]
    to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
    feature_layers_key = None
    # Key used for llava models in transformers
    if "vision_feature_layer" in config:
        feature_layers_key = "vision_feature_layer"
    # Key used for llava models in the original format
    elif "mm_vision_select_layer" in config:
        feature_layers_key = "mm_vision_select_layer"
    if feature_layers_key is not None:
        feature_layers = config[feature_layers_key]
        if isinstance(feature_layers, int):
            feature_layers = [feature_layers]
        return [to_non_negative(feature_layer) for feature_layer in feature_layers]
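
# For example, with num_hidden_layers = 24, a configured layer of -2 maps to
# 24 + (-2) + 1 = 23, i.e. the hidden state holding the output of encoder block 22.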

# Determine if we have explicitly specified vision feature layers in our config
feature_layers = get_non_negative_vision_feature_layers(v_hparams)

if has_vision_encoder:
    # Siglip does not have a visual projector; set projection dim to 0
    if args.clip_model_is_siglip:
        visual_projection_dim = 0
    else:
        visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])

    # set vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
    if feature_layers:
        block_count = max(feature_layers)
    else:
        block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
    # "image_grid_pinpoints" is a list of resolution pairs, e.g.
    #   [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
    # and is stored flattened:
    #   [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]
    if "image_grid_pinpoints" in v_hparams:
        # flatten it
        image_grid_pinpoints = []
        for pinpoint in v_hparams["image_grid_pinpoints"]:
            for p in pinpoint:
                image_grid_pinpoints.append(p)
        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
    if "image_crop_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
    if "image_aspect_ratio" in v_hparams:
        fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
    if "image_split_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
    if "mm_patch_merge_type" in v_hparams:
        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
    if "mm_projector_type" in v_hparams:
        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
    if feature_layers:
        fout.add_array("clip.vision.feature_layer", feature_layers)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std  # pyright: ignore[reportAttributeAccessIssue]
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

use_gelu = v_hparams["hidden_act"] == "gelu"
fout.add_bool("clip.use_gelu", use_gelu)
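
# Note: this only checks for the exact string "gelu"; checkpoints whose config
# uses a variant such as "gelu_pytorch_tanh" (common for Siglip) will record
# use_gelu = False here.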


if has_llava_projector:
    # By default, we drop the last layer for llava projector
    # models unless we have explicitly set vision feature layers
    if feature_layers is None:
        model.vision_model.encoder.layers.pop(-1)
    else:
        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]

    # map_location="cpu" lets projectors saved on GPU load on CPU-only machines
    projector = torch.load(args.llava_projector, map_location="cpu")
    for name, data in projector.items():
        name = get_tensor_name(name)
        # pointwise (pw) and depthwise (dw) conv weights have ndim == 4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)

        fout.add_tensor(name, data)

    print("Projector tensors added\n")

state_dict = model.state_dict()
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name.endswith(".weight") and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
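
# To sanity-check the output, the gguf package's reader can be used (a minimal
# sketch; pass whatever path the script printed as fname_out):
#   from gguf import GGUFReader
#   reader = GGUFReader(fname_out)
#   print([t.name for t in reader.tensors][:5])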