import argparse
import os
import json
import re

import torch
import numpy as np
from gguf import *
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel

TEXT = "clip.text"
VISION = "clip.vision"


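# Fill the "{arch}" placeholder in a gguf KEY_* template with the given architecture prefix
# (TEXT or VISION above).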
def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


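# Decide whether a checkpoint tensor should be omitted from the GGUF file, based on which
# encoders are exported and whether this is a LLaVA projector build.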
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True

    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
        return True

    if name.startswith("v") and not has_vision:
        return True

    if name.startswith("t") and not has_text:
        return True

    return False


def get_tensor_name(name: str) -> str:
    # Standardize the transformers LLaVA-NeXT keys for the image newline / mm projector
    # so they match the naming used by the haotian-liu LLaVA classes
    if name == "image_newline":
        return "model.image_newline"
    if name.startswith("multi_modal_projector"):
        name = name.replace("multi_modal_projector", "mm")
        if "linear_1" in name:
            name = name.replace("linear_1", "0")
        if "linear_2" in name:
            name = name.replace("linear_2", "2")
        return name

    if "projection" in name:
        return name
    if "mm_projector" in name:
        name = name.replace("model.mm_projector", "mm")
        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
        return name

    return (
        name.replace("text_model", "t")
        .replace("vision_model", "v")
        .replace("encoder.layers", "blk")
        .replace("embeddings.", "")
        .replace("_proj", "")
        .replace("self_attn.", "attn_")
        .replace("layer_norm", "ln")
        .replace("layernorm", "ln")
        .replace("mlp.fc1", "ffn_down")
        .replace("mlp.fc2", "ffn_up")
        .replace("embedding", "embd")
        .replace("final", "post")
        .replace("layrnorm", "ln")
    )


def bytes_to_unicode():
    """
    Returns a mapping between utf-8 bytes and unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters that the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine")
ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")

# Selectable visual encoders that are compatible with this script
encoder_group = ap.add_mutually_exclusive_group()
encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type)")
encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False,
                help="The visual encoder is SigLIP.")

ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image-mean 0.48145466 0.4578275 0.40821073 --image-std 0.26862954 0.26130258 0.27577711
# Example --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

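# Example invocation for a LLaVA image encoder export (script name and paths are illustrative):
#   python convert_image_encoder_to_gguf.py -m ./llava-vision-tower --llava-projector ./llava.projector -o ./out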
args = ap.parse_args()


if args.text_only and args.vision_only:
    print("--text-only and --vision-only arguments cannot be specified at the same time.")
    exit(1)

if args.use_f32:
    print("WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")

# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

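# Pure-vision checkpoints (and OpenCLIP / SigLIP exports) ship without a text vocab.json,
# so skip loading the tokenizer vocabulary in those cases.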
if (
    args.clip_model_is_vision or
    not os.path.exists(dir_model + "/vocab.json") or
    args.clip_model_is_openclip or
    args.clip_model_is_siglip
):
    vocab = None
    tokens = None
else:
    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
        vocab = json.load(f)
        tokens = [key for key in vocab]

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
    if args.clip_model_is_vision:
        v_hparams = config
        t_hparams = None
    else:
        v_hparams = config["vision_config"]
        t_hparams = config["text_config"]

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if args.use_f32:
    ftype = 0

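# Load the checkpoint with the matching transformers class. Vision-only variants have no
# CLIPProcessor, so image normalization falls back to the CLI / default values further down.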
if args.clip_model_is_siglip:
    model = SiglipVisionModel.from_pretrained(dir_model)
    processor = None
elif args.clip_model_is_vision or args.clip_model_is_openclip:
    model = CLIPVisionModel.from_pretrained(dir_model)
    processor = None
else:
    model = CLIPModel.from_pretrained(dir_model)
    processor = CLIPProcessor.from_pretrained(dir_model)

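# Decide which sub-models are written to the output file and how the file name is prefixed.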
fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_llava_projector = False
if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
else:
    fname_middle = ""

output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG)

fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_llava_projector", has_llava_projector)
fout.add_file_type(ftype)
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
fout.add_name(model_name)
if args.text_only:
    fout.add_description("text-only CLIP model")
elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
    # add projector type
    fout.add_string("clip.projector_type", args.projector_type)
else:
    fout.add_description("two-tower CLIP model")

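# Text encoder hyperparameters and tokenizer vocabulary.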
if has_text_encoder:
    assert t_hparams is not None
    assert tokens is not None
    if args.clip_model_is_siglip:
        text_projection_dim = 0
    else:
        text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"])
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
    fout.add_uint32("clip.text.projection_dim", text_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)


def get_non_negative_vision_feature_layers(v_hparams):
    """
    Determine the vision feature layer(s) for the llava model, which are indices into the
    hidden states of the visual encoder. Note that the hidden states array generally takes the
    form:

        [<emb input>, <output of enc block 0>, ... <output of enc block num_hidden_layers>]

    so feature indices should be offset as n+1 to get the output of encoder block n.
    We convert all vision feature layers to non-negative so that -1 can be used in
    the model as an unset value. If no vision feature layer is found, we leave it unset.
    """
    num_hidden_layers = v_hparams["num_hidden_layers"]
    to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1
    feature_layers_key = None
    # Key used for llava models in transformers
    if "vision_feature_layer" in config:
        feature_layers_key = "vision_feature_layer"
    # Key used for llava models in the original format
    elif "mm_vision_select_layer" in config:
        feature_layers_key = "mm_vision_select_layer"
    if feature_layers_key is not None:
        feature_layers = config[feature_layers_key]
        if isinstance(feature_layers, int):
            feature_layers = [feature_layers]
        return [to_non_negative(feature_layer) for feature_layer in feature_layers]


# Determine if we have explicitly specified vision feature layers in our config
feature_layers = get_non_negative_vision_feature_layers(v_hparams)

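# Vision encoder hyperparameters, optional image grid / projector settings, and image normalization.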
if has_vision_encoder:
    # Siglip does not have a visual projector; set projection dim to 0
    if args.clip_model_is_siglip:
        visual_projection_dim = 0
    else:
        visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])

    # set vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", visual_projection_dim)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
    if feature_layers:
        block_count = max(feature_layers)
    else:
        block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
    # "image_grid_pinpoints" in the config is a list of resolution pairs, e.g.
    #   [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
    # and is written to the GGUF file flattened:
    #   [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]
    if "image_grid_pinpoints" in v_hparams:
        # flatten it
        image_grid_pinpoints = []
        for pinpoint in v_hparams["image_grid_pinpoints"]:
            for p in pinpoint:
                image_grid_pinpoints.append(p)
        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
    if "image_crop_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
    if "image_aspect_ratio" in v_hparams:
        fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
    if "image_split_resolution" in v_hparams:
        fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
    if "mm_patch_merge_type" in v_hparams:
        fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
    if "mm_projector_type" in v_hparams:
        fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
    if feature_layers:
        fout.add_array("clip.vision.feature_layer", feature_layers)

    if processor is not None:
        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean  # pyright: ignore[reportAttributeAccessIssue]
        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std  # pyright: ignore[reportAttributeAccessIssue]
    else:
        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
        image_std = args.image_std if args.image_std is not None else default_image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

use_gelu = v_hparams["hidden_act"] == "gelu"
fout.add_bool("clip.use_gelu", use_gelu)


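# LLaVA projector: trim the vision tower to the layers actually consumed by the projector,
# then write the projector tensors loaded from the standalone llava.projector file.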
if has_llava_projector:
    # By default, we drop the last layer for llava projector
    # models unless we have explicitly set vision feature layers
    if feature_layers is None:
        model.vision_model.encoder.layers.pop(-1)
    else:
        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]

    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
        # pw and dw conv ndim==4
        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)

        fout.add_tensor(name, data)

    print("Projector tensors added\n")

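# Write the encoder tensors. 2-D weight matrices go to f16 unless --use-f32 is given;
# 4-D tensors (conv kernels) are always stored as f16; everything else stays f32.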
state_dict = model.state_dict()
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


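# Flush GGUF header, key/value metadata, and tensor data to disk.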
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)