1#!/usr/bin/env python3
2from __future__ import annotations
3
import argparse
import concurrent.futures
import enum
import faulthandler
import functools
import itertools
import json
import logging
12import math
13import mmap
14import os
15import pickle
16import re
17import signal
18import struct
19import sys
20import textwrap
21import time
22import zipfile
23from abc import ABC, abstractmethod
24from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
25from dataclasses import dataclass
26from pathlib import Path
27from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
28
29import numpy as np
30
31if 'NO_LOCAL_GGUF' not in os.environ:
32 # use .parent.parent since we are in "examples" directory
33 sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
34
35import gguf
36from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
37
38if TYPE_CHECKING:
39 from typing_extensions import Self, TypeAlias
40
41logger = logging.getLogger("convert")
42
43if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
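    # Registering SIGUSR1 means `kill -USR1 <pid>` dumps the tracebacks of all Python
    # threads to stderr, which is useful when a long conversion appears to hang.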
44 faulthandler.register(signal.SIGUSR1)
45
46NDArray: TypeAlias = 'np.ndarray[Any, Any]'
47
48ARCH = gguf.MODEL_ARCH.LLAMA
49
50DEFAULT_CONCURRENCY = 8
51
52ADDED_TOKENS_FILE = 'added_tokens.json'
53FAST_TOKENIZER_FILE = 'tokenizer.json'
54
55#
56# data types
57#
58
59
60@dataclass(frozen=True)
61class DataType:
62 name: str
63 dtype: np.dtype[Any]
64 valid_conversions: list[str]
65
66 def elements_to_bytes(self, n_elements: int) -> int:
67 return n_elements * self.dtype.itemsize
68
69
70@dataclass(frozen=True)
71class UnquantizedDataType(DataType):
72 pass
73
74
75DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
76DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int32), valid_conversions = [])
78DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
79
80
81@dataclass(frozen=True)
82class QuantizedDataType(DataType):
83 block_size: int
84 quantized_dtype: np.dtype[Any]
85 ggml_type: gguf.GGMLQuantizationType
86
87 def quantize(self, arr: NDArray) -> NDArray:
88 raise NotImplementedError(f'Quantization for {self.name} not implemented')
89
90 def elements_to_bytes(self, n_elements: int) -> int:
91 assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
92 return self.quantized_dtype.itemsize * (n_elements // self.block_size)
93
94
95@dataclass(frozen=True)
96class Q8_0QuantizedDataType(QuantizedDataType):
97 # Mini Q8_0 quantization in Python!
98 def quantize(self, arr: NDArray) -> NDArray:
99 assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
100 assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
101 n_blocks = arr.size // self.block_size
102 blocks = arr.reshape((n_blocks, self.block_size))
103 # Much faster implementation of block quantization contributed by @Cebtenzzre
104
105 def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
106 d = abs(blocks).max(axis = 1) / np.float32(127)
107 with np.errstate(divide = 'ignore'):
108 qs = (blocks / d[:, None]).round()
109 qs[d == 0] = 0
110 yield from zip(d, qs)
111 return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
112
113
114DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
115 dtype = np.dtype(np.float32), valid_conversions = [],
116 ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
117 quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
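# Each Q8_0 block stores 32 weights as one float16 scale 'd' plus 32 int8 quants 'qs'
# (34 bytes per 32 weights, vs 64 bytes in F16). Illustrative sketch, not executed here:
#
#   x = np.arange(64, dtype=np.float32)
#   packed = DT_Q8_0.quantize(x)              # two blocks of 32 values each
#   assert packed.shape == (2,) and packed.dtype == DT_Q8_0.quantized_dtype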
118
119# Quantized types skipped here because they may also map to np.float32
120NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
121for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
122 if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
123 raise ValueError(f'Invalid duplicate data type {dt}')
124 NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
125
126SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
127 'BF16': DT_BF16,
128 'F16': DT_F16,
129 'F32': DT_F32,
130 'I32': DT_I32,
131}
132
133# TODO: match this with `llama_ftype`
134# TODO: rename to LLAMAFileType
135# TODO: move to `gguf.py`
136
137
138class GGMLFileType(enum.IntEnum):
139 AllF32 = 0
140 MostlyF16 = 1 # except 1d tensors
141 MostlyQ8_0 = 7 # except 1d tensors
142
143 def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
144 dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
145 if dt is None:
146 raise ValueError(self)
        # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the output tensors are F32.
        # Also, the 1D tensors aren't much of a performance/size issue, so instead of maintaining separate F32 and F16 implementations of both, just convert everything to F32 for now.
149 return dt if len(tensor.shape) > 1 else DT_F32
150
151
152GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
153 GGMLFileType.AllF32 : DT_F32,
154 GGMLFileType.MostlyF16 : DT_F16,
155 GGMLFileType.MostlyQ8_0: DT_Q8_0,
156}
157
158#
159# hparams loading
160#
161
162
163@dataclass
164class Params:
165 n_vocab: int
166 n_embd: int
167 n_layer: int
168 n_ctx: int
169 n_ff: int
170 n_head: int
171 n_head_kv: int
172 n_experts: int | None = None
173 n_experts_used: int | None = None
174 f_norm_eps: float | None = None
175
176 rope_scaling_type: gguf.RopeScalingType | None = None
177 f_rope_freq_base: float | None = None
178 f_rope_scale: float | None = None
179 n_ctx_orig: int | None = None
180 rope_finetuned: bool | None = None
181
182 ftype: GGMLFileType | None = None
183
184 # path to the directory containing the model files
185 path_model: Path | None = None
186
187 @staticmethod
188 def guessed(model: LazyModel) -> Params:
189 # try transformer naming first
190 n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
191
192 # try transformer naming first
193 if "model.layers.0.self_attn.q_proj.weight" in model:
194 n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
195 elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
196 n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
197 else:
198 n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
199
200 if n_layer < 1:
201 msg = """\
                failed to guess 'n_layer'. This model is unknown or unsupported.
                Suggestion: provide the model's 'config.json' in the directory containing the model files."""
204 raise KeyError(textwrap.dedent(msg))
205
206 n_head = n_embd // 128 # guessed
207 n_mult = 256 # guessed
208
209 # TODO: verify this
210 n_ff = int(2 * (4 * n_embd) / 3)
211 n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
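        # e.g. with n_embd = 4096 (LLaMA-7B): int(2 * 16384 / 3) = 10922, which rounds
        # up to the next multiple of 256, giving n_ff = 11008 as in the released model.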
212
213 return Params(
214 n_vocab = n_vocab,
215 n_embd = n_embd,
216 n_layer = n_layer,
217 n_ctx = -1,
218 n_ff = n_ff,
219 n_head = n_head,
220 n_head_kv = n_head,
221 f_norm_eps = 1e-5,
222 )
223
224 @staticmethod
225 def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
226 with open(config_path) as f:
227 config = json.load(f)
228
229 rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
230 rope_scaling = config.get("rope_scaling")
231
232 if rope_scaling is not None and (typ := rope_scaling.get("type")):
233 rope_factor = rope_scaling.get("factor")
234 f_rope_scale = rope_factor
235 if typ == "linear":
236 rope_scaling_type = gguf.RopeScalingType.LINEAR
237 elif typ == "yarn":
238 rope_scaling_type = gguf.RopeScalingType.YARN
239 n_ctx_orig = rope_scaling['original_max_position_embeddings']
240 rope_finetuned = rope_scaling['finetuned']
241 else:
242 raise NotImplementedError(f'Unknown rope scaling type: {typ}')
243
244 if "max_sequence_length" in config:
245 n_ctx = config["max_sequence_length"]
246 elif "max_position_embeddings" in config:
247 n_ctx = config["max_position_embeddings"]
248 else:
249 msg = """\
                failed to guess 'n_ctx'. This model is unknown or unsupported.
                Suggestion: provide the model's 'config.json' in the directory containing the model files."""
252 raise KeyError(textwrap.dedent(msg))
253
254 n_experts = None
255 n_experts_used = None
256
257 if "num_local_experts" in config:
258 n_experts = config["num_local_experts"]
259 n_experts_used = config["num_experts_per_tok"]
260
261 return Params(
262 n_vocab = config["vocab_size"],
263 n_embd = config["hidden_size"],
264 n_layer = config["num_hidden_layers"],
265 n_ctx = n_ctx,
266 n_ff = config["intermediate_size"],
267 n_head = (n_head := config["num_attention_heads"]),
268 n_head_kv = config.get("num_key_value_heads", n_head),
269 n_experts = n_experts,
270 n_experts_used = n_experts_used,
271 f_norm_eps = config["rms_norm_eps"],
272 f_rope_freq_base = config.get("rope_theta"),
273 rope_scaling_type = rope_scaling_type,
274 f_rope_scale = f_rope_scale,
275 n_ctx_orig = n_ctx_orig,
276 rope_finetuned = rope_finetuned,
277 )
278
279 # LLaMA v2 70B params.json
280 # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
281 @staticmethod
282 def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
283 with open(config_path) as f:
284 config = json.load(f)
285
286 n_experts = None
287 n_experts_used = None
288 f_rope_freq_base = None
289 n_ff = None
290
291 # hack to determine LLaMA v1 vs v2 vs CodeLlama
292 if config.get("moe"):
293 # Mixtral
294 n_ctx = 32768
295 elif config.get("rope_theta") == 1000000:
296 # CodeLlama
297 n_ctx = 16384
298 elif config["norm_eps"] == 1e-05:
299 # LLaMA v2
300 n_ctx = 4096
301 else:
302 # LLaMA v1
303 n_ctx = 2048
304
305 if "layers.0.feed_forward.w1.weight" in model:
306 n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
307
308 if config.get("moe"):
309 n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
310 n_experts = config["moe"]["num_experts"]
311 n_experts_used = config["moe"]["num_experts_per_tok"]
312 f_rope_freq_base = 1e6
313
314 assert n_ff is not None
315
316 return Params(
317 n_vocab = model["tok_embeddings.weight"].shape[0],
318 n_embd = config["dim"],
319 n_layer = config["n_layers"],
320 n_ctx = n_ctx,
321 n_ff = n_ff,
322 n_head = (n_head := config["n_heads"]),
323 n_head_kv = config.get("n_kv_heads", n_head),
324 n_experts = n_experts,
325 n_experts_used = n_experts_used,
326 f_norm_eps = config["norm_eps"],
327 f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
328 )
329
330 @staticmethod
331 def load(model_plus: ModelPlus) -> Params:
332 hf_config_path = model_plus.paths[0].parent / "config.json"
333 orig_config_path = model_plus.paths[0].parent / "params.json"
334
335 if hf_config_path.exists():
336 params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
337 elif orig_config_path.exists():
338 params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
339 elif model_plus.format != 'none':
340 params = Params.guessed(model_plus.model)
341 else:
342 raise ValueError('Cannot guess params when model format is none')
343
344 params.path_model = model_plus.paths[0].parent
345
346 return params
347
348
349#
350# data loading
351# TODO: reuse (probably move to gguf.py?)
352#
353
354
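# HF checkpoints store each attention head's Q/K rows as two contiguous halves
# (the rotary "half-split" layout); llama.cpp expects the original interleaved
# layout, so `permute` regroups the rows of every head without changing the shape.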
355def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
356 if n_head_kv is not None and n_head != n_head_kv:
357 n_head = n_head_kv
358 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
359 .swapaxes(1, 2)
360 .reshape(weights.shape))
361
362
363class Tensor(ABC):
364 ndarray: NDArray
365 data_type: DataType
366
367 @abstractmethod
368 def astype(self, data_type: DataType) -> Self: ...
369 @abstractmethod
370 def permute(self, n_head: int, n_head_kv: int) -> Self: ...
371 @abstractmethod
372 def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
373 @abstractmethod
374 def part(self, n_part: int) -> Self: ...
375 @abstractmethod
376 def to_ggml(self) -> GGMLCompatibleTensor: ...
377
378
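# bfloat16 is the upper 16 bits of an IEEE float32, so widening to uint32 and
# shifting left by 16 reconstructs the exact float32 value
# (e.g. the bf16 bit pattern 0x3F80 becomes 0x3F800000 == np.float32(1.0)).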
379def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
380 assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
381 fp32_arr = bf16_arr.astype(np.uint32) << 16
382 return fp32_arr.view(np.float32)
383
384
385class UnquantizedTensor(Tensor):
386 def __init__(self, ndarray: NDArray):
387 assert isinstance(ndarray, np.ndarray)
388 self.ndarray = ndarray
389 self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
390
391 def astype(self, data_type: DataType) -> UnquantizedTensor:
392 dtype = data_type.dtype
393 if self.data_type == DT_BF16:
394 self.ndarray = bf16_to_fp32(self.ndarray)
395 return UnquantizedTensor(self.ndarray.astype(dtype))
396
397 def to_ggml(self) -> Self:
398 return self
399
400 def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
401 r = self.ndarray.shape[0] // 3
402 return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
403
404 def part(self, n_part: int) -> UnquantizedTensor:
405 r = self.ndarray.shape[0] // 3
406 return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
407
408 def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
409 return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
410
411
412def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
413 tensor = lazy_tensor.load()
414 assert isinstance(tensor, UnquantizedTensor)
415
416 # double-check:
417 actual_shape = list(tensor.ndarray.shape)
418 assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
419 if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
420 if convert:
421 tensor.ndarray = tensor.ndarray.astype(expected_dtype)
422 else:
423 raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
424
425 return tensor.ndarray
426
427
428GGMLCompatibleTensor = UnquantizedTensor
429
430
431@dataclass
432class LazyTensor:
433 _load: Callable[[], Tensor]
434 shape: list[int]
435 data_type: DataType
436 description: str
437
438 def load(self) -> Tensor:
439 ret = self._load()
440 # Should be okay if it maps to the same numpy type?
441 assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
442 (self.data_type, ret.data_type, self.description)
443 return ret
444
445 def astype(self, data_type: DataType) -> LazyTensor:
446 self.validate_conversion_to(data_type)
447
448 def load() -> Tensor:
449 return self.load().astype(data_type)
450 return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
451
452 def validate_conversion_to(self, data_type: DataType) -> None:
453 if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
454 raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
455
456
457LazyModel: TypeAlias = 'dict[str, LazyTensor]'
458
459ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
460
461@dataclass
462class ModelPlus:
463 model: LazyModel
464 paths: list[Path] # Where this was read from.
465 format: ModelFormat
466 vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
467
468
469def merge_sharded(models: list[LazyModel]) -> LazyModel:
470 # Original LLaMA models have each file contain one part of each tensor.
471 # Use a dict instead of a set to preserve order.
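    # e.g. two shards that each hold a (2048, 4096) slice of a row-split weight are
    # concatenated along axis 0 into (4096, 4096); the column-split tensors named
    # below (tok_embeddings / wo / w2) are concatenated along axis 1 instead.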
472 names = {name: None for model in models for name in model}
473
474 def convert(name: str) -> LazyTensor:
475 lazy_tensors = [model[name] for model in models]
476 if len(lazy_tensors) == 1:
477 # only one file; don't go through this procedure since there might
478 # be quantized tensors
479 return lazy_tensors[0]
480 if len(lazy_tensors[0].shape) == 1:
481 # the tensor is just duplicated in every file
482 return lazy_tensors[0]
483 if name.startswith('tok_embeddings.') or \
484 name.endswith('.attention.wo.weight') or \
485 name.endswith('.feed_forward.w2.weight'):
486 # split by columns
487 axis = 1
488 else:
489 # split by rows
490 axis = 0
491 concatenated_shape = list(lazy_tensors[0].shape)
492 concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
493
494 def load() -> UnquantizedTensor:
495 ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
496 concatenated = np.concatenate(ndarrays, axis=axis)
497 return UnquantizedTensor(concatenated)
498 description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
499 return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
500 return {name: convert(name) for name in names}
501
502
503def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
504 formats: set[ModelFormat] = set(mp.format for mp in models_plus)
505 assert len(formats) == 1, "different formats?"
506 format = formats.pop()
507 paths = [path for mp in models_plus for path in mp.paths]
508 # Use the first non-None vocab, if any.
509 try:
510 vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
511 except StopIteration:
512 vocab = None
513
514 if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
515 # Transformers models put different tensors in different files, but
516 # don't split individual tensors between files.
517 model: LazyModel = {}
518 for mp in models_plus:
519 model.update(mp.model)
520 else:
521 model = merge_sharded([mp.model for mp in models_plus])
522
523 return ModelPlus(model, paths, format, vocab)
524
525
526def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
527 def load() -> Tensor:
528 return lazy_tensor.load().permute(n_head, n_head_kv)
529 return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
530
531
532def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
533 def load() -> Tensor:
534 return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
535 s = lazy_tensor.shape.copy()
536 s[0] = s[0] // 3
537 return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
538
539
540def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
541 def load() -> Tensor:
542 return lazy_tensor.load().part(n_part)
543 s = lazy_tensor.shape.copy()
544 s[0] = s[0] // 3
545 return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
546
547
548def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
549 def load() -> Tensor:
550 tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
551 return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
552 s = lazy_tensors[0].shape.copy()
553 s.insert(0, len(lazy_tensors))
554 return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
555
556
557# Functionality that simulates `torch.load` but where individual tensors are
558# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of the time of writing:
560# - https://github.com/pytorch/pytorch/issues/64327
561# This allows us to de-shard without multiplying RAM usage, and also
562# conveniently drops the PyTorch dependency (though we still need numpy).
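# The unpickler below intercepts torch storage objects: `find_class` maps the torch
# storage classes to LazyStorageKind markers, `persistent_load` turns each storage
# reference into a LazyStorage whose `load` reads the raw bytes out of the zip on
# demand, and `lazy_rebuild_tensor_v2` wraps that storage into a LazyTensor.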
563
564
565@dataclass
566class LazyStorageKind:
567 data_type: DataType
568
569
570@dataclass
571class LazyStorage:
572 load: Callable[[int, int], NDArray]
573 kind: LazyStorageKind
574 description: str
575
576
577class LazyUnpickler(pickle.Unpickler):
578 def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
579 super().__init__(fp)
580 self.data_base_path = data_base_path
581 self.zip_file = zip_file
582
583 def persistent_load(self, pid: Any) -> Any:
584 assert pid[0] == 'storage'
585 assert isinstance(pid[1], LazyStorageKind)
586 data_type = pid[1].data_type
587 filename_stem = pid[2]
588 filename = f'{self.data_base_path}/{filename_stem}'
589 info = self.zip_file.getinfo(filename)
590
591 def load(offset: int, elm_count: int) -> NDArray:
592 dtype = data_type.dtype
593 with self.zip_file.open(info) as fp:
594 fp.seek(offset * dtype.itemsize)
595 size = elm_count * dtype.itemsize
596 data = fp.read(size)
597 assert len(data) == size
598 return np.frombuffer(data, dtype)
599 description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
600 return LazyStorage(load=load, kind=pid[1], description=description)
601
602 @staticmethod
603 def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
604 requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
605 assert isinstance(storage, LazyStorage)
606
607 def load() -> UnquantizedTensor:
608 elm_count = stride[0] * size[0]
609 return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
610 description = f'pickled storage_offset={storage_offset} in {storage.description}'
611 return LazyTensor(load, list(size), storage.kind.data_type, description)
612
613 @staticmethod
614 def rebuild_from_type_v2(func, new_type, args, state):
615 return func(*args)
616
617 CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
618 # getattr used here as a workaround for mypy not being smart enough to determine
619 # the staticmethods have a __func__ attribute.
620 ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
621 ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
622 ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
623 ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
624 ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
625 ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
626 ('torch', 'Tensor'): LazyTensor,
627 }
628
629 def find_class(self, module: str, name: str) -> Any:
630 if not module.startswith('torch'):
631 return super().find_class(module, name)
632 return self.CLASSES[(module, name)]
633
634
635def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
636 zf = zipfile.ZipFile(outer_fp)
637 pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
638 assert len(pickle_paths) == 1, pickle_paths
639 pickle_fp = zf.open(pickle_paths[0], 'r')
640 unpickler = LazyUnpickler(pickle_fp,
641 data_base_path=pickle_paths[0][:-4],
642 zip_file=zf)
643 model = unpickler.load()
    if 'model' in model:
        model = model['model']
645 as_dict = dict(model.items())
646 return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
647
648
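# A .safetensors file is laid out as: an 8-byte little-endian header length, a JSON
# header mapping tensor names to {dtype, shape, data_offsets}, then the raw tensor
# bytes; data_offsets are relative to the end of the header, hence the slice below.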
649def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
650 header_size, = struct.unpack('<Q', fp.read(8))
651 header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
652 # Use mmap for the actual data to avoid race conditions with the file offset.
653 mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
654 byte_buf = mapped[8 + header_size:]
655
656 def convert(info: dict[str, Any]) -> LazyTensor:
657 data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
658 numpy_dtype = data_type.dtype
659 shape: list[int] = info['shape']
660 begin, end = info['data_offsets']
661 assert 0 <= begin <= end <= len(byte_buf)
662 assert end - begin == math.prod(shape) * numpy_dtype.itemsize
663 buf = byte_buf[begin:end]
664
665 def load() -> UnquantizedTensor:
666 return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
667 description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
668 return LazyTensor(load, shape, data_type, description)
669 model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
670 return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
671
672
673def must_read(fp: IO[bytes], length: int) -> bytes:
674 ret = fp.read(length)
675 if len(ret) < length:
676 raise EOFError("unexpectedly reached end of file")
677 return ret
678
679
680@functools.lru_cache(maxsize=None)
681def lazy_load_file(path: Path) -> ModelPlus:
682 fp = open(path, 'rb')
683 first8 = fp.read(8)
684 fp.seek(0)
685 if first8[:2] == b'PK':
686 # A zip file, i.e. PyTorch format
687 return lazy_load_torch_file(fp, path)
688 elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
689 # Probably safetensors
690 return lazy_load_safetensors_file(fp, path)
691 else:
692 raise ValueError(f"unknown format: {path}")
693
694
695In = TypeVar('In')
696Out = TypeVar('Out')
697
698
699def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
700 '''Parallel map, but with backpressure. If the caller doesn't call `next`
701 fast enough, this will stop calling `func` at some point rather than
702 letting results pile up in memory. Specifically, there is a max of one
703 output value buffered per thread.'''
    if concurrency < 2:
        yield from map(func, iterable)
        return
707 iterable = iter(iterable)
708 executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
709 if use_processpool_executor:
710 executor_class = ProcessPoolExecutor
711 else:
712 executor_class = ThreadPoolExecutor
713 with executor_class(max_workers=max_workers) as executor:
714 futures: list[concurrent.futures.Future[Out]] = []
715 done = False
716 for _ in range(concurrency):
717 try:
718 futures.append(executor.submit(func, next(iterable)))
719 except StopIteration:
720 done = True
721 break
722
723 while futures:
724 result = futures.pop(0).result()
725 while not done and len(futures) < concurrency:
726 try:
727 futures.append(executor.submit(func, next(iterable)))
728 except StopIteration:
729 done = True
730 break
731 yield result
732
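# Illustrative usage (hypothetical `process`, `items` and `write` names): at most
# `concurrency` tasks are in flight at once, so results cannot pile up faster than
# the consumer drains them:
#
#   for result in bounded_parallel_map(process, items, concurrency=DEFAULT_CONCURRENCY):
#       write(result)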
733
734def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
735 # Handle special case where the model's vocab size is not set
736 if params.n_vocab == -1:
737 raise ValueError(
738 "The model's vocab size is set to -1 in params.json. Please update it manually."
739 + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
740 )
741 if not isinstance(vocab, Vocab):
742 return # model has no vocab
743
744 # Check for a vocab size mismatch
745 if params.n_vocab == vocab.vocab_size:
746 logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
747 return
748
749 if pad_vocab and params.n_vocab > vocab.vocab_size:
750 pad_count = params.n_vocab - vocab.vocab_size
751 logger.debug(
752 f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
753 )
754 for i in range(1, pad_count + 1):
755 vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
756 vocab.added_tokens_list.append(f"<dummy{i:05}>")
757 vocab.vocab_size = params.n_vocab
758 return
759
760 msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
761 if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
762 msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
763 if vocab.vocab_size < params.n_vocab:
764 msg += " Add the --pad-vocab option and try again."
765
766 raise ValueError(msg)
767
768
769class OutputFile:
770 def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
771 self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
772
773 def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
        # Metadata About The Model And Its Provenance
775 name = "LLaMA"
776 if metadata is not None and metadata.name is not None:
777 name = metadata.name
778 elif params.path_model is not None:
779 name = params.path_model.name
780 elif params.n_ctx == 4096:
781 # Heuristic detection of LLaMA v2 model
782 name = "LLaMA v2"
783
784 self.gguf.add_name(name)
785
786 if metadata is not None:
787 if metadata.author is not None:
788 self.gguf.add_author(metadata.author)
789 if metadata.version is not None:
790 self.gguf.add_version(metadata.version)
791 if metadata.organization is not None:
792 self.gguf.add_organization(metadata.organization)
793
794 if metadata.finetune is not None:
795 self.gguf.add_finetune(metadata.finetune)
796 if metadata.basename is not None:
797 self.gguf.add_basename(metadata.basename)
798
799 if metadata.description is not None:
800 self.gguf.add_description(metadata.description)
801 if metadata.quantized_by is not None:
802 self.gguf.add_quantized_by(metadata.quantized_by)
803
804 if metadata.size_label is not None:
805 self.gguf.add_size_label(metadata.size_label)
806
807 if metadata.license is not None:
808 self.gguf.add_license(metadata.license)
809 if metadata.license_name is not None:
810 self.gguf.add_license_name(metadata.license_name)
811 if metadata.license_link is not None:
812 self.gguf.add_license_link(metadata.license_link)
813
814 if metadata.url is not None:
815 self.gguf.add_url(metadata.url)
816 if metadata.doi is not None:
817 self.gguf.add_doi(metadata.doi)
818 if metadata.uuid is not None:
819 self.gguf.add_uuid(metadata.uuid)
820 if metadata.repo_url is not None:
821 self.gguf.add_repo_url(metadata.repo_url)
822
823 if metadata.source_url is not None:
824 self.gguf.add_source_url(metadata.source_url)
825 if metadata.source_doi is not None:
826 self.gguf.add_source_doi(metadata.source_doi)
827 if metadata.source_uuid is not None:
828 self.gguf.add_source_uuid(metadata.source_uuid)
829 if metadata.source_repo_url is not None:
830 self.gguf.add_source_repo_url(metadata.source_repo_url)
831
832 if metadata.base_models is not None:
833 self.gguf.add_base_model_count(len(metadata.base_models))
834 for key, base_model_entry in enumerate(metadata.base_models):
835 if "name" in base_model_entry:
836 self.gguf.add_base_model_name(key, base_model_entry["name"])
837 if "author" in base_model_entry:
838 self.gguf.add_base_model_author(key, base_model_entry["author"])
839 if "version" in base_model_entry:
840 self.gguf.add_base_model_version(key, base_model_entry["version"])
841 if "organization" in base_model_entry:
842 self.gguf.add_base_model_organization(key, base_model_entry["organization"])
843 if "description" in base_model_entry:
844 self.gguf.add_base_model_description(key, base_model_entry["description"])
845 if "url" in base_model_entry:
846 self.gguf.add_base_model_url(key, base_model_entry["url"])
847 if "doi" in base_model_entry:
848 self.gguf.add_base_model_doi(key, base_model_entry["doi"])
849 if "uuid" in base_model_entry:
850 self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
851 if "repo_url" in base_model_entry:
852 self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
853
854 if metadata.datasets is not None:
855 self.gguf.add_dataset_count(len(metadata.datasets))
856 for key, dataset_entry in enumerate(metadata.datasets):
857 if "name" in dataset_entry:
858 self.gguf.add_dataset_name(key, dataset_entry["name"])
859 if "author" in dataset_entry:
860 self.gguf.add_dataset_author(key, dataset_entry["author"])
861 if "version" in dataset_entry:
862 self.gguf.add_dataset_version(key, dataset_entry["version"])
863 if "organization" in dataset_entry:
864 self.gguf.add_dataset_organization(key, dataset_entry["organization"])
865 if "description" in dataset_entry:
866 self.gguf.add_dataset_description(key, dataset_entry["description"])
867 if "url" in dataset_entry:
868 self.gguf.add_dataset_url(key, dataset_entry["url"])
869 if "doi" in dataset_entry:
870 self.gguf.add_dataset_doi(key, dataset_entry["doi"])
871 if "uuid" in dataset_entry:
872 self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
873 if "repo_url" in dataset_entry:
874 self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
875
876 if metadata.tags is not None:
877 self.gguf.add_tags(metadata.tags)
878 if metadata.languages is not None:
879 self.gguf.add_languages(metadata.languages)
880
881 def add_meta_arch(self, params: Params) -> None:
882 # Metadata About The Neural Architecture Itself
883 self.gguf.add_vocab_size(params.n_vocab)
884 self.gguf.add_context_length(params.n_ctx)
885 self.gguf.add_embedding_length(params.n_embd)
886 self.gguf.add_block_count(params.n_layer)
887 self.gguf.add_feed_forward_length(params.n_ff)
888 self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
889 self.gguf.add_head_count (params.n_head)
890 self.gguf.add_head_count_kv (params.n_head_kv)
891
892 if params.n_experts:
893 self.gguf.add_expert_count(params.n_experts)
894
895 if params.n_experts_used:
896 self.gguf.add_expert_used_count(params.n_experts_used)
897
898 if params.f_norm_eps:
899 self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
900 else:
901 raise ValueError('f_norm_eps is None')
902
903 if params.f_rope_freq_base is not None:
904 self.gguf.add_rope_freq_base(params.f_rope_freq_base)
905
906 if params.rope_scaling_type:
907 assert params.f_rope_scale is not None
908 self.gguf.add_rope_scaling_type(params.rope_scaling_type)
909 self.gguf.add_rope_scaling_factor(params.f_rope_scale)
910
911 if params.n_ctx_orig is not None:
912 self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
913
914 if params.rope_finetuned is not None:
915 self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
916
917 if params.ftype is not None:
918 self.gguf.add_file_type(params.ftype)
919
920 def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
921 tokens = []
922 scores = []
923 toktypes = []
924
925 # NOTE: `all_tokens` returns the base vocabulary and added tokens
926 for text, score, toktype in vocab.all_tokens():
927 tokens.append(text)
928 scores.append(score)
929 toktypes.append(toktype)
930
931 assert len(tokens) == vocab.vocab_size
932
933 return tokens, scores, toktypes
934
935 def add_meta_vocab(self, vocab: Vocab) -> None:
936 # Ensure that tokenizer_model is added to the GGUF model
937 self.gguf.add_tokenizer_model(vocab.tokenizer_model)
938
939 # Extract model vocabulary for model conversion
940 tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
941
942 # Add extracted token information for model conversion
943 self.gguf.add_token_list(tokens)
944 self.gguf.add_token_scores(scores)
945 self.gguf.add_token_types(toktypes)
946
947 def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
948 svocab.add_to_gguf(self.gguf)
949
950 def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
951 n_elements = int(np.prod(tensor.shape))
952 raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
        data_type = getattr(tensor.data_type, 'quantized_dtype', None) or tensor.data_type.dtype
954 data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
955 self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
956
957 def write_meta(self) -> None:
958 self.gguf.write_header_to_file()
959 self.gguf.write_kv_data_to_file()
960
961 def write_tensor_info(self) -> None:
962 self.gguf.write_ti_data_to_file()
963
964 def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
965 ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
966 if ftype == GGMLFileType.MostlyQ8_0:
967 ndarrays = bounded_parallel_map(
968 OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
969 use_processpool_executor=True,
970 )
971 else:
972 ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
973
974 start = time.time()
975 for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
976 elapsed = time.time() - start
977 size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
978 padi = len(str(len(model)))
979 logger.info(
980 f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
981 )
982 self.gguf.write_tensor_data(ndarray)
983
984 def close(self) -> None:
985 self.gguf.close()
986
987 @staticmethod
988 def write_vocab_only(
989 fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
990 endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
991 ) -> None:
992 check_vocab_size(params, vocab, pad_vocab=pad_vocab)
993
994 of = OutputFile(fname_out, endianess=endianess)
995
996 # meta data
997 of.add_meta_model(params, metadata)
998 of.add_meta_arch(params)
999 of.add_meta_vocab(vocab)
1000 of.add_meta_special_vocab(svocab)
1001
1002 of.write_meta()
1003
1004 of.close()
1005
1006 @staticmethod
1007 def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
1008 name, lazy_tensor = item
1009 tensor = lazy_tensor.load().to_ggml()
1010 return (lazy_tensor.data_type, tensor.ndarray)
1011
1012 @staticmethod
1013 def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
1014 dt, arr = item
1015 if not isinstance(dt, QuantizedDataType):
1016 return arr
1017 return dt.quantize(arr)
1018
1019 @staticmethod
1020 def write_all(
1021 fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
1022 concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
1023 pad_vocab: bool = False,
1024 metadata: gguf.Metadata | None = None,
1025 ) -> None:
1026 check_vocab_size(params, vocab, pad_vocab=pad_vocab)
1027
1028 of = OutputFile(fname_out, endianess=endianess)
1029
1030 # meta data
1031 of.add_meta_model(params, metadata)
1032 of.add_meta_arch(params)
1033 if isinstance(vocab, Vocab):
1034 of.add_meta_vocab(vocab)
1035 of.add_meta_special_vocab(svocab)
1036 else: # NoVocab
1037 of.gguf.add_tokenizer_model(vocab.tokenizer_model)
1038
1039 # tensor info
1040 for name, lazy_tensor in model.items():
1041 of.add_tensor_info(name, lazy_tensor)
1042
1043 of.write_meta()
1044 of.write_tensor_info()
1045
1046 # tensor data
1047 of.write_tensor_data(ftype, model, concurrency)
1048
1049 of.close()
1050
1051
1052def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
1053 wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
1054
1055 if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
1056 return GGMLFileType.AllF32
1057 if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
1058 return GGMLFileType.MostlyF16
1059 if output_type_str == "q8_0":
1060 return GGMLFileType.MostlyQ8_0
1061
1062 name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
1063
1064 raise ValueError(f"Unexpected combination of types: {name_to_type}")
1065
1066
1067def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
1068 total_params = 0
1069 shared_params = 0
1070 expert_params = 0
1071
1072 for name, lazy_tensor in tensors:
1073 # We don't need these
1074 if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
1075 continue
1076
1077 # Got A Tensor
1078 sum_weights_in_tensor: int = 1
1079
1080 # Tensor Volume
1081 for dim in lazy_tensor.shape:
1082 sum_weights_in_tensor *= dim
1083
1084 if ".experts." in name:
1085 if ".experts.0." in name:
1086 expert_params += sum_weights_in_tensor
1087 else:
1088 shared_params += sum_weights_in_tensor
1089
1090 total_params += sum_weights_in_tensor
1091
1092 return total_params, shared_params, expert_params
1093
1094
1095def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
1096 return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
1097 for (name, tensor) in model.items()}
1098
1099
1100def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
1101 tmap = gguf.TensorNameMap(ARCH, params.n_layer)
1102 should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
1103
    tmp = model  # NOTE: not a copy; the permute/unpack edits below mutate the same dict that is iterated further down
1105
1106 # merge experts into one tensor
1107 if params.n_experts and params.n_experts > 0:
1108 for i_l in range(params.n_layer):
1109 for w in range(1, 4):
1110 experts = []
1111 for e in range(params.n_experts):
1112 if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
1113 experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
1114 del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
1115 elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
1116 experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
1117 del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
1118 else:
1119 raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
1120 tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
1121
    # HF models permute or pack some of the tensors, so we need to undo that
1123 for i in itertools.count():
1124 if f"model.layers.{i}.self_attn.q_proj.weight" in model:
1125 logger.debug(f"Permuting layer {i}")
1126 tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
1127 tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
1128 # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
1129 elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
1130 logger.debug(f"Unpacking and permuting layer {i}")
1131 tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
1132 tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
1133 tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
1134 del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
1135 else:
1136 break
1137
1138 out: LazyModel = {}
1139 for name, lazy_tensor in model.items():
1140 tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
1141 if name_new is None:
1142 if skip_unknown:
1143 logger.warning(f"Unexpected tensor name: {name} - skipping")
1144 continue
1145 raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
1146
1147 if tensor_type in should_skip:
1148 logger.debug(f"skipping tensor {name_new}")
1149 continue
1150
1151 logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
1152 out[name_new] = lazy_tensor
1153
1154 return out
1155
1156
1157def nth_multifile_path(path: Path, n: int) -> Path | None:
1158 '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1159 the nth path in the model.
1160 '''
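    # e.g. nth_multifile_path(Path("consolidated.00.pth"), 1) returns
    # Path("consolidated.01.pth") when that file exists next to it.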
1161 # Support the following patterns:
1162 patterns = [
1163 # - x.00.pth, x.01.pth, etc.
1164 (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
1165 # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
1166 (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
        # - x.bin, x.bin.1, etc.
1168 (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
1169 ]
1170 for regex, replacement in patterns:
1171 if re.search(regex, path.name):
1172 new_path = path.with_name(re.sub(regex, replacement, path.name))
1173 if new_path.exists():
1174 return new_path
1175 return None
1176
1177
1178def find_multifile_paths(path: Path) -> list[Path]:
1179 '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1180 the whole list of paths in the model.
1181 '''
1182 ret: list[Path] = []
1183 for i in itertools.count():
1184 nth_path = nth_multifile_path(path, i)
1185 if nth_path is None:
1186 break
1187 ret.append(nth_path)
1188 if not ret:
1189 # No matches. This should only happen if the file was named, e.g.,
1190 # foo.0, and there was no file named foo. Oh well, try to process it
1191 # as a single file.
1192 return [path]
1193 return ret
1194
1195
1196def load_some_model(path: Path) -> ModelPlus:
1197 '''Load a model of any supported format.'''
1198 # Be extra-friendly and accept either a file or a directory:
1199 if path.is_dir():
1200 # Check if it's a set of safetensors files first
1201 globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
1202 files = [file for glob in globs for file in path.glob(glob)]
1203 if not files:
1204 # Try the PyTorch patterns too, with lower priority
1205 globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
1206 files = [file for glob in globs for file in path.glob(glob)]
1207 if not files:
1208 raise FileNotFoundError(f"Can't find model in directory {path}")
1209 if len(files) > 1:
1210 raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
1211 path = files[0]
1212
1213 paths = find_multifile_paths(path)
1214 models_plus: list[ModelPlus] = []
1215 for path in paths:
1216 logger.info(f"Loading model file {path}")
1217 models_plus.append(lazy_load_file(path))
1218
1219 model_plus = merge_multifile_models(models_plus)
1220 return model_plus
1221
1222
1223class VocabFactory:
1224 _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
1225
1226 def __init__(self, path: Path):
1227 self.path = path
1228
1229 def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
1230 load_merges = vocab.name == "bpe"
1231 n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
1232 return gguf.SpecialVocab(
1233 model_parent_path,
1234 load_merges=load_merges,
1235 special_token_types=None, # Predetermined or passed as a parameter
1236 n_vocab=n_vocab,
1237 )
1238
1239 def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
1240 vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
1241 selected_vocabs: dict[str, type[Vocab]] = {}
1242 for vtype in vocab_types:
1243 try:
1244 selected_vocabs[vtype] = vocab_classes[vtype]
1245 except KeyError:
1246 raise ValueError(f"Unsupported vocabulary type {vtype}") from None
1247
1248 for vtype, cls in selected_vocabs.items():
1249 try:
1250 vocab = cls(self.path)
1251 break
1252 except FileNotFoundError:
1253 pass # ignore unavailable tokenizers
1254 else:
1255 raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
1256
1257 logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
1258 return vocab
1259
1260 def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
1261 vocab: BaseVocab
1262 if vocab_types is None:
1263 vocab = NoVocab()
1264 else:
1265 vocab = self._create_vocab_by_path(vocab_types)
1266 # FIXME: Respect --vocab-dir?
1267 special_vocab = self._create_special_vocab(
1268 vocab,
1269 model_parent_path,
1270 )
1271 return vocab, special_vocab
1272
1273
1274def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
    name = metadata.name
    basename = metadata.basename
    finetune = metadata.finetune
    version = metadata.version
1279 size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
1280
1281 output_type = {
1282 GGMLFileType.AllF32: "F32",
1283 GGMLFileType.MostlyF16: "F16",
1284 GGMLFileType.MostlyQ8_0: "Q8_0",
1285 }[file_type]
1286
1287 return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
1288
1289
1290def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
1291 default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
1292 ret = model_paths[0].parent / f"{default_filename}.gguf"
1293 if ret in model_paths:
1294 logger.error(
1295 f"Error: Default output path ({ret}) would overwrite the input. "
1296 "Please explicitly specify a path using --outfile.")
1297 sys.exit(1)
1298 return ret
1299
1300
1301def do_dump_model(model_plus: ModelPlus) -> None:
1302 print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
1303 print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
1304 print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
1305 for name, lazy_tensor in model_plus.model.items():
1306 print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
1307
1308
1309def main(args_in: list[str] | None = None) -> None:
1310 output_choices = ["f32", "f16"]
1311 if np.uint32(1) == np.uint32(1).newbyteorder("<"):
1312 # We currently only support Q8_0 output on little endian systems.
1313 output_choices.append("q8_0")
1314 parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
1315 parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
1316 parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
1317 parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
1318 parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
1319 parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
1320 parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
1321 parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
1322 parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
1323 parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
1324 parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
1325 parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
1326 parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
1327 parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
1328 parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
1329 parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
1330 parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file")
1331 parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
1332 parser.add_argument("--model-name", type=str, default=None, help="name of the model")
1333
1334 args = parser.parse_args(args_in)
1335
1336 if args.verbose:
1337 logging.basicConfig(level=logging.DEBUG)
1338 elif args.dump_single or args.dump or args.get_outfile:
1339 # Avoid printing anything besides the dump output
1340 logging.basicConfig(level=logging.WARNING)
1341 else:
1342 logging.basicConfig(level=logging.INFO)
1343
1344 model_name = args.model_name
1345 dir_model = args.model
1346
1347 metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
1348
1349 if args.get_outfile:
1350 model_plus = load_some_model(dir_model)
1351 params = Params.load(model_plus)
1352 model = convert_model_names(model_plus.model, params, args.skip_unknown)
1353 model_params_count = per_model_weight_count_estimation(model_plus.model.items())
1354 ftype = pick_output_type(model, args.outtype)
1355
1356 if (metadata is None or metadata.name is None) and params.path_model is not None:
1357 metadata.name = params.path_model.name
1358
1359 print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
1360 return
1361
1362 if args.no_vocab and args.vocab_only:
1363 raise ValueError("--vocab-only does not make sense with --no-vocab")
1364
1365 if args.dump_single:
1366 model_plus = lazy_load_file(dir_model)
1367 do_dump_model(model_plus)
1368 return
1369
1370 if not args.vocab_only:
1371 model_plus = load_some_model(dir_model)
1372 else:
1373 model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
1374
1375 if args.dump:
1376 do_dump_model(model_plus)
1377 return
1378
1379 endianess = gguf.GGUFEndian.LITTLE
1380 if args.big_endian:
1381 endianess = gguf.GGUFEndian.BIG
1382
1383 params = None
1384 if args.pad_vocab or not args.vocab_only:
1385 params = Params.load(model_plus)
1386 if params.n_ctx == -1:
1387 if args.ctx is None:
1388 msg = """\
1389 The model doesn't have a context size, and you didn't specify one with --ctx
1390 Please specify one with --ctx:
1391 - LLaMA v1: --ctx 2048
1392 - LLaMA v2: --ctx 4096"""
1393 parser.error(textwrap.dedent(msg))
1394 params.n_ctx = args.ctx
1395
1396 if args.outtype:
1397 params.ftype = {
1398 "f32": GGMLFileType.AllF32,
1399 "f16": GGMLFileType.MostlyF16,
1400 "q8_0": GGMLFileType.MostlyQ8_0,
1401 }[args.outtype]
1402
1403 logger.info(f"params = {params}")
1404
1405 model_parent_path = model_plus.paths[0].parent
1406 vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
1407 vocab_factory = VocabFactory(vocab_path)
1408 vocab_types = None if args.no_vocab else args.vocab_type.split(",")
1409 vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
1410
1411 if args.vocab_only:
1412 assert isinstance(vocab, Vocab)
1413 if not args.outfile:
1414 raise ValueError("need --outfile if using --vocab-only")
1415 outfile = args.outfile
1416 if params is None:
1417 params = Params(
1418 n_vocab = vocab.vocab_size,
1419 n_embd = 1,
1420 n_layer = 1,
1421 n_ctx = 1,
1422 n_ff = 1,
1423 n_head = 1,
1424 n_head_kv = 1,
1425 f_norm_eps = 1e-5,
1426 )
1427 OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
1428 endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
1429 logger.info(f"Wrote {outfile}")
1430 return
1431
1432 if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
1433 vocab = model_plus.vocab
1434
1435 assert params is not None
1436
1437 if metadata.name is None and params.path_model is not None:
1438 metadata.name = params.path_model.name
1439
1440 model_params_count = per_model_weight_count_estimation(model_plus.model.items())
1441 logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
1442
1443 logger.info(f"Vocab info: {vocab}")
1444 logger.info(f"Special vocab info: {special_vocab}")
1445 model = model_plus.model
1446 model = convert_model_names(model, params, args.skip_unknown)
1447 ftype = pick_output_type(model, args.outtype)
1448 model = convert_to_output_type(model, ftype)
1449 outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
1450
1451 metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
1452
1453 params.ftype = ftype
1454 logger.info(f"Writing {outfile}, format {ftype}")
1455
1456 OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
1457 concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
1458 logger.info(f"Wrote {outfile}")
1459
1460
1461if __name__ == '__main__':
1462 main()