1#!/usr/bin/env python3
2from __future__ import annotations
3
import argparse
import concurrent.futures
import enum
import faulthandler
import functools
import itertools
import json
import logging
12import math
13import mmap
14import os
15import pickle
16import re
17import signal
18import struct
19import sys
20import textwrap
21import time
22import zipfile
23from abc import ABC, abstractmethod
24from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
25from dataclasses import dataclass
26from pathlib import Path
27from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
28
29import numpy as np
30
31if 'NO_LOCAL_GGUF' not in os.environ:
32 # use .parent.parent since we are in "examples" directory
33 sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
34
35import gguf
36from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
37
38if TYPE_CHECKING:
39 from typing_extensions import Self, TypeAlias
40
41logger = logging.getLogger("convert")
42
43if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
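    # Registering SIGUSR1 means `kill -USR1 <pid>` dumps the tracebacks of all Python
    # threads to stderr, which is useful when a long conversion appears to hang.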
44 faulthandler.register(signal.SIGUSR1)
45
46NDArray: TypeAlias = 'np.ndarray[Any, Any]'
47
48ARCH = gguf.MODEL_ARCH.LLAMA
49
50DEFAULT_CONCURRENCY = 8
51
52ADDED_TOKENS_FILE = 'added_tokens.json'
53FAST_TOKENIZER_FILE = 'tokenizer.json'
54
55#
56# data types
57#
58
59
60@dataclass(frozen=True)
61class DataType:
62 name: str
63 dtype: np.dtype[Any]
64 valid_conversions: list[str]
65
66 def elements_to_bytes(self, n_elements: int) -> int:
67 return n_elements * self.dtype.itemsize
68
69
70@dataclass(frozen=True)
71class UnquantizedDataType(DataType):
72 pass
73
74
75DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
76DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int32), valid_conversions = [])
78DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
79
80
81@dataclass(frozen=True)
82class QuantizedDataType(DataType):
83 block_size: int
84 quantized_dtype: np.dtype[Any]
85 ggml_type: gguf.GGMLQuantizationType
86
87 def quantize(self, arr: NDArray) -> NDArray:
88 raise NotImplementedError(f'Quantization for {self.name} not implemented')
89
90 def elements_to_bytes(self, n_elements: int) -> int:
91 assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
92 return self.quantized_dtype.itemsize * (n_elements // self.block_size)
93
94
95@dataclass(frozen=True)
96class Q8_0QuantizedDataType(QuantizedDataType):
97 # Mini Q8_0 quantization in Python!
98 def quantize(self, arr: NDArray) -> NDArray:
99 assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
100 assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
101 n_blocks = arr.size // self.block_size
102 blocks = arr.reshape((n_blocks, self.block_size))
103 # Much faster implementation of block quantization contributed by @Cebtenzzre
104
105 def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
106 d = abs(blocks).max(axis = 1) / np.float32(127)
107 with np.errstate(divide = 'ignore'):
108 qs = (blocks / d[:, None]).round()
109 qs[d == 0] = 0
110 yield from zip(d, qs)
111 return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
112
113
114DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
115 dtype = np.dtype(np.float32), valid_conversions = [],
116 ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
117 quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
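# Each Q8_0 block stores 32 weights as one float16 scale 'd' plus 32 int8 quants 'qs'
# (34 bytes per 32 weights, vs 64 bytes in F16). Illustrative sketch, not executed here:
#
#   x = np.arange(64, dtype=np.float32)
#   packed = DT_Q8_0.quantize(x)              # two blocks of 32 values each
#   assert packed.shape == (2,) and packed.dtype == DT_Q8_0.quantized_dtype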
118
119# Quantized types skipped here because they may also map to np.float32
120NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
121for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
122 if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
123 raise ValueError(f'Invalid duplicate data type {dt}')
124 NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
125
126SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
127 'BF16': DT_BF16,
128 'F16': DT_F16,
129 'F32': DT_F32,
130 'I32': DT_I32,
131}
132
133# TODO: match this with `llama_ftype`
134# TODO: rename to LLAMAFileType
135# TODO: move to `gguf.py`
136
137
138class GGMLFileType(enum.IntEnum):
139 AllF32 = 0
140 MostlyF16 = 1 # except 1d tensors
141 MostlyQ8_0 = 7 # except 1d tensors
142
143 def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
144 dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
145 if dt is None:
146 raise ValueError(self)
        # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the output tensors are F32.
        # Also, the 1D tensors aren't much of a performance/size issue, so instead of maintaining separate F32 and F16 implementations of both, just convert everything to F32 for now.
149 return dt if len(tensor.shape) > 1 else DT_F32
150
151
152GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
153 GGMLFileType.AllF32 : DT_F32,
154 GGMLFileType.MostlyF16 : DT_F16,
155 GGMLFileType.MostlyQ8_0: DT_Q8_0,
156}
157
158#
159# hparams loading
160#
161
162
163@dataclass
164class Params:
165 n_vocab: int
166 n_embd: int
167 n_layer: int
168 n_ctx: int
169 n_ff: int
170 n_head: int
171 n_head_kv: int
172 n_experts: int | None = None
173 n_experts_used: int | None = None
174 f_norm_eps: float | None = None
175
176 rope_scaling_type: gguf.RopeScalingType | None = None
177 f_rope_freq_base: float | None = None
178 f_rope_scale: float | None = None
179 n_ctx_orig: int | None = None
180 rope_finetuned: bool | None = None
181
182 ftype: GGMLFileType | None = None
183
184 # path to the directory containing the model files
185 path_model: Path | None = None
186
187 @staticmethod
188 def guessed(model: LazyModel) -> Params:
189 # try transformer naming first
190 n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
191
192 # try transformer naming first
193 if "model.layers.0.self_attn.q_proj.weight" in model:
194 n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
195 elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
196 n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
197 else:
198 n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
199
200 if n_layer < 1:
201 msg = """\
                failed to guess 'n_layer'. This model is unknown or unsupported.
                Suggestion: provide the model's 'config.json' in the directory containing the model files."""
204 raise KeyError(textwrap.dedent(msg))
205
206 n_head = n_embd // 128 # guessed
207 n_mult = 256 # guessed
208
209 # TODO: verify this
210 n_ff = int(2 * (4 * n_embd) / 3)
211 n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
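        # e.g. with n_embd = 4096 (LLaMA-7B): int(2 * 16384 / 3) = 10922, which rounds
        # up to the next multiple of 256, giving n_ff = 11008 as in the released model.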
212
213 return Params(
214 n_vocab = n_vocab,
215 n_embd = n_embd,
216 n_layer = n_layer,
217 n_ctx = -1,
218 n_ff = n_ff,
219 n_head = n_head,
220 n_head_kv = n_head,
221 f_norm_eps = 1e-5,
222 )
223
224 @staticmethod
225 def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
226 with open(config_path) as f:
227 config = json.load(f)
228
229 rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
230 rope_scaling = config.get("rope_scaling")
231
232 if rope_scaling is not None and (typ := rope_scaling.get("type")):
233 rope_factor = rope_scaling.get("factor")
234 f_rope_scale = rope_factor
235 if typ == "linear":
236 rope_scaling_type = gguf.RopeScalingType.LINEAR
237 elif typ == "yarn":
238 rope_scaling_type = gguf.RopeScalingType.YARN
239 n_ctx_orig = rope_scaling['original_max_position_embeddings']
240 rope_finetuned = rope_scaling['finetuned']
241 else:
242 raise NotImplementedError(f'Unknown rope scaling type: {typ}')
243
244 if "max_sequence_length" in config:
245 n_ctx = config["max_sequence_length"]
246 elif "max_position_embeddings" in config:
247 n_ctx = config["max_position_embeddings"]
248 else:
249 msg = """\
                failed to guess 'n_ctx'. This model is unknown or unsupported.
                Suggestion: provide the model's 'config.json' in the directory containing the model files."""
252 raise KeyError(textwrap.dedent(msg))
253
254 n_experts = None
255 n_experts_used = None
256
257 if "num_local_experts" in config:
258 n_experts = config["num_local_experts"]
259 n_experts_used = config["num_experts_per_tok"]
260
261 return Params(
262 n_vocab = config["vocab_size"],
263 n_embd = config["hidden_size"],
264 n_layer = config["num_hidden_layers"],
265 n_ctx = n_ctx,
266 n_ff = config["intermediate_size"],
267 n_head = (n_head := config["num_attention_heads"]),
268 n_head_kv = config.get("num_key_value_heads", n_head),
269 n_experts = n_experts,
270 n_experts_used = n_experts_used,
271 f_norm_eps = config["rms_norm_eps"],
272 f_rope_freq_base = config.get("rope_theta"),
273 rope_scaling_type = rope_scaling_type,
274 f_rope_scale = f_rope_scale,
275 n_ctx_orig = n_ctx_orig,
276 rope_finetuned = rope_finetuned,
277 )
278
279 # LLaMA v2 70B params.json
280 # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
281 @staticmethod
282 def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
283 with open(config_path) as f:
284 config = json.load(f)
285
286 n_experts = None
287 n_experts_used = None
288 f_rope_freq_base = None
289 n_ff = None
290
291 # hack to determine LLaMA v1 vs v2 vs CodeLlama
292 if config.get("moe"):
293 # Mixtral
294 n_ctx = 32768
295 elif config.get("rope_theta") == 1000000:
296 # CodeLlama
297 n_ctx = 16384
298 elif config["norm_eps"] == 1e-05:
299 # LLaMA v2
300 n_ctx = 4096
301 else:
302 # LLaMA v1
303 n_ctx = 2048
304
305 if "layers.0.feed_forward.w1.weight" in model:
306 n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
307
308 if config.get("moe"):
309 n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
310 n_experts = config["moe"]["num_experts"]
311 n_experts_used = config["moe"]["num_experts_per_tok"]
312 f_rope_freq_base = 1e6
313
314 assert n_ff is not None
315
316 return Params(
317 n_vocab = model["tok_embeddings.weight"].shape[0],
318 n_embd = config["dim"],
319 n_layer = config["n_layers"],
320 n_ctx = n_ctx,
321 n_ff = n_ff,
322 n_head = (n_head := config["n_heads"]),
323 n_head_kv = config.get("n_kv_heads", n_head),
324 n_experts = n_experts,
325 n_experts_used = n_experts_used,
326 f_norm_eps = config["norm_eps"],
327 f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
328 )
329
330 @staticmethod
331 def load(model_plus: ModelPlus) -> Params:
332 hf_config_path = model_plus.paths[0].parent / "config.json"
333 orig_config_path = model_plus.paths[0].parent / "params.json"
334
335 if hf_config_path.exists():
336 params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
337 elif orig_config_path.exists():
338 params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
339 elif model_plus.format != 'none':
340 params = Params.guessed(model_plus.model)
341 else:
342 raise ValueError('Cannot guess params when model format is none')
343
344 params.path_model = model_plus.paths[0].parent
345
346 return params
347
348
349#
350# data loading
351# TODO: reuse (probably move to gguf.py?)
352#
353
354
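# HF checkpoints store each attention head's Q/K rows as two contiguous halves
# (the rotary "half-split" layout); llama.cpp expects the original interleaved
# layout, so `permute` regroups the rows of every head without changing the shape.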
355def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
356 if n_head_kv is not None and n_head != n_head_kv:
357 n_head = n_head_kv
358 return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
359 .swapaxes(1, 2)
360 .reshape(weights.shape))
361
362
363class Tensor(ABC):
364 ndarray: NDArray
365 data_type: DataType
366
367 @abstractmethod
368 def astype(self, data_type: DataType) -> Self: ...
369 @abstractmethod
370 def permute(self, n_head: int, n_head_kv: int) -> Self: ...
371 @abstractmethod
372 def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
373 @abstractmethod
374 def part(self, n_part: int) -> Self: ...
375 @abstractmethod
376 def to_ggml(self) -> GGMLCompatibleTensor: ...
377
378
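# bfloat16 is the upper 16 bits of an IEEE float32, so widening to uint32 and
# shifting left by 16 reconstructs the exact float32 value
# (e.g. the bf16 bit pattern 0x3F80 becomes 0x3F800000 == np.float32(1.0)).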
379def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
380 assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
381 fp32_arr = bf16_arr.astype(np.uint32) << 16
382 return fp32_arr.view(np.float32)
383
384
385class UnquantizedTensor(Tensor):
386 def __init__(self, ndarray: NDArray):
387 assert isinstance(ndarray, np.ndarray)
388 self.ndarray = ndarray
389 self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
390
391 def astype(self, data_type: DataType) -> UnquantizedTensor:
392 dtype = data_type.dtype
393 if self.data_type == DT_BF16:
394 self.ndarray = bf16_to_fp32(self.ndarray)
395 return UnquantizedTensor(self.ndarray.astype(dtype))
396
397 def to_ggml(self) -> Self:
398 return self
399
400 def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
401 r = self.ndarray.shape[0] // 3
402 return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
403
404 def part(self, n_part: int) -> UnquantizedTensor:
405 r = self.ndarray.shape[0] // 3
406 return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
407
408 def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
409 return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
410
411
412def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
413 tensor = lazy_tensor.load()
414 assert isinstance(tensor, UnquantizedTensor)
415
416 # double-check:
417 actual_shape = list(tensor.ndarray.shape)
418 assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
419 if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
420 if convert:
421 tensor.ndarray = tensor.ndarray.astype(expected_dtype)
422 else:
423 raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
424
425 return tensor.ndarray
426
427
428GGMLCompatibleTensor = UnquantizedTensor
429
430
431@dataclass
432class LazyTensor:
433 _load: Callable[[], Tensor]
434 shape: list[int]
435 data_type: DataType
436 description: str
437
438 def load(self) -> Tensor:
439 ret = self._load()
440 # Should be okay if it maps to the same numpy type?
441 assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
442 (self.data_type, ret.data_type, self.description)
443 return ret
444
445 def astype(self, data_type: DataType) -> LazyTensor:
446 self.validate_conversion_to(data_type)
447
448 def load() -> Tensor:
449 return self.load().astype(data_type)
450 return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
451
452 def validate_conversion_to(self, data_type: DataType) -> None:
453 if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
454 raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
455
456
457LazyModel: TypeAlias = 'dict[str, LazyTensor]'
458
459ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
460
461@dataclass
462class ModelPlus:
463 model: LazyModel
464 paths: list[Path] # Where this was read from.
465 format: ModelFormat
466 vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
467
468
469def merge_sharded(models: list[LazyModel]) -> LazyModel:
470 # Original LLaMA models have each file contain one part of each tensor.
471 # Use a dict instead of a set to preserve order.
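    # e.g. two shards that each hold a (2048, 4096) slice of a row-split weight are
    # concatenated along axis 0 into (4096, 4096); the column-split tensors named
    # below (tok_embeddings / wo / w2) are concatenated along axis 1 instead.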
472 names = {name: None for model in models for name in model}
473
474 def convert(name: str) -> LazyTensor:
475 lazy_tensors = [model[name] for model in models]
476 if len(lazy_tensors) == 1:
477 # only one file; don't go through this procedure since there might
478 # be quantized tensors
479 return lazy_tensors[0]
480 if len(lazy_tensors[0].shape) == 1:
481 # the tensor is just duplicated in every file
482 return lazy_tensors[0]
483 if name.startswith('tok_embeddings.') or \
484 name.endswith('.attention.wo.weight') or \
485 name.endswith('.feed_forward.w2.weight'):
486 # split by columns
487 axis = 1
488 else:
489 # split by rows
490 axis = 0
491 concatenated_shape = list(lazy_tensors[0].shape)
492 concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
493
494 def load() -> UnquantizedTensor:
495 ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
496 concatenated = np.concatenate(ndarrays, axis=axis)
497 return UnquantizedTensor(concatenated)
498 description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
499 return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
500 return {name: convert(name) for name in names}
501
502
503def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
504 formats: set[ModelFormat] = set(mp.format for mp in models_plus)
505 assert len(formats) == 1, "different formats?"
506 format = formats.pop()
507 paths = [path for mp in models_plus for path in mp.paths]
508 # Use the first non-None vocab, if any.
509 try:
510 vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
511 except StopIteration:
512 vocab = None
513
514 if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
515 # Transformers models put different tensors in different files, but
516 # don't split individual tensors between files.
517 model: LazyModel = {}
518 for mp in models_plus:
519 model.update(mp.model)
520 else:
521 model = merge_sharded([mp.model for mp in models_plus])
522
523 return ModelPlus(model, paths, format, vocab)
524
525
526def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
527 def load() -> Tensor:
528 return lazy_tensor.load().permute(n_head, n_head_kv)
529 return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
530
531
532def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
533 def load() -> Tensor:
534 return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
535 s = lazy_tensor.shape.copy()
536 s[0] = s[0] // 3
537 return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
538
539
540def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
541 def load() -> Tensor:
542 return lazy_tensor.load().part(n_part)
543 s = lazy_tensor.shape.copy()
544 s[0] = s[0] // 3
545 return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
546
547
548def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
549 def load() -> Tensor:
550 tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
551 return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
552 s = lazy_tensors[0].shape.copy()
553 s.insert(0, len(lazy_tensors))
554 return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
555
556
557# Functionality that simulates `torch.load` but where individual tensors are
558# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of the time of writing:
560# - https://github.com/pytorch/pytorch/issues/64327
561# This allows us to de-shard without multiplying RAM usage, and also
562# conveniently drops the PyTorch dependency (though we still need numpy).
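# The unpickler below intercepts torch storage objects: `find_class` maps the torch
# storage classes to LazyStorageKind markers, `persistent_load` turns each storage
# reference into a LazyStorage whose `load` reads the raw bytes out of the zip on
# demand, and `lazy_rebuild_tensor_v2` wraps that storage into a LazyTensor.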
563
564
565@dataclass
566class LazyStorageKind:
567 data_type: DataType
568
569
570@dataclass
571class LazyStorage:
572 load: Callable[[int, int], NDArray]
573 kind: LazyStorageKind
574 description: str
575
576
577class LazyUnpickler(pickle.Unpickler):
578 def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
579 super().__init__(fp)
580 self.data_base_path = data_base_path
581 self.zip_file = zip_file
582
583 def persistent_load(self, pid: Any) -> Any:
584 assert pid[0] == 'storage'
585 assert isinstance(pid[1], LazyStorageKind)
586 data_type = pid[1].data_type
587 filename_stem = pid[2]
588 filename = f'{self.data_base_path}/{filename_stem}'
589 info = self.zip_file.getinfo(filename)
590
591 def load(offset: int, elm_count: int) -> NDArray:
592 dtype = data_type.dtype
593 with self.zip_file.open(info) as fp:
594 fp.seek(offset * dtype.itemsize)
595 size = elm_count * dtype.itemsize
596 data = fp.read(size)
597 assert len(data) == size
598 return np.frombuffer(data, dtype)
599 description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
600 return LazyStorage(load=load, kind=pid[1], description=description)
601
602 @staticmethod
603 def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
604 requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
605 assert isinstance(storage, LazyStorage)
606
607 def load() -> UnquantizedTensor:
608 elm_count = stride[0] * size[0]
609 return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
610 description = f'pickled storage_offset={storage_offset} in {storage.description}'
611 return LazyTensor(load, list(size), storage.kind.data_type, description)
612
613 @staticmethod
614 def rebuild_from_type_v2(func, new_type, args, state):
615 return func(*args)
616
617 CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
618 # getattr used here as a workaround for mypy not being smart enough to determine
619 # the staticmethods have a __func__ attribute.
620 ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
621 ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
622 ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
623 ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
624 ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
625 ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
626 ('torch', 'Tensor'): LazyTensor,
627 }
628
629 def find_class(self, module: str, name: str) -> Any:
630 if not module.startswith('torch'):
631 return super().find_class(module, name)
632 return self.CLASSES[(module, name)]
633
634
635def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
636 zf = zipfile.ZipFile(outer_fp)
637 pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
638 assert len(pickle_paths) == 1, pickle_paths
639 pickle_fp = zf.open(pickle_paths[0], 'r')
640 unpickler = LazyUnpickler(pickle_fp,
641 data_base_path=pickle_paths[0][:-4],
642 zip_file=zf)
643 model = unpickler.load()
    if 'model' in model:
        model = model['model']
645 as_dict = dict(model.items())
646 return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
647
648
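# A .safetensors file is laid out as: an 8-byte little-endian header length, a JSON
# header mapping tensor names to {dtype, shape, data_offsets}, then the raw tensor
# bytes; data_offsets are relative to the end of the header, hence the slice below.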
649def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
650 header_size, = struct.unpack('<Q', fp.read(8))
651 header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
652 # Use mmap for the actual data to avoid race conditions with the file offset.
653 mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
654 byte_buf = mapped[8 + header_size:]
655
656 def convert(info: dict[str, Any]) -> LazyTensor:
657 data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
658 numpy_dtype = data_type.dtype
659 shape: list[int] = info['shape']
660 begin, end = info['data_offsets']
661 assert 0 <= begin <= end <= len(byte_buf)
662 assert end - begin == math.prod(shape) * numpy_dtype.itemsize
663 buf = byte_buf[begin:end]
664
665 def load() -> UnquantizedTensor:
666 return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
667 description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
668 return LazyTensor(load, shape, data_type, description)
669 model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
670 return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
671
672
673def must_read(fp: IO[bytes], length: int) -> bytes:
674 ret = fp.read(length)
675 if len(ret) < length:
676 raise EOFError("unexpectedly reached end of file")
677 return ret
678
679
680@functools.lru_cache(maxsize=None)
681def lazy_load_file(path: Path) -> ModelPlus:
682 fp = open(path, 'rb')
683 first8 = fp.read(8)
684 fp.seek(0)
685 if first8[:2] == b'PK':
686 # A zip file, i.e. PyTorch format
687 return lazy_load_torch_file(fp, path)
688 elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
689 # Probably safetensors
690 return lazy_load_safetensors_file(fp, path)
691 else:
692 raise ValueError(f"unknown format: {path}")
693
694
695In = TypeVar('In')
696Out = TypeVar('Out')
697
698
699def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
700 '''Parallel map, but with backpressure. If the caller doesn't call `next`
701 fast enough, this will stop calling `func` at some point rather than
702 letting results pile up in memory. Specifically, there is a max of one
703 output value buffered per thread.'''
    if concurrency < 2:
        yield from map(func, iterable)
        return
707 iterable = iter(iterable)
708 executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
709 if use_processpool_executor:
710 executor_class = ProcessPoolExecutor
711 else:
712 executor_class = ThreadPoolExecutor
713 with executor_class(max_workers=max_workers) as executor:
714 futures: list[concurrent.futures.Future[Out]] = []
715 done = False
716 for _ in range(concurrency):
717 try:
718 futures.append(executor.submit(func, next(iterable)))
719 except StopIteration:
720 done = True
721 break
722
723 while futures:
724 result = futures.pop(0).result()
725 while not done and len(futures) < concurrency:
726 try:
727 futures.append(executor.submit(func, next(iterable)))
728 except StopIteration:
729 done = True
730 break
731 yield result
732
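# Illustrative usage (hypothetical `process`, `items` and `write` names): at most
# `concurrency` tasks are in flight at once, so results cannot pile up faster than
# the consumer drains them:
#
#   for result in bounded_parallel_map(process, items, concurrency=DEFAULT_CONCURRENCY):
#       write(result)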
733
734def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
735 # Handle special case where the model's vocab size is not set
736 if params.n_vocab == -1:
737 raise ValueError(
738 "The model's vocab size is set to -1 in params.json. Please update it manually."
739 + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
740 )
741 if not isinstance(vocab, Vocab):
742 return # model has no vocab
743
744 # Check for a vocab size mismatch
745 if params.n_vocab == vocab.vocab_size:
746 logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
747 return
748
749 if pad_vocab and params.n_vocab > vocab.vocab_size:
750 pad_count = params.n_vocab - vocab.vocab_size
751 logger.debug(
752 f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
753 )
754 for i in range(1, pad_count + 1):
755 vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
756 vocab.added_tokens_list.append(f"<dummy{i:05}>")
757 vocab.vocab_size = params.n_vocab
758 return
759
760 msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
761 if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
762 msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
763 if vocab.vocab_size < params.n_vocab:
764 msg += " Add the --pad-vocab option and try again."
765
766 raise ValueError(msg)
767
768
769class OutputFile:
770 def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
771 self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
772
773 def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
        # Metadata About The Model And Its Provenance
775 name = "LLaMA"
776 if metadata is not None and metadata.name is not None:
777 name = metadata.name
778 elif params.path_model is not None:
779 name = params.path_model.name
780 elif params.n_ctx == 4096:
781 # Heuristic detection of LLaMA v2 model
782 name = "LLaMA v2"
783
784 self.gguf.add_name(name)
785
786 if metadata is not None:
787 if metadata.author is not None:
788 self.gguf.add_author(metadata.author)
789 if metadata.version is not None:
790 self.gguf.add_version(metadata.version)
791 if metadata.organization is not None:
792 self.gguf.add_organization(metadata.organization)
793
794 if metadata.finetune is not None:
795 self.gguf.add_finetune(metadata.finetune)
796 if metadata.basename is not None:
797 self.gguf.add_basename(metadata.basename)
798
799 if metadata.description is not None:
800 self.gguf.add_description(metadata.description)
801 if metadata.quantized_by is not None:
802 self.gguf.add_quantized_by(metadata.quantized_by)
803
804 if metadata.size_label is not None:
805 self.gguf.add_size_label(metadata.size_label)
806
807 if metadata.license is not None:
808 self.gguf.add_license(metadata.license)
809 if metadata.license_name is not None:
810 self.gguf.add_license_name(metadata.license_name)
811 if metadata.license_link is not None:
812 self.gguf.add_license_link(metadata.license_link)
813
814 if metadata.url is not None:
815 self.gguf.add_url(metadata.url)
816 if metadata.doi is not None:
817 self.gguf.add_doi(metadata.doi)
818 if metadata.uuid is not None:
819 self.gguf.add_uuid(metadata.uuid)
820 if metadata.repo_url is not None:
821 self.gguf.add_repo_url(metadata.repo_url)
822
823 if metadata.source_url is not None:
824 self.gguf.add_source_url(metadata.source_url)
825 if metadata.source_doi is not None:
826 self.gguf.add_source_doi(metadata.source_doi)
827 if metadata.source_uuid is not None:
828 self.gguf.add_source_uuid(metadata.source_uuid)
829 if metadata.source_repo_url is not None:
830 self.gguf.add_source_repo_url(metadata.source_repo_url)
831
832 if metadata.base_models is not None:
833 self.gguf.add_base_model_count(len(metadata.base_models))
834 for key, base_model_entry in enumerate(metadata.base_models):
835 if "name" in base_model_entry:
836 self.gguf.add_base_model_name(key, base_model_entry["name"])
837 if "author" in base_model_entry:
838 self.gguf.add_base_model_author(key, base_model_entry["author"])
839 if "version" in base_model_entry:
840 self.gguf.add_base_model_version(key, base_model_entry["version"])
841 if "organization" in base_model_entry:
842 self.gguf.add_base_model_organization(key, base_model_entry["organization"])
843 if "description" in base_model_entry:
844 self.gguf.add_base_model_description(key, base_model_entry["description"])
845 if "url" in base_model_entry:
846 self.gguf.add_base_model_url(key, base_model_entry["url"])
847 if "doi" in base_model_entry:
848 self.gguf.add_base_model_doi(key, base_model_entry["doi"])
849 if "uuid" in base_model_entry:
850 self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
851 if "repo_url" in base_model_entry:
852 self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
853
854 if metadata.datasets is not None:
855 self.gguf.add_dataset_count(len(metadata.datasets))
856 for key, dataset_entry in enumerate(metadata.datasets):
857 if "name" in dataset_entry:
858 self.gguf.add_dataset_name(key, dataset_entry["name"])
859 if "author" in dataset_entry:
860 self.gguf.add_dataset_author(key, dataset_entry["author"])
861 if "version" in dataset_entry:
862 self.gguf.add_dataset_version(key, dataset_entry["version"])
863 if "organization" in dataset_entry:
864 self.gguf.add_dataset_organization(key, dataset_entry["organization"])
865 if "description" in dataset_entry:
866 self.gguf.add_dataset_description(key, dataset_entry["description"])
867 if "url" in dataset_entry:
868 self.gguf.add_dataset_url(key, dataset_entry["url"])
869 if "doi" in dataset_entry:
870 self.gguf.add_dataset_doi(key, dataset_entry["doi"])
871 if "uuid" in dataset_entry:
872 self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
873 if "repo_url" in dataset_entry:
874 self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
875
876 if metadata.tags is not None:
877 self.gguf.add_tags(metadata.tags)
878 if metadata.languages is not None:
879 self.gguf.add_languages(metadata.languages)
880
881 def add_meta_arch(self, params: Params) -> None:
882 # Metadata About The Neural Architecture Itself
883 self.gguf.add_vocab_size(params.n_vocab)
884 self.gguf.add_context_length(params.n_ctx)
885 self.gguf.add_embedding_length(params.n_embd)
886 self.gguf.add_block_count(params.n_layer)
887 self.gguf.add_feed_forward_length(params.n_ff)
888 self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
889 self.gguf.add_head_count (params.n_head)
890 self.gguf.add_head_count_kv (params.n_head_kv)
891
892 if params.n_experts:
893 self.gguf.add_expert_count(params.n_experts)
894
895 if params.n_experts_used:
896 self.gguf.add_expert_used_count(params.n_experts_used)
897
898 if params.f_norm_eps:
899 self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
900 else:
901 raise ValueError('f_norm_eps is None')
902
903 if params.f_rope_freq_base is not None:
904 self.gguf.add_rope_freq_base(params.f_rope_freq_base)
905
906 if params.rope_scaling_type:
907 assert params.f_rope_scale is not None
908 self.gguf.add_rope_scaling_type(params.rope_scaling_type)
909 self.gguf.add_rope_scaling_factor(params.f_rope_scale)
910
911 if params.n_ctx_orig is not None:
912 self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
913
914 if params.rope_finetuned is not None:
915 self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
916
917 if params.ftype is not None:
918 self.gguf.add_file_type(params.ftype)
919
920 def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
921 tokens = []
922 scores = []
923 toktypes = []
924
925 # NOTE: `all_tokens` returns the base vocabulary and added tokens
926 for text, score, toktype in vocab.all_tokens():
927 tokens.append(text)
928 scores.append(score)
929 toktypes.append(toktype)
930
931 assert len(tokens) == vocab.vocab_size
932
933 return tokens, scores, toktypes
934
935 def add_meta_vocab(self, vocab: Vocab) -> None:
936 # Ensure that tokenizer_model is added to the GGUF model
937 self.gguf.add_tokenizer_model(vocab.tokenizer_model)
938
939 # Extract model vocabulary for model conversion
940 tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
941
942 # Add extracted token information for model conversion
943 self.gguf.add_token_list(tokens)
944 self.gguf.add_token_scores(scores)
945 self.gguf.add_token_types(toktypes)
946
947 def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
948 svocab.add_to_gguf(self.gguf)
949
950 def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
951 n_elements = int(np.prod(tensor.shape))
952 raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
        data_type = getattr(tensor.data_type, 'quantized_dtype', None) or tensor.data_type.dtype
954 data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
955 self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
956
957 def write_meta(self) -> None:
958 self.gguf.write_header_to_file()
959 self.gguf.write_kv_data_to_file()
960
961 def write_tensor_info(self) -> None:
962 self.gguf.write_ti_data_to_file()
963
964 def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
965 ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
966 if ftype == GGMLFileType.MostlyQ8_0:
967 ndarrays = bounded_parallel_map(
968 OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
969 use_processpool_executor=True,
970 )
971 else:
972 ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
973
974 start = time.time()
975 for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
976 elapsed = time.time() - start
977 size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
978 padi = len(str(len(model)))
979 logger.info(
980 f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
981 )
982 self.gguf.write_tensor_data(ndarray)
983
984 def close(self) -> None:
985 self.gguf.close()
986
987 @staticmethod
988 def write_vocab_only(
989 fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
990 endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
991 ) -> None:
992 check_vocab_size(params, vocab, pad_vocab=pad_vocab)
993
994 of = OutputFile(fname_out, endianess=endianess)
995
996 # meta data
997 of.add_meta_model(params, metadata)
998 of.add_meta_arch(params)
999 of.add_meta_vocab(vocab)
1000 of.add_meta_special_vocab(svocab)
1001
1002 of.write_meta()
1003
1004 of.close()
1005
1006 @staticmethod
1007 def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
1008 name, lazy_tensor = item
1009 tensor = lazy_tensor.load().to_ggml()
1010 return (lazy_tensor.data_type, tensor.ndarray)
1011
1012 @staticmethod
1013 def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
1014 dt, arr = item
1015 if not isinstance(dt, QuantizedDataType):
1016 return arr
1017 return dt.quantize(arr)
1018
1019 @staticmethod
1020 def write_all(
1021 fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
1022 concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
1023 pad_vocab: bool = False,
1024 metadata: gguf.Metadata | None = None,
1025 ) -> None:
1026 check_vocab_size(params, vocab, pad_vocab=pad_vocab)
1027
1028 of = OutputFile(fname_out, endianess=endianess)
1029
1030 # meta data
1031 of.add_meta_model(params, metadata)
1032 of.add_meta_arch(params)
1033 if isinstance(vocab, Vocab):
1034 of.add_meta_vocab(vocab)
1035 of.add_meta_special_vocab(svocab)
1036 else: # NoVocab
1037 of.gguf.add_tokenizer_model(vocab.tokenizer_model)
1038
1039 # tensor info
1040 for name, lazy_tensor in model.items():
1041 of.add_tensor_info(name, lazy_tensor)
1042
1043 of.write_meta()
1044 of.write_tensor_info()
1045
1046 # tensor data
1047 of.write_tensor_data(ftype, model, concurrency)
1048
1049 of.close()
1050
1051
1052def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
1053 wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
1054
1055 if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
1056 return GGMLFileType.AllF32
1057 if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
1058 return GGMLFileType.MostlyF16
1059 if output_type_str == "q8_0":
1060 return GGMLFileType.MostlyQ8_0
1061
1062 name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
1063
1064 raise ValueError(f"Unexpected combination of types: {name_to_type}")
1065
1066
1067def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
1068 total_params = 0
1069 shared_params = 0
1070 expert_params = 0
1071
1072 for name, lazy_tensor in tensors:
1073 # We don't need these
1074 if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
1075 continue
1076
1077 # Got A Tensor
1078 sum_weights_in_tensor: int = 1
1079
1080 # Tensor Volume
1081 for dim in lazy_tensor.shape:
1082 sum_weights_in_tensor *= dim
1083
1084 if ".experts." in name:
1085 if ".experts.0." in name:
1086 expert_params += sum_weights_in_tensor
1087 else:
1088 shared_params += sum_weights_in_tensor
1089
1090 total_params += sum_weights_in_tensor
1091
1092 return total_params, shared_params, expert_params
1093
1094
1095def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
1096 return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
1097 for (name, tensor) in model.items()}
1098
1099
1100def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
1101 tmap = gguf.TensorNameMap(ARCH, params.n_layer)
1102 should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
1103
    tmp = model  # NOTE: not a copy; the permute/unpack edits below mutate the same dict that is iterated further down
1105
1106 # merge experts into one tensor
1107 if params.n_experts and params.n_experts > 0:
1108 for i_l in range(params.n_layer):
1109 for w in range(1, 4):
1110 experts = []
1111 for e in range(params.n_experts):
1112 if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
1113 experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
1114 del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
1115 elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
1116 experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
1117 del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
1118 else:
1119 raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
1120 tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
1121
    # HF models permute or pack some of the tensors, so we need to undo that
1123 for i in itertools.count():
1124 if f"model.layers.{i}.self_attn.q_proj.weight" in model:
1125 logger.debug(f"Permuting layer {i}")
1126 tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
1127 tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
1128 # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
1129 elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
1130 logger.debug(f"Unpacking and permuting layer {i}")
1131 tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
1132 tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
1133 tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
1134 del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
1135 else:
1136 break
1137
1138 out: LazyModel = {}
1139 for name, lazy_tensor in model.items():
1140 tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
1141 if name_new is None:
1142 if skip_unknown:
1143 logger.warning(f"Unexpected tensor name: {name} - skipping")
1144 continue
1145 raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
1146
1147 if tensor_type in should_skip:
1148 logger.debug(f"skipping tensor {name_new}")
1149 continue
1150
1151 logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
1152 out[name_new] = lazy_tensor
1153
1154 return out
1155
1156
1157def nth_multifile_path(path: Path, n: int) -> Path | None:
1158 '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1159 the nth path in the model.
1160 '''
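    # e.g. nth_multifile_path(Path("consolidated.00.pth"), 1) returns
    # Path("consolidated.01.pth") when that file exists next to it.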
1161 # Support the following patterns:
1162 patterns = [
1163 # - x.00.pth, x.01.pth, etc.
1164 (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
1165 # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
1166 (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
        # - x.bin, x.bin.1, etc.
1168 (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
1169 ]
1170 for regex, replacement in patterns:
1171 if re.search(regex, path.name):
1172 new_path = path.with_name(re.sub(regex, replacement, path.name))
1173 if new_path.exists():
1174 return new_path
1175 return None
1176
1177
1178def find_multifile_paths(path: Path) -> list[Path]:
1179 '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
1180 the whole list of paths in the model.
1181 '''
1182 ret: list[Path] = []
1183 for i in itertools.count():
1184 nth_path = nth_multifile_path(path, i)
1185 if nth_path is None:
1186 break
1187 ret.append(nth_path)
1188 if not ret:
1189 # No matches. This should only happen if the file was named, e.g.,
1190 # foo.0, and there was no file named foo. Oh well, try to process it
1191 # as a single file.
1192 return [path]
1193 return ret
1194
1195
1196def load_some_model(path: Path) -> ModelPlus:
1197 '''Load a model of any supported format.'''
1198 # Be extra-friendly and accept either a file or a directory:
1199 if path.is_dir():
1200 # Check if it's a set of safetensors files first
1201 globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
1202 files = [file for glob in globs for file in path.glob(glob)]
1203 if not files:
1204 # Try the PyTorch patterns too, with lower priority
1205 globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
1206 files = [file for glob in globs for file in path.glob(glob)]
1207 if not files:
1208 raise FileNotFoundError(f"Can't find model in directory {path}")
1209 if len(files) > 1:
1210 raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
1211 path = files[0]
1212
1213 paths = find_multifile_paths(path)
1214 models_plus: list[ModelPlus] = []
1215 for path in paths:
1216 logger.info(f"Loading model file {path}")
1217 models_plus.append(lazy_load_file(path))
1218
1219 model_plus = merge_multifile_models(models_plus)
1220 return model_plus
1221
1222
1223class VocabFactory:
1224 _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
1225
1226 def __init__(self, path: Path):
1227 self.path = path
1228
1229 def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
1230 load_merges = vocab.name == "bpe"
1231 n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
1232 return gguf.SpecialVocab(
1233 model_parent_path,
1234 load_merges=load_merges,
1235 special_token_types=None, # Predetermined or passed as a parameter
1236 n_vocab=n_vocab,
1237 )
1238
1239 def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
1240 vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
1241 selected_vocabs: dict[str, type[Vocab]] = {}
1242 for vtype in vocab_types:
1243 try:
1244 selected_vocabs[vtype] = vocab_classes[vtype]
1245 except KeyError:
1246 raise ValueError(f"Unsupported vocabulary type {vtype}") from None
1247
1248 for vtype, cls in selected_vocabs.items():
1249 try:
1250 vocab = cls(self.path)
1251 break
1252 except FileNotFoundError:
1253 pass # ignore unavailable tokenizers
1254 else:
1255 raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
1256
1257 logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
1258 return vocab
1259
1260 def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
1261 vocab: BaseVocab
1262 if vocab_types is None:
1263 vocab = NoVocab()
1264 else:
1265 vocab = self._create_vocab_by_path(vocab_types)
1266 # FIXME: Respect --vocab-dir?
1267 special_vocab = self._create_special_vocab(
1268 vocab,
1269 model_parent_path,
1270 )
1271 return vocab, special_vocab
1272
1273
1274def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
    name = metadata.name
    basename = metadata.basename
    finetune = metadata.finetune
    version = metadata.version
1279 size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
1280
1281 output_type = {
1282 GGMLFileType.AllF32: "F32",
1283 GGMLFileType.MostlyF16: "F16",
1284 GGMLFileType.MostlyQ8_0: "Q8_0",
1285 }[file_type]
1286
1287 return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
1288
1289
1290def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
1291 default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
1292 ret = model_paths[0].parent / f"{default_filename}.gguf"
1293 if ret in model_paths:
1294 logger.error(
1295 f"Error: Default output path ({ret}) would overwrite the input. "
1296 "Please explicitly specify a path using --outfile.")
1297 sys.exit(1)
1298 return ret
1299
1300
1301def do_dump_model(model_plus: ModelPlus) -> None:
1302 print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
1303 print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
1304 print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
1305 for name, lazy_tensor in model_plus.model.items():
1306 print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
1307
1308
1309def main(args_in: list[str] | None = None) -> None:
1310 output_choices = ["f32", "f16"]
1311 if np.uint32(1) == np.uint32(1).newbyteorder("<"):
1312 # We currently only support Q8_0 output on little endian systems.
1313 output_choices.append("q8_0")
1314 parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
1315 parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
1316 parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
1317 parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
1318 parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
1319 parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
1320 parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
1321 parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
1322 parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
1323 parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
1324 parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
1325 parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
1326 parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
1327 parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
1328 parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
1329 parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
1330 parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file")
1331 parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
1332 parser.add_argument("--model-name", type=str, default=None, help="name of the model")
1333
1334 args = parser.parse_args(args_in)
1335
1336 if args.verbose:
1337 logging.basicConfig(level=logging.DEBUG)
1338 elif args.dump_single or args.dump or args.get_outfile:
1339 # Avoid printing anything besides the dump output
1340 logging.basicConfig(level=logging.WARNING)
1341 else:
1342 logging.basicConfig(level=logging.INFO)
1343
1344 model_name = args.model_name
1345 dir_model = args.model
1346
1347 metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
1348
1349 if args.get_outfile:
1350 model_plus = load_some_model(dir_model)
1351 params = Params.load(model_plus)
1352 model = convert_model_names(model_plus.model, params, args.skip_unknown)
1353 model_params_count = per_model_weight_count_estimation(model_plus.model.items())
1354 ftype = pick_output_type(model, args.outtype)
1355
1356 if (metadata is None or metadata.name is None) and params.path_model is not None:
1357 metadata.name = params.path_model.name
1358
1359 print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
1360 return
1361
1362 if args.no_vocab and args.vocab_only:
1363 raise ValueError("--vocab-only does not make sense with --no-vocab")
1364
1365 if args.dump_single:
1366 model_plus = lazy_load_file(dir_model)
1367 do_dump_model(model_plus)
1368 return
1369
1370 if not args.vocab_only:
1371 model_plus = load_some_model(dir_model)
1372 else:
1373 model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
1374
1375 if args.dump:
1376 do_dump_model(model_plus)
1377 return
1378
1379 endianess = gguf.GGUFEndian.LITTLE
1380 if args.big_endian:
1381 endianess = gguf.GGUFEndian.BIG
1382
1383 params = None
1384 if args.pad_vocab or not args.vocab_only:
1385 params = Params.load(model_plus)
1386 if params.n_ctx == -1:
1387 if args.ctx is None:
1388 msg = """\
1389 The model doesn't have a context size, and you didn't specify one with --ctx
1390 Please specify one with --ctx:
1391 - LLaMA v1: --ctx 2048
1392 - LLaMA v2: --ctx 4096"""
1393 parser.error(textwrap.dedent(msg))
1394 params.n_ctx = args.ctx
1395
1396 if args.outtype:
1397 params.ftype = {
1398 "f32": GGMLFileType.AllF32,
1399 "f16": GGMLFileType.MostlyF16,
1400 "q8_0": GGMLFileType.MostlyQ8_0,
1401 }[args.outtype]
1402
1403 logger.info(f"params = {params}")
1404
1405 model_parent_path = model_plus.paths[0].parent
1406 vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
1407 vocab_factory = VocabFactory(vocab_path)
1408 vocab_types = None if args.no_vocab else args.vocab_type.split(",")
1409 vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
1410
1411 if args.vocab_only:
1412 assert isinstance(vocab, Vocab)
1413 if not args.outfile:
1414 raise ValueError("need --outfile if using --vocab-only")
1415 outfile = args.outfile
1416 if params is None:
1417 params = Params(
1418 n_vocab = vocab.vocab_size,
1419 n_embd = 1,
1420 n_layer = 1,
1421 n_ctx = 1,
1422 n_ff = 1,
1423 n_head = 1,
1424 n_head_kv = 1,
1425 f_norm_eps = 1e-5,
1426 )
1427 OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
1428 endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
1429 logger.info(f"Wrote {outfile}")
1430 return
1431
1432 if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
1433 vocab = model_plus.vocab
1434
1435 assert params is not None
1436
1437 if metadata.name is None and params.path_model is not None:
1438 metadata.name = params.path_model.name
1439
1440 model_params_count = per_model_weight_count_estimation(model_plus.model.items())
1441 logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
1442
1443 logger.info(f"Vocab info: {vocab}")
1444 logger.info(f"Special vocab info: {special_vocab}")
1445 model = model_plus.model
1446 model = convert_model_names(model, params, args.skip_unknown)
1447 ftype = pick_output_type(model, args.outtype)
1448 model = convert_to_output_type(model, ftype)
1449 outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
1450
1451 metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
1452
1453 params.ftype = ftype
1454 logger.info(f"Writing {outfile}, format {ftype}")
1455
1456 OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
1457 concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
1458 logger.info(f"Wrote {outfile}")
1459
1460
1461if __name__ == '__main__':
1462 main()