#include "llama-model-loader.h"

#include "ggml.h"

#include <algorithm>
#include <array>
#include <cinttypes>
#include <cstring>
#include <future>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

const char * llama_file_version_name(llama_fver version) {
    switch (version) {
        case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
        case GGUF_FILE_VERSION_V2: return "GGUF V2";
        case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
    }

    return "unknown";
}

static std::string llama_model_ftype_name(llama_ftype ftype) {
    if (ftype & LLAMA_FTYPE_GUESSED) {
        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
    }

    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32:          return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16:       return "F16";
        case LLAMA_FTYPE_MOSTLY_BF16:      return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0:      return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1:      return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q5_0:      return "Q5_0";
        case LLAMA_FTYPE_MOSTLY_Q5_1:      return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0:      return "Q8_0";
        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
        case LLAMA_FTYPE_MOSTLY_Q2_K:      return "Q2_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:    return "Q2_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:    return "Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:    return "Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:    return "Q3_K - Large";
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:    return "Q4_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:    return "Q4_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:    return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:    return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K:      return "Q6_K";
        case LLAMA_FTYPE_MOSTLY_TQ1_0:     return "TQ1_0 - 1.69 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_TQ2_0:     return "TQ2_0 - 2.06 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:   return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:    return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_S:     return "IQ2_S - 2.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_M:     return "IQ2_M - 2.7 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:    return "IQ3_XS - 3.3 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:   return "IQ3_XXS - 3.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_S:     return "IQ1_S - 1.5625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_M:     return "IQ1_M - 1.75 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:    return "IQ4_NL - 4.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:    return "IQ4_XS - 4.25 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_S:     return "IQ3_S - 3.4375 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_M:     return "IQ3_S mix - 3.66 bpw";

        default: return "unknown, may not work";
    }
}

// return a list of splits for a given path
// for example, given "<name>-00002-of-00004.gguf", returns list of all 4 splits
static std::vector<std::string> llama_get_list_splits(const std::string & path, const int idx, const int n_split) {
    std::vector<std::string> paths;
    std::string split_prefix;
    std::vector<char> buf(llama_path_max(), 0);

    {
        int ret = llama_split_prefix(buf.data(), buf.size(), path.c_str(), idx, n_split);
        if (!ret) {
            throw std::runtime_error(format("invalid split file name: %s", path.c_str()));
        }
        split_prefix = std::string(buf.data(), ret);
    }

    if (split_prefix.empty()) {
        throw std::runtime_error(format("invalid split file: %s", path.c_str()));
    }

    for (int idx = 0; idx < n_split; ++idx) {
        int ret = llama_split_path(buf.data(), buf.size(), split_prefix.c_str(), idx, n_split);
        paths.push_back(std::string(buf.data(), ret));
    }

    return paths;
}

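// Typed accessors for GGUF key-value metadata.
// GKV_Base<T> maps a C++ type to its GGUF type tag and getter function; GKV<T>::set()
// first applies an optional llama_model_kv_override, if one is provided, and otherwise
// falls back to the value stored in the file.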
namespace GGUFMeta {
    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
    struct GKV_Base_Type {
        static constexpr gguf_type gt = gt_;

        static T getter(const gguf_context * ctx, const int kid) {
            return gfun(ctx, kid);
        }
    };

    template<typename T> struct GKV_Base;

    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};

    template<> struct GKV_Base<std::string> {
        static constexpr gguf_type gt = GGUF_TYPE_STRING;

        static std::string getter(const gguf_context * ctx, const int kid) {
            return gguf_get_val_str(ctx, kid);
        }
    };

    struct ArrayInfo {
        const gguf_type gt;
        const size_t length;
        const void * data;
    };

    template<> struct GKV_Base<ArrayInfo> {
        public:
        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
        static ArrayInfo getter(const gguf_context *ctx, const int k) {
            const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
            return ArrayInfo {
                arr_type,
                size_t(gguf_get_arr_n(ctx, k)),
                arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
            };
        }
    };

    template<typename T>
    class GKV : public GKV_Base<T> {
        GKV() = delete;

        public:
        static T get_kv(const gguf_context * ctx, const int k) {
            const enum gguf_type kt = gguf_get_kv_type(ctx, k);

            if (kt != GKV::gt) {
                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
            }
            return GKV::getter(ctx, k);
        }

        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
            switch (ty) {
                case LLAMA_KV_OVERRIDE_TYPE_BOOL:  return "bool";
                case LLAMA_KV_OVERRIDE_TYPE_INT:   return "int";
                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
                case LLAMA_KV_OVERRIDE_TYPE_STR:   return "str";
            }
            return "unknown";
        }

        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
            if (!ovrd) { return false; }
            if (ovrd->tag == expected_type) {
                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
                    __func__, override_type_to_str(ovrd->tag), ovrd->key);
                switch (ovrd->tag) {
                    case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
                        LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
                    } break;
                    case LLAMA_KV_OVERRIDE_TYPE_INT: {
                        LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
                    } break;
                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
                        LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
                    } break;
                    case LLAMA_KV_OVERRIDE_TYPE_STR: {
                        LLAMA_LOG_INFO("%s\n", ovrd->val_str);
                    } break;
                    default:
                        // Shouldn't be possible to end up here, but just in case...
                        throw std::runtime_error(
                            format("Unsupported attempt to override %s type for metadata key %s\n",
                                override_type_to_str(ovrd->tag), ovrd->key));
                }
                return true;
            }
            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
                __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
            return false;
        }

        template<typename OT>
        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
                target = ovrd->val_bool;
                return true;
            }
            return false;
        }

        template<typename OT>
        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
                target = ovrd->val_i64;
                return true;
            }
            return false;
        }

        template<typename OT>
        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
        try_override(T & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
                target = ovrd->val_f64;
                return true;
            }
            return false;
        }

        template<typename OT>
        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
        try_override(T & target, const struct llama_model_kv_override * ovrd) {
            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
                target = ovrd->val_str;
                return true;
            }
            return false;
        }

        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
            if (try_override<T>(target, ovrd)) {
                return true;
            }
            if (k < 0) { return false; }
            target = get_kv(ctx, k);
            return true;
        }

        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
            return set(ctx, gguf_find_key(ctx, key), target, ovrd);
        }

        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
            return set(ctx, key.c_str(), target, ovrd);
        }
    };
}

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());

        if (kid < 0) {
            if (required) {
                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
            }
            return false;
        }

        struct GGUFMeta::ArrayInfo arr_info =
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        result = arr_info.length;
        return true;
    }

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    llama_model_loader::get_arr_n(enum llm_kv kid, T & result, bool required) {
        return get_arr_n(llm_kv(kid), result, required);
    }

    template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);

    template<typename T>
    bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
        const gguf_context * ctx = meta.get();
        const int kid = gguf_find_key(ctx, key.c_str());

        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
            if (required) {
                throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
            }
            return false;
        }

        struct GGUFMeta::ArrayInfo arr_info =
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);

        switch (arr_info.gt) {
            case GGUF_TYPE_UINT32:
            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
                                                (std::is_same<T, uint32_t>::value)); break;
            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
            case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
            default:
                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
        }

        if constexpr (std::is_same<T, std::string>::value) {
            const size_t n_items = gguf_get_arr_n(ctx, kid);
            result.clear();

            for (size_t i = 0; i < n_items; i++) {
                const T value = gguf_get_arr_str(ctx, kid, i);
                result.emplace_back(value);
            }
        } else {
            result.resize(arr_info.length);
            result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
        }

        return true;
    }

    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
        const gguf_context * ctx = meta.get();
        const int kid = gguf_find_key(ctx, key.c_str());

        if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
            if (required) {
                throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
            }
            return false;
        }

        struct GGUFMeta::ArrayInfo arr_info =
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);

        switch (arr_info.gt) {
            case GGUF_TYPE_BOOL:
            case GGUF_TYPE_UINT32:
            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T, int32_t>::value) ||
                                                (std::is_same<T, uint32_t>::value)); break;
            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
            case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
            default:
                throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
        }

        if (arr_info.length > N_MAX) {
            throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
        }

        if constexpr (std::is_same<T, std::string>::value) {
            const size_t n_items = gguf_get_arr_n(ctx, kid);

            for (size_t i = 0; i < n_items; i++) {
                const T value = gguf_get_arr_str(ctx, kid, i);
                result[i] = value;
            }
        } else {
            if (arr_info.gt == GGUF_TYPE_BOOL) {
                std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
                    return static_cast<T>(x);
                });
            } else {
                std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
            }
        }

        return true;
    }

    template<typename T>
    bool llama_model_loader::get_arr(enum llm_kv kid, T & result, bool required) {
        return get_arr(llm_kv(kid), result, required);
    }

    template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);

    template<typename T>
    bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
        auto it = kv_overrides.find(key);

        const struct llama_model_kv_override * override =
            it != kv_overrides.end() ? &it->second : nullptr;

        const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);

        if (required && !found) {
            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
        }

        return found;
    }

    template<typename T>
    bool llama_model_loader::get_key(enum llm_kv kid, T & result, bool required) {
        return get_key(llm_kv(kid), result, required);
    }

    template bool llama_model_loader::get_key<bool>       (enum llm_kv kid, bool & result, bool required);
    template bool llama_model_loader::get_key<float>      (enum llm_kv kid, float & result, bool required);
    template bool llama_model_loader::get_key<uint32_t>   (enum llm_kv kid, uint32_t & result, bool required);
    template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);

    template<>
    bool llama_model_loader::get_key(enum llm_kv kid, enum llama_pooling_type & result, bool required) {
        uint32_t tmp;
        const bool found = get_key(kid, tmp, required);
        if (found) {
            result = (enum llama_pooling_type) tmp;
        } else {
            result = LLAMA_POOLING_TYPE_UNSPECIFIED;
        }
        return found;
    }

    // get array of n <= N_MAX elements, or a single element repeated n times
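    // Illustrative use (the key and the fields are examples, not defined in this file):
    //   ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);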
    template<typename T, size_t N_MAX>
    bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
        const int kid = gguf_find_key(meta.get(), key.c_str());

        if (kid < 0) {
            if (required) {
                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
            }
            return false;
        }

        if (n > N_MAX) {
            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
        }

        if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
            struct GGUFMeta::ArrayInfo arr_info =
                GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

            if (n != arr_info.length) {
                throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
            }

            return get_arr(key, result, required);
        }

        T value;

        bool ok = get_key(key, value, required);
        if (!ok) {
            return false;
        }

        for (uint32_t i = 0; i < n; i++) {
            result[i] = value;
        }

        return true;
    }

    template<typename T>
    bool llama_model_loader::get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required) {
        return get_key_or_arr(llm_kv(kid), result, n, required);
    }

    bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
        const std::string key = llm_kv(kid);

        const int id = gguf_find_key(meta.get(), key.c_str());

        if (id < 0) {
            if (required) {
                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
            }
            return false;
        }

        // throw an error if the type is an array
        if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
            if (required) {
                throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
            }
            return false;
        }

        return get_key(key, result, required);
    }

    // TODO: this is not very clever - figure out something better
    template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
    template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
    template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);


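// Loads the GGUF metadata of the main file (and of any additional split files), builds a
// unified tensor-name -> weight index across all shards, and determines the file type,
// either from LLM_KV_GENERAL_FILE_TYPE or by guessing it from the dominant tensor type.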
llama_model_loader::llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits,
        bool use_mmap,
        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
    int trace = 0;
    if (getenv("LLAMA_TRACE")) {
        trace = atoi(getenv("LLAMA_TRACE"));
    }

    if (param_overrides_p != nullptr) {
        for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
            kv_overrides.insert({std::string(p->key), *p});
        }
    }

    tensor_buft_overrides = param_tensor_buft_overrides_p;

    // Load the main GGUF
    struct ggml_context * ctx = NULL;
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ &ctx,
    };

    meta.reset(gguf_init_from_file(fname.c_str(), params));
    if (!meta) {
        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
    }

    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

    files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
    contexts.emplace_back(ctx);

    if (use_mmap && use_direct_io) {
        if (files.back()->has_direct_io()) {
            LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
            use_mmap = false;
        } else {
            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
            use_direct_io = false;

            // reopen file using std::fopen for mmap
            files.pop_back();
            files.emplace_back(new llama_file(fname.c_str(), "rb", false));
        }
    }

    // Save the tensor data offsets of the main file.
    // For subsidiary files, the `meta` tensor data offsets must not be used,
    // so we build a unified tensor index for the weights.
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string tensor_name = std::string(cur->name);
        // make sure there are no duplicate tensor names
        if (weights_map.find(tensor_name) != weights_map.end()) {
            throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
        }
        n_elements += ggml_nelements(cur);
        n_bytes    += ggml_nbytes(cur);
        weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
    }
    uint16_t n_split = 0;
    get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);

    // Load additional GGML contexts
    if (n_split > 1) {
        // make sure the main file is loaded first
        uint16_t idx = 0;
        const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
        get_key(kv_split_no, idx);
        if (idx != 0) {
            throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
        }

        // generate list of splits if needed
        if (splits.empty()) {
            splits = llama_get_list_splits(fname, idx, n_split);
        }

        // in case the user provided a custom list of splits, check that it matches the expected number
        if (n_split != (uint16_t)splits.size()) {
            throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
        }

        if (trace > 0) {
            LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
        }

        // load other splits
        for (idx = 1; idx < n_split; idx++) {
            const char * fname_split = splits[idx].c_str();

            struct gguf_init_params split_params = {
                /*.no_alloc = */ true,
                /*.ctx      = */ &ctx,
            };
            gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
            if (!ctx_gguf) {
                throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
            }

            // check idx
            {
                const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
                if (kid < 0) {
                    throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
                }
                int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
                if (idx_gguf != idx) {
                    throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
                }
            }

            files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
            contexts.emplace_back(ctx);

            // Save the tensor data offset info of the shard.
            for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
                std::string tensor_name = std::string(cur->name);
                // make sure there are no duplicate tensor names
                if (weights_map.find(tensor_name) != weights_map.end()) {
                    throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
                }
                n_elements += ggml_nelements(cur);
                n_bytes    += ggml_nbytes(cur);
                weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
            }
        }

        get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);

        // sanity check
        {
            const int n_tensors_loaded = (int) weights_map.size();
            if (n_tensors != n_tensors_loaded) {
                throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
            }
        }

        LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
    }

    n_kv      = gguf_get_n_kv(meta.get());
    n_tensors = weights_map.size();

    fver = (enum llama_fver) gguf_get_version(meta.get());

    LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
            __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));

    // determine file type based on the number of tensors for each quantization and print meta data
    // TODO: make optional
    {
        std::map<enum ggml_type, uint32_t> n_type;

        uint32_t n_type_max = 0;
        enum ggml_type type_max = GGML_TYPE_F32;

        for (const auto & it : weights_map) {
            const llama_tensor_weight & w = it.second;
            const ggml_tensor * tensor = w.tensor;

            enum ggml_type type = tensor->type;

            n_type[type]++;

            if (n_type_max < n_type[type]) {
                n_type_max = n_type[type];
                type_max   = type;
            }

            if (trace > 0) {
                const uint16_t sid = w.idx;
                LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__,
                        sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(),
                        ggml_nbytes(tensor)/1024.0f/1024.0f);
            }
        }

        switch (type_max) {
            case GGML_TYPE_F32:     ftype = LLAMA_FTYPE_ALL_F32;        break;
            case GGML_TYPE_F16:     ftype = LLAMA_FTYPE_MOSTLY_F16;     break;
            case GGML_TYPE_BF16:    ftype = LLAMA_FTYPE_MOSTLY_BF16;    break;
            case GGML_TYPE_Q4_0:    ftype = LLAMA_FTYPE_MOSTLY_Q4_0;    break;
            case GGML_TYPE_Q4_1:    ftype = LLAMA_FTYPE_MOSTLY_Q4_1;    break;
            case GGML_TYPE_Q5_0:    ftype = LLAMA_FTYPE_MOSTLY_Q5_0;    break;
            case GGML_TYPE_Q5_1:    ftype = LLAMA_FTYPE_MOSTLY_Q5_1;    break;
            case GGML_TYPE_Q8_0:    ftype = LLAMA_FTYPE_MOSTLY_Q8_0;    break;
            case GGML_TYPE_Q2_K:    ftype = LLAMA_FTYPE_MOSTLY_Q2_K;    break;
            case GGML_TYPE_Q3_K:    ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M;  break;
            case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
            case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
            case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
            case GGML_TYPE_TQ1_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ1_0;   break;
            case GGML_TYPE_TQ2_0:   ftype = LLAMA_FTYPE_MOSTLY_TQ2_0;   break;
            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
            case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
            case GGML_TYPE_IQ2_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ2_S;   break;
            case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
            case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
            case GGML_TYPE_IQ1_M:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_M;   break;
            case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
            case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
            case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
            default:
                {
                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
                    ftype = LLAMA_FTYPE_ALL_F32;
                } break;
        }

        // this is a way to mark that we have "guessed" the file type
        ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

        {
            uint32_t ftype_val = 0;
            if (get_key(LLM_KV_GENERAL_FILE_TYPE, ftype_val, false)) {
                ftype = (llama_ftype) ftype_val;
            }
        }

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);

        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(meta.get(), i);
            const enum gguf_type type   = gguf_get_kv_type(meta.get(), i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
                : gguf_type_name(type);

            std::string value          = gguf_kv_to_str(meta.get(), i);
            const size_t MAX_VALUE_LEN = 40;
            if (value.size() > MAX_VALUE_LEN) {
                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
            }
            replace_all(value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
        for (auto & kv : n_type) {
            if (kv.second == 0) {
                continue;
            }

            LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

    if (!llama_mmap::SUPPORTED) {
        LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
        use_mmap = false;
    }

    this->use_mmap      = use_mmap;
    this->use_direct_io = use_direct_io;
    this->check_tensors = check_tensors;
    this->no_alloc      = no_alloc;
}

std::string llama_model_loader::get_arch_name() const {
    return arch_name;
}

enum llm_arch llama_model_loader::get_arch() const {
    return llm_kv.arch;
}

const llama_model_loader::llama_tensor_weight * llama_model_loader::get_weight(const char * name) const {
    auto pos = weights_map.find(name);
    if (pos != weights_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

const llama_model_loader::llama_tensor_weight & llama_model_loader::require_weight(const char * name) const {
    const llama_tensor_weight * weight = get_weight(name);
    if (!weight) {
        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
    }
    return *weight;
}

struct ggml_tensor * llama_model_loader::get_tensor_meta(const char * name) const {
    const auto * weight = get_weight(name);
    if (!weight) {
        return nullptr;
    }
    return weight->tensor;
}

struct ggml_tensor * llama_model_loader::require_tensor_meta(const std::string & name) const {
    struct ggml_tensor * tensor = get_tensor_meta(name.c_str());
    if (!tensor) {
        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
    }
    return tensor;
}

const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
    const struct ggml_tensor * cur = get_tensor_meta(name.c_str());

    if (cur == NULL) {
        if (!required) {
            return NULL;
        }
        throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
    }

    {
        bool is_ok = true;
        for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
            if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
                is_ok = false;
                break;
            }
        }
        if (!is_ok) {
            throw std::runtime_error(
                    format("%s: tensor '%s' has wrong shape; expected %s, got %s",
                        __func__, name.c_str(),
                        llama_format_tensor_shape(ne).c_str(),
                        llama_format_tensor_shape(cur).c_str()));
        }
    }

    return cur;
}

struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
    const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

    if (cur == NULL) {
        return NULL;
    }

    bool duplicated = flags & TENSOR_DUPLICATED;

    struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
    ggml_set_name(tensor, ggml_get_name(cur));

    if (duplicated) {
        size_data += ggml_nbytes(cur);
    } else {
        n_created++;
    }

    return tensor;
}

struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
    const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);

    if (cur == NULL) {
        return NULL;
    }

    if (cur->type != base->type) {
        throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
    }

    std::array<int64_t, GGML_MAX_DIMS> dims;
    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
        dims[i] = i < ne.size() ? ne.begin()[i] : 1;
    }

    struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
                                        dims[0], dims[1], dims[2], dims[3],
                                        cur->nb[1], cur->nb[2], cur->nb[3],
                                        offset);

    ggml_set_name(tensor, name.c_str());

    n_created++;

    return tensor;
}

void llama_model_loader::done_getting_tensors() const {
    if (n_created != n_tensors) {
        throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
    }
}

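// When mmap is in use, maps each model file into memory (optionally mlock'ing the mapping)
// and records the mapping sizes; in all cases accumulates the total tensor data size used
// for progress reporting.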
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
    if (use_mmap) {
        mappings.reserve(files.size());
        mmaps_used.reserve(files.size());
        for (const auto & file : files) {
            bool is_numa = false;

            auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
            if (dev) {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
                if (is_numa_fn) {
                    is_numa = is_numa_fn();
                }
            }

            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
            mmaps_used.emplace_back(mapping->size(), 0);
            if (mlock_mmaps) {
                std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
                mlock_mmap->init(mapping->addr());
                mlock_mmaps->emplace_back(std::move(mlock_mmap));
            }
            mappings.emplace_back(std::move(mapping));
        }
    }

    // compute the total size of all tensors for progress reporting
    for (const auto & it : weights_map) {
        size_data += ggml_nbytes(it.second.tensor);
    }
}

void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
    GGML_ASSERT(!mappings.empty());
    const auto & mapping = mappings.at(idx);

    *first = mapping->size();
    *last  = 0;
    *addr  = mapping->addr();
    for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
        const auto * weight = get_weight(ggml_get_name(tensor));
        if (!weight || weight->idx != idx) {
            continue;
        }
        *first = std::min(*first, weight->offs);
        *last  = std::max(*last,  weight->offs + ggml_nbytes(tensor));
    }
}

void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
    const auto & w = require_weight(ggml_get_name(cur));

    if (use_mmap) {
        const auto & mapping = mappings.at(w.idx);
        if (cur->data == nullptr) {
            cur->data = (uint8_t *)mapping->addr() + w.offs;
        } else {
            memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur));
        }
    } else {
        GGML_ASSERT(cur->data != nullptr);
        GGML_ASSERT(w.idx < files.size());
        const auto & file = files.at(w.idx);
        file->seek(w.offs, SEEK_SET);
        file->read_raw(cur->data, ggml_nbytes(cur));
    }

    if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
    }
}

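// Loads the data of every tensor in `ctx`, choosing per tensor between reusing the mmap'ed
// pages, a direct read into a host buffer, a chunked async upload from pinned memory (when
// the backend supports it), or a buffered read followed by ggml_backend_tensor_set().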
bool llama_model_loader::load_all_data(
        struct ggml_context * ctx,
        llama_buf_map & bufs,
        llama_mlocks * lmlocks,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    GGML_ASSERT(size_data != 0 && "call init_mappings() first");

    std::vector<no_init<uint8_t>> read_buf;
    std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

    // 4 staging buffers for async uploads; 1 MiB each seems to be a good default for single NVMe drives.
    // NVMe RAID configurations might require more / larger buffers.
    constexpr size_t n_buffers = 4;
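    // The staging buffers are used round-robin; one event per buffer ensures the previous
    // async upload has completed before the host buffer is overwritten (see the chunked
    // upload loop below).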

    size_t alignment = 1;
    for (const auto & file : files) {
        alignment = std::max(file->read_alignment(), alignment);
    }

    // Buffer size: balance between memory usage and I/O efficiency
    // 64MB works well for NVMe drives
    const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;

    std::vector<ggml_backend_buffer_t> host_buffers;
    std::vector<ggml_backend_event_t> events;
    std::vector<void *> host_ptrs;
    size_t buffer_idx = 0; // buffer to use for async loads
    ggml_backend_t upload_backend = [&](const char * func) -> ggml_backend_t {
        if (use_mmap || check_tensors) {
            return nullptr;
        }
        // When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
        // First determine if the backend supports the necessary features for async uploads.
        auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
        if (!buf) {
            LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", func);
            return nullptr;
        }

        auto * buft = ggml_backend_buffer_get_type(buf);
        auto * dev = ggml_backend_buft_get_device(buft);
        if (!dev) {
            LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", func,
                ggml_backend_buft_name(buft));
            return nullptr;
        }

        if (buft != ggml_backend_dev_buffer_type(dev)) {
            LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", func,
                ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
            return nullptr;
        }

        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
            LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", func,
                ggml_backend_dev_name(dev));
            return nullptr;
        }

        auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
        if (!host_buft) {
            LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", func,
                ggml_backend_dev_name(dev));
            return nullptr;
        }

        // If the backend is supported, create pinned memory buffers and events for synchronisation.
        for (size_t idx = 0; idx < n_buffers; ++idx) {
            auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);

            if (!buf) {
                LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
                    ggml_backend_dev_name(dev));
                return nullptr;
            }

            host_buffers.emplace_back(buf);
            host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));

            auto * event = ggml_backend_event_new(dev);
            if (!event) {
                LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", func,
                    ggml_backend_dev_name(dev));
                return nullptr;
            }

            events.emplace_back(event);
        }

        ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
        if (!backend) {
            LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", func,
                ggml_backend_dev_name(dev));
            return nullptr;
        }

        return backend;
    }(__func__);

    if (upload_backend) {
        LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
            ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
            ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
            ggml_backend_name(upload_backend));
    }

    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
        const auto * weight = get_weight(ggml_get_name(cur));
        if (weight == nullptr) {
            // this can happen with split experts models
            continue;
        }

        if (progress_callback) {
            if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                return false;
            }
        }

        size_t n_size = ggml_nbytes(cur);

        if (use_mmap) {
            const auto & mapping = mappings.at(weight->idx);
            ggml_backend_buffer_t buf_mmap = nullptr;
            if (bufs.count(weight->idx)) {
                buf_mmap = bufs.at(weight->idx);
            }
            uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;

            if (check_tensors) {
                validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
                    return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                }));
            }

            GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
            if (buf_mmap && cur->data == nullptr) {
                ggml_backend_tensor_alloc(buf_mmap, cur, data);
                if (lmlocks) {
                    const auto & lmlock = lmlocks->at(weight->idx);
                    lmlock->grow_to(weight->offs + n_size);
                }

                auto & mmap_used = mmaps_used[weight->idx];
                mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
            } else {
                ggml_backend_tensor_set(cur, data, 0, n_size);
            }
        } else {
            const auto & file = files.at(weight->idx);

            if (ggml_backend_buffer_is_host(cur->buffer)) {
                file->seek(weight->offs, SEEK_SET);
                file->read_raw(cur->data, n_size);
                if (check_tensors) {
                    validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
                        return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
                    }));
                }
            } else {
                // If upload_backend is valid, load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
                if (upload_backend) {
                    size_t offset = weight->offs;
                    alignment = file->read_alignment();
                    size_t aligned_offset = offset & ~(alignment - 1);
                    size_t offset_from_alignment = offset - aligned_offset;
                    file->seek(aligned_offset, SEEK_SET);

                    // Calculate aligned read boundaries
                    size_t read_start = aligned_offset;
                    size_t read_end   = (offset + n_size + alignment - 1) & ~(alignment - 1);

                    size_t bytes_read = 0;
                    size_t data_read  = 0; // Actual tensor data copied (excluding padding)

                    while (bytes_read < read_end - read_start) {
                        size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

                        // Align the destination pointer within the pinned buffer
                        uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

                        // Wait for previous upload to complete before reusing buffer
                        ggml_backend_event_synchronize(events[buffer_idx]);

                        // Read aligned chunk from file
                        file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);

                        // Calculate actual data portion (excluding alignment padding)
                        uintptr_t ptr_data = ptr_dest_aligned;
                        size_t data_to_copy = read_size;

                        // Skip alignment padding at start of first chunk
                        if (bytes_read == 0) {
                            ptr_data     += offset_from_alignment;
                            data_to_copy -= offset_from_alignment;
                        }

                        // Trim alignment padding at end of last chunk
                        if (aligned_offset + bytes_read + read_size > offset + n_size) {
                            data_to_copy -= (read_end - (offset + n_size));
                        }

                        // Async upload actual data to GPU
                        ggml_backend_tensor_set_async(upload_backend, cur,
                            reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
                        ggml_backend_event_record(events[buffer_idx], upload_backend);

                        data_read  += data_to_copy;
                        bytes_read += read_size;

                        ++buffer_idx;
                        buffer_idx %= n_buffers;
                    }
                } else {
                    read_buf.resize(n_size);
                    file->seek(weight->offs, SEEK_SET);
                    file->read_raw(read_buf.data(), n_size);
                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
                    }
                }
            }
        }

        size_done += n_size;
    }

    // free temporary resources used for async uploads
    for (auto * event : events) {
        ggml_backend_event_synchronize(event);
        ggml_backend_event_free(event);
    }
    for (auto * buf : host_buffers) {
        ggml_backend_buffer_free(buf);
    }
    ggml_backend_free(upload_backend);

    // check validation results
    bool validation_failed = false;
    for (auto & future : validation_result) {
        auto result = future.get();
        if (!result.second) {
            LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
            validation_failed = true;
        }
    }
    if (validation_failed) {
        throw std::runtime_error("found tensors with invalid data");
    }

    // check if this is the last call and do final cleanup
    if (size_done >= size_data) {
        // unmap offloaded tensors and metadata
        if (use_mmap) {
            for (uint32_t idx = 0; idx < mappings.size(); idx++) {
                const auto & mmap_used = mmaps_used.at(idx);
                auto & mapping = mappings.at(idx);
                mapping->unmap_fragment(0, mmap_used.first);
                if (mmap_used.second != 0) {
                    mapping->unmap_fragment(mmap_used.second, mapping->size());
                }
            }
        }
        if (progress_callback) {
            // Even though the model is done loading, we still honor
            // cancellation since we need to free allocations.
            return progress_callback(1.0f, progress_callback_user_data);
        }
    }

    return true;
}

std::string llama_model_loader::ftype_name() const {
    return llama_model_ftype_name(ftype);
}

void llama_model_loader::print_info() const {
    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
    LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
    if (n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: file size   = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
    } else {
        LLAMA_LOG_INFO("%s: file size   = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
    }
}