1#import "ggml-metal-device.h"
2
3#import "ggml-impl.h"
4
5#include <Foundation/Foundation.h>
6
7#include <Metal/Metal.h>
8
9#include <stdatomic.h>
10
11#ifndef TARGET_OS_VISION
12#define TARGET_OS_VISION 0
13#endif
14
15// create residency sets only on macOS >= 15.0
16#if !TARGET_CPU_X86_64 && TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
17 TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \
18 TARGET_OS_TV && __TV_OS_VERSION_MAX_ALLOWED >= 180000 || \
19 TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
20#define GGML_METAL_HAS_RESIDENCY_SETS 1
21#endif
22
23// overload of MTLGPUFamilyMetalX (not available in some environments)
24static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
25static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
26
27#if !GGML_METAL_EMBED_LIBRARY
28// Here to assist with NSBundle Path Hack
29@interface GGMLMetalClass : NSObject
30@end
31@implementation GGMLMetalClass
32@end
33#endif
34
35//
36// MTLFunctionConstantValues wrapper
37//
38
39struct ggml_metal_cv {
40 MTLFunctionConstantValues * obj;
41};
42
43ggml_metal_cv_t ggml_metal_cv_init(void) {
44 ggml_metal_cv_t res = calloc(1, sizeof(struct ggml_metal_cv));
45
46 res->obj = [[MTLFunctionConstantValues alloc] init];
47
48 return res;
49}
50
51void ggml_metal_cv_free(ggml_metal_cv_t cv) {
52 [cv->obj release];
53 free(cv);
54}
55
56void ggml_metal_cv_set_int16(ggml_metal_cv_t cv, int16_t value, int32_t idx) {
57 [cv->obj setConstantValue:&value type:MTLDataTypeShort atIndex:idx];
58}
59
60void ggml_metal_cv_set_int32(ggml_metal_cv_t cv, int32_t value, int32_t idx) {
61 [cv->obj setConstantValue:&value type:MTLDataTypeInt atIndex:idx];
62}
63
64void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
65 [cv->obj setConstantValue:&value type:MTLDataTypeBool atIndex:idx];
66}
67
68//
69// MTLComputePipelineState wrapper
70//
71
72struct ggml_metal_pipeline {
73 id<MTLComputePipelineState> obj;
74};
75
76ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
77 ggml_metal_pipeline_t res = calloc(1, sizeof(struct ggml_metal_pipeline));
78
79 *res = (struct ggml_metal_pipeline) {
80 /*.obj =*/ nil,
81 };
82
83 return res;
84}
85
86void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
87 [pipeline->obj release];
88
89 free(pipeline);
90}
91
92int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline) {
93 return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
94}
95
96struct ggml_metal_library {
97 id<MTLLibrary> obj;
98 id<MTLDevice> device;
99
100 ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
101
102 NSLock * lock;
103};
104
105ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
106 id<MTLLibrary> library = nil;
107 id<MTLDevice> device = ggml_metal_device_get_obj(dev);
108
109 // load library
110 //
111 // - first check if the library is embedded
112 // - then check if the library is in the bundle
113 // - if not found, load the source and compile it
114 // - if that fails, return NULL
115 //
116 // TODO: move to a function
117 {
118 const int64_t t_start = ggml_time_us();
119
120 NSError * error = nil;
121 NSString * src = nil;
122
123#if GGML_METAL_EMBED_LIBRARY
124 GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
125
126 extern const char ggml_metallib_start[];
127 extern const char ggml_metallib_end[];
128
129 src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
130#else
131
132#ifdef SWIFT_PACKAGE
133 NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
134#else
135 NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
136#endif
137
138 NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
139 if (path_lib == nil) {
140 // Try to find the resource in the directory where the current binary located.
141 NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0];
142 NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent];
143
144 NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
145 if ([[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
146 GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]);
147
148 NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:path_lib_default error:&error];
149 if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
150 // Optionally, if this is a symlink, try to resolve it.
151 path_lib_default = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:path_lib_default error:&error];
152 if (path_lib_default && [path_lib_default length] > 0 && ![[path_lib_default substringToIndex:1] isEqualToString:@"/"]) {
153 // It is a relative path, adding the binary directory as directory prefix.
154 path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]];
155 }
156 if (!path_lib_default || ![[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
157 // Link to the resource could not be resolved.
158 path_lib_default = nil;
159 } else {
160 GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]);
161 }
162 }
163 } else {
164 // The resource couldn't be found in the binary's directory.
165 path_lib_default = nil;
166 }
167
168 path_lib = path_lib_default;
169 }
170
171 if (path_lib != nil) {
172 // pre-compiled library found
173 NSURL * libURL = [NSURL fileURLWithPath:path_lib];
174 GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
175
176 library = [device newLibraryWithURL:libURL error:&error];
177 if (error) {
178 GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
179 return nil;
180 }
181 } else {
182 GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
183
184 NSString * path_source;
185 NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
186
187 GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
188
189 if (path_resource) {
190 path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
191 } else {
192 path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
193 }
194
195 if (path_source == nil) {
196 GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
197 path_source = @"ggml-metal.metal";
198 }
199
200 GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
201
202 src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
203 if (error) {
204 GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
205 return nil;
206 }
207 }
208#endif
209
210 if (!library) {
211 @autoreleasepool {
212 // dictionary of preprocessor macros
213 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
214
215 if (ggml_metal_device_get_props(dev)->has_bfloat) {
216 [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
217 }
218
219 if (ggml_metal_device_get_props(dev)->has_tensor) {
220 [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
221 }
222
223#if GGML_METAL_EMBED_LIBRARY
224 [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
225#endif
226
227 MTLCompileOptions * options = [MTLCompileOptions new];
228 options.preprocessorMacros = prep;
229
230 //[options setFastMathEnabled:false];
231
232 library = [device newLibraryWithSource:src options:options error:&error];
233 if (error) {
234 GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
235 return nil;
236 }
237
238#if !__has_feature(objc_arc)
239 [options release];
240#endif
241 }
242 }
243
244#if GGML_METAL_EMBED_LIBRARY
245 [src release];
246#endif // GGML_METAL_EMBED_LIBRARY
247
248 GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
249 }
250
251 ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
252
253 res->obj = library;
254 res->device = device;
255 res->pipelines = ggml_metal_pipelines_init();
256 res->lock = [NSLock new];
257
258 return res;
259}
260
261ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
262 if (source == NULL) {
263 GGML_LOG_ERROR("%s: source is NULL\n", __func__);
264 return NULL;
265 }
266
267 id<MTLDevice> device = ggml_metal_device_get_obj(dev);
268 id<MTLLibrary> library = nil;
269 NSError * error = nil;
270
271 const int64_t t_start = ggml_time_us();
272
273 NSString * src = [[NSString alloc] initWithBytes:source
274 length:strlen(source)
275 encoding:NSUTF8StringEncoding];
276 if (!src) {
277 GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
278 return NULL;
279 }
280
281 @autoreleasepool {
282 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
283
284 MTLCompileOptions * options = [MTLCompileOptions new];
285 options.preprocessorMacros = prep;
286
287 library = [device newLibraryWithSource:src options:options error:&error];
288 if (error) {
289 if (verbose) {
290 GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
291 } else {
292 GGML_LOG_ERROR("%s: error compiling source\n", __func__);
293 }
294 library = nil;
295 }
296
297 [options release];
298 }
299
300 [src release];
301
302 if (!library) {
303 if (verbose) {
304 GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
305 }
306
307 return NULL;
308 }
309
310 if (verbose) {
311 GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
312 }
313
314 ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
315 if (!res) {
316 GGML_LOG_ERROR("%s: calloc failed\n", __func__);
317 return NULL;
318 }
319
320 res->obj = library;
321 res->device = device;
322 res->pipelines = ggml_metal_pipelines_init();
323 res->lock = [NSLock new];
324
325 return res;
326}
327
328void ggml_metal_library_free(ggml_metal_library_t lib) {
329 if (!lib) {
330 return;
331 }
332
333 if (lib->obj) {
334 [lib->obj release];
335 }
336
337 ggml_metal_pipelines_free(lib->pipelines);
338
339 [lib->lock release];
340
341 free(lib);
342}
343
344struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
345 [lib->lock lock];
346
347 struct ggml_metal_pipeline_with_params res = {
348 /*.pipeline =*/ nil,
349 /*.nsg =*/ 0,
350 /*.nr0 =*/ 0,
351 /*.nr1 =*/ 0,
352 /*.smem =*/ 0,
353 /*.c4 =*/ false,
354 /*.cnt =*/ false,
355 };
356
357 res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
358
359 [lib->lock unlock];
360
361 return res;
362}
363
364struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
365 struct ggml_metal_pipeline_with_params res = {
366 /*.pipeline =*/ nil,
367 /*.nsg =*/ 0,
368 /*.nr0 =*/ 0,
369 /*.nr1 =*/ 0,
370 /*.smem =*/ 0,
371 /*.c4 =*/ false,
372 /*.cnt =*/ false,
373 };
374
375 [lib->lock lock];
376
377 res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
378 if (res.pipeline) {
379 [lib->lock unlock];
380
381 return res;
382 }
383
384 @autoreleasepool {
385 NSError * error = nil;
386
387 NSString * base_func = [NSString stringWithUTF8String:base];
388
389 GGML_LOG_DEBUG("%s: compiling pipeline: base = '%s', name = '%s'\n", __func__, base, name);
390
391 id<MTLFunction> mtl_function;
392 if (!cv) {
393 mtl_function = [lib->obj newFunctionWithName:base_func];
394 } else {
395 mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
396 }
397 if (!mtl_function) {
398 [lib->lock unlock];
399
400 GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
401 if (error) {
402 GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
403 }
404
405 return res;
406 }
407
408 id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
409
410 [mtl_function release];
411
412 if (!obj) {
413 [lib->lock unlock];
414
415 GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
416 if (error) {
417 GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
418 }
419
420 return res;
421 }
422
423 GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
424 (void *) obj,
425 (int) obj.maxTotalThreadsPerThreadgroup,
426 (int) obj.threadExecutionWidth);
427
428 if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
429 [obj release];
430
431 [lib->lock unlock];
432
433 GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
434
435 return res;
436 }
437
438 res.pipeline = ggml_metal_pipeline_init();
439 res.pipeline->obj = obj;
440
441 ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
442 }
443
444 [lib->lock unlock];
445
446 return res;
447}
448
449//
450// MTLComputeCommandEncoder wrapper
451//
452
453struct ggml_metal_encoder {
454 id<MTLComputeCommandEncoder> obj;
455};
456
457ggml_metal_encoder_t ggml_metal_encoder_init(ggml_metal_cmd_buf_t cmd_buf_raw, bool concurrent) {
458 ggml_metal_encoder_t res = calloc(1, sizeof(struct ggml_metal_encoder));
459
460 id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
461
462 if (concurrent) {
463 res->obj = [cmd_buf computeCommandEncoderWithDispatchType: MTLDispatchTypeConcurrent];
464 } else {
465 res->obj = [cmd_buf computeCommandEncoder];
466 }
467
468 [res->obj retain];
469
470 return res;
471}
472
473void ggml_metal_encoder_free(ggml_metal_encoder_t encoder) {
474 [encoder->obj release];
475 free(encoder);
476}
477
478void ggml_metal_encoder_debug_group_push(ggml_metal_encoder_t encoder, const char * name) {
479 [encoder->obj pushDebugGroup:[NSString stringWithCString:name encoding:NSUTF8StringEncoding]];
480}
481
482void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder) {
483 [encoder->obj popDebugGroup];
484}
485
486void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline) {
487 [encoder->obj setComputePipelineState:pipeline.pipeline->obj];
488}
489
490void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
491 [encoder->obj setBytes:data length:size atIndex:idx];
492}
493
494void ggml_metal_encoder_set_buffer(ggml_metal_encoder_t encoder, struct ggml_metal_buffer_id buffer, int idx) {
495 [encoder->obj setBuffer:buffer.metal offset:buffer.offs atIndex:idx];
496}
497
498void ggml_metal_encoder_set_threadgroup_memory_size(ggml_metal_encoder_t encoder, size_t size, int idx) {
499 [encoder->obj setThreadgroupMemoryLength:size atIndex:idx];
500}
501
502void ggml_metal_encoder_dispatch_threadgroups(ggml_metal_encoder_t encoder, int tg0, int tg1, int tg2, int tptg0, int tptg1, int tptg2) {
503 [encoder->obj dispatchThreadgroups:MTLSizeMake(tg0, tg1, tg2) threadsPerThreadgroup:MTLSizeMake(tptg0, tptg1, tptg2)];
504}
505
506void ggml_metal_encoder_memory_barrier(ggml_metal_encoder_t encoder) {
507 [encoder->obj memoryBarrierWithScope:MTLBarrierScopeBuffers];
508}
509
510void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder) {
511 [encoder->obj endEncoding];
512}
513
514struct ggml_metal_device {
515 id<MTLDevice> mtl_device;
516
517 // a single global queue shared by all Metal backends
518 // technically not needed for devices with unified memory, but enables discrete GPUs support
519 // ref: https://github.com/ggml-org/llama.cpp/pull/15906
520 id<MTLCommandQueue> mtl_queue;
521
522 ggml_metal_rsets_t rsets;
523
524 ggml_metal_library_t library;
525
526 struct ggml_metal_device_props props;
527
528 // virtual address for GPU memory allocations
529 atomic_uintptr_t addr_virt;
530};
531
532//
533// MTLResidenceSet wrapper
534//
535
536struct ggml_metal_rsets {
537 NSLock * lock;
538
539 NSMutableArray * data;
540
541 // number of seconds since the last graph computation
542 // keep the residency sets wired for that amount of time to avoid being collected by the OS
543 int keep_alive_s;
544
545 // background heartbeat thread to keep the residency sets alive
546 atomic_bool d_stop;
547 atomic_int d_loop;
548
549 dispatch_group_t d_group;
550};
551
552ggml_metal_rsets_t ggml_metal_rsets_init(void) {
553 ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
554
555 res->lock = [[NSLock alloc] init];
556 res->data = [[NSMutableArray alloc] init];
557
558 // by default keep the memory wired for 3 minutes
559 res->keep_alive_s = 3*60;
560
561 const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
562 if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
563 res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
564 }
565
566 if (res->keep_alive_s <= 0) {
567 res->keep_alive_s = 3*60;
568 }
569
570 GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
571
572 atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
573 atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
574
575 res->d_group = dispatch_group_create();
576
577 // start a background thread that periodically requests residency for all the currently active sets in the collection
578 // the requests stop after a certain amount of time (keep_alive_s) of inactivity
579 dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
580 dispatch_group_async(res->d_group, d_queue, ^{
581#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
582 if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
583 while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
584 if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
585 [res->lock lock];
586
587 for (int i = 0; i < (int) res->data.count; ++i) {
588 [res->data[i] requestResidency];
589 }
590
591 atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
592
593 [res->lock unlock];
594 }
595
596 // half a second
597 usleep(500 * 1000);
598 }
599 }
600#endif
601 });
602
603 return res;
604}
605
606void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
607 if (rsets == NULL) {
608 return;
609 }
610
611 // note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
612 GGML_ASSERT([rsets->data count] == 0);
613
614 atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
615
616 dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
617 dispatch_release(rsets->d_group);
618
619 [rsets->data release];
620 [rsets->lock release];
621
622 free(rsets);
623}
624
625ggml_metal_device_t ggml_metal_device_init(int device) {
626 ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
627
628 assert(dev != NULL);
629
630 if (dev->mtl_device == nil) {
631 dev->mtl_device = MTLCreateSystemDefaultDevice();
632
633 if (dev->mtl_device) {
634 dev->mtl_queue = [dev->mtl_device newCommandQueue];
635 if (dev->mtl_queue == nil) {
636 GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
637 }
638
639 dev->addr_virt = 0x000000400ULL;
640
641 dev->props.device = device;
642 dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
643 dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
644
645 dev->props.has_simdgroup_mm = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
646 dev->props.has_unified_memory = dev->mtl_device.hasUnifiedMemory;
647
648 dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
649 dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
650 if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
651 dev->props.has_bfloat = false;
652 }
653
654 dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
655 if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
656 dev->props.has_tensor = false;
657 }
658
659 // note: disable the tensor API by default for old chips because with the current implementation it is not useful
660 // - M2 Ultra: ~5% slower
661 // - M4, M4 Max: no significant difference
662 //
663 // TODO: try to update the tensor API kernels to at least match the simdgroup performance
664 if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
665 ![[dev->mtl_device name] containsString:@"M5"] &&
666 ![[dev->mtl_device name] containsString:@"M6"] &&
667 ![[dev->mtl_device name] containsString:@"A19"] &&
668 ![[dev->mtl_device name] containsString:@"A20"]) {
669 GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
670 dev->props.has_tensor = false;
671 }
672
673 // double-check that the tensor API compiles
674 if (dev->props.has_tensor) {
675 const char * src_tensor_f16 = "\n"
676 "#include <metal_stdlib> \n"
677 "#include <metal_tensor> \n"
678 "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
679 " \n"
680 "using namespace metal; \n"
681 "using namespace mpp::tensor_ops; \n"
682 " \n"
683 "kernel void dummy_kernel( \n"
684 " tensor<device half, dextents<int32_t, 2>> A [[buffer(0)]], \n"
685 " tensor<device half, dextents<int32_t, 2>> B [[buffer(1)]], \n"
686 " device float * C [[buffer(2)]], \n"
687 " uint2 tgid [[threadgroup_position_in_grid]]) \n"
688 "{ \n"
689 " auto tA = A.slice(0, (int)tgid.y); \n"
690 " auto tB = B.slice((int)tgid.x, 0); \n"
691 " \n"
692 " matmul2d< \n"
693 " matmul2d_descriptor(8, 8, dynamic_extent), \n"
694 " execution_simdgroups<4>> mm; \n"
695 " \n"
696 " auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
697 " \n"
698 " auto sA = tA.slice(0, 0); \n"
699 " auto sB = tB.slice(0, 0); \n"
700 " mm.run(sB, sA, cT); \n"
701 " \n"
702 " auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
703 " \n"
704 " cT.store(tC); \n"
705 "}";
706
707 GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
708 ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
709 if (lib == NULL) {
710 GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
711 dev->props.has_tensor = false;
712 } else {
713 struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
714 if (!ppl.pipeline) {
715 GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
716 dev->props.has_tensor = false;
717 }
718
719 ggml_metal_library_free(lib);
720 }
721 }
722
723 // try to compile a dummy kernel to determine if the tensor API is supported for bfloat
724 if (dev->props.has_tensor && dev->props.has_bfloat) {
725 const char * src_tensor_bf16 = "\n"
726 "#include <metal_stdlib> \n"
727 "#include <metal_tensor> \n"
728 "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
729 " \n"
730 "using namespace metal; \n"
731 "using namespace mpp::tensor_ops; \n"
732 " \n"
733 "kernel void dummy_kernel( \n"
734 " tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
735 " tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
736 " device float * C [[buffer(2)]], \n"
737 " uint2 tgid [[threadgroup_position_in_grid]]) \n"
738 "{ \n"
739 " auto tA = A.slice(0, (int)tgid.y); \n"
740 " auto tB = B.slice((int)tgid.x, 0); \n"
741 " \n"
742 " matmul2d< \n"
743 " matmul2d_descriptor(8, 8, dynamic_extent), \n"
744 " execution_simdgroups<4>> mm; \n"
745 " \n"
746 " auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
747 " \n"
748 " auto sA = tA.slice(0, 0); \n"
749 " auto sB = tB.slice(0, 0); \n"
750 " mm.run(sB, sA, cT); \n"
751 " \n"
752 " auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
753 " \n"
754 " cT.store(tC); \n"
755 "}";
756
757 GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
758 ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
759 if (lib == NULL) {
760 GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
761 dev->props.has_bfloat = false;
762 } else {
763 struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
764 if (!ppl.pipeline) {
765 GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
766 dev->props.has_bfloat = false;
767 }
768
769 ggml_metal_library_free(lib);
770 }
771 }
772
773 dev->props.use_residency_sets = true;
774#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
775 dev->props.use_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil;
776#endif
777
778 dev->props.use_shared_buffers = dev->props.has_unified_memory;
779#if TARGET_OS_OSX
780 // In case of eGPU, shared memory may be preferable.
781 dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
782#endif
783 if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
784 dev->props.use_shared_buffers = false;
785 }
786 if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
787 dev->props.use_shared_buffers = true;
788 }
789
790 dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
791
792 dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
793
794 dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
795 dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
796 if (@available(macOS 10.12, iOS 16.0, *)) {
797 dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
798 } else {
799 dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
800 }
801
802 snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
803 snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);
804
805 dev->library = ggml_metal_library_init(dev);
806 if (!dev->library) {
807 GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
808 }
809
810 if (dev->props.use_residency_sets) {
811 dev->rsets = ggml_metal_rsets_init();
812 } else {
813 dev->rsets = nil;
814 }
815
816 // print MTL GPU family:
817 GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
818
819 // determine max supported GPU family
820 // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
821 // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
822 {
823 for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
824 if ([dev->mtl_device supportsFamily:i]) {
825 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
826 break;
827 }
828 }
829
830 for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
831 if ([dev->mtl_device supportsFamily:i]) {
832 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
833 break;
834 }
835 }
836
837 for (int i = MTLGPUFamilyMetal3_GGML + 5; i >= MTLGPUFamilyMetal3_GGML; --i) {
838 if ([dev->mtl_device supportsFamily:i]) {
839 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3_GGML + 3, i);
840 break;
841 }
842 }
843 }
844
845 GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, dev->props.has_simdgroup_reduction ? "true" : "false");
846 GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
847 GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
848 GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
849 GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
850 GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
851 GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
852
853#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
854 if (@available(macOS 10.12, iOS 16.0, *)) {
855 GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, dev->props.max_working_set_size / 1e6);
856 }
857#endif
858 }
859 }
860
861 return dev;
862}
863
864void ggml_metal_device_free(ggml_metal_device_t dev) {
865 assert(dev != NULL);
866
867 ggml_metal_rsets_free(dev->rsets);
868
869 ggml_metal_library_free(dev->library);
870 dev->library = NULL;
871
872 if (dev->mtl_queue) {
873 [dev->mtl_queue release];
874 dev->mtl_queue = nil;
875 }
876
877 if (dev->mtl_device) {
878 [dev->mtl_device release];
879 dev->mtl_device = nil;
880 }
881
882 free(dev);
883}
884
885void * ggml_metal_device_get_obj(ggml_metal_device_t dev) {
886 return dev->mtl_device;
887}
888
889void * ggml_metal_device_get_queue(ggml_metal_device_t dev) {
890 return dev->mtl_queue;
891}
892
893ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
894 return dev->library;
895}
896
897void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
898 if (rset == nil) {
899 return;
900 }
901
902 GGML_ASSERT(dev->rsets);
903
904 [dev->rsets->lock lock];
905
906 [dev->rsets->data addObject:rset];
907
908 [dev->rsets->lock unlock];
909}
910
911void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
912 if (rset == nil) {
913 return;
914 }
915
916 GGML_ASSERT(dev->rsets);
917
918 [dev->rsets->lock lock];
919
920 [dev->rsets->data removeObject:rset];
921
922 [dev->rsets->lock unlock];
923}
924
925void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
926 if (dev->rsets == NULL) {
927 return;
928 }
929
930 atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
931}
932
933struct ggml_metal_event {
934 void * obj; // id<MTLEvent>
935
936 atomic_int value;
937};
938
939void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
940 id<MTLEvent> event = (id<MTLEvent>)ev->obj;
941
942 id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
943
944 [cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
945}
946
947void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
948 id<MTLEvent> event = (id<MTLEvent>)ev->obj;
949
950 id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
951
952 [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
953}
954
955ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
956 id<MTLEvent> event = [dev->mtl_device newEvent];
957
958 ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
959
960 ev->obj = (__bridge void *)event;
961 ev->value = 0;
962
963 return ev;
964}
965
966void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
967 id<MTLEvent> event = ev->obj;
968 [event release];
969
970 free(ev);
971
972 GGML_UNUSED(dev);
973}
974
// block the host until all GPU work signaled on the event so far has completed
// implementation: submit an empty command buffer that waits for the current
// event value, then wait for that command buffer to complete
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
    @autoreleasepool {
        id<MTLEvent> event = ev->obj;

        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
        [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
985
// query estimates of free and total device memory
// total: the recommended max working set size reported by Metal
// free : total minus the currently allocated size, clamped to 0
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
    if (@available(macOS 10.12, iOS 16.0, *)) {
        *total = dev->mtl_device.recommendedMaxWorkingSetSize;

        // the allocated size can exceed the recommended working set size (this file
        // explicitly warns about that case in ggml_metal_log_allocated_size), so
        // clamp instead of letting the size_t subtraction underflow to a huge value
        const size_t used = dev->mtl_device.currentAllocatedSize;

        *free = used < *total ? *total - used : 0;
    } else {
        *free  = 0;
        *total = 0;
    }
}
995
// check whether the Metal backend can execute the given op on this device
//
// the decision is based on the device capabilities captured in dev->props
// (simdgroup matrix multiplication, simdgroup reductions, bfloat support)
// and on per-op constraints on tensor types and memory layout below;
// ops not explicitly listed are rejected
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op) {
    const bool has_simdgroup_mm = dev->props.has_simdgroup_mm;
    const bool has_simdgroup_reduction = dev->props.has_simdgroup_reduction;
    const bool has_bfloat = dev->props.has_bfloat;

    // without HW bfloat support, reject any op that touches BF16 data
    // (checks the destination and the first 3 sources)
    if (!has_bfloat) {
        if (op->type == GGML_TYPE_BF16) {
            return false;
        }

        for (size_t i = 0, n = 3; i < n; ++i) {
            if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
                return false;
            }
        }
    }

    // per-op constraints
    switch (op->op) {
        case GGML_OP_SCALE:
        case GGML_OP_FILL:
        case GGML_OP_CLAMP:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_LOG:
            return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
                case GGML_UNARY_OP_SOFTPLUS:
                case GGML_UNARY_OP_EXPM1:
                    return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
                default:
                    return false;
            }
        case GGML_OP_GLU:
            switch (ggml_get_glu_op(op)) {
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
                case GGML_GLU_OP_SWIGLU_OAI:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                default:
                    return false;
            }
        // ops that are pure metadata changes (or always implemented) are unconditionally supported
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
        case GGML_OP_CONCAT:
            return true;
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_ADD_ID:
            return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ACC:
        case GGML_OP_REPEAT:
        case GGML_OP_CONV_TRANSPOSE_1D:
            return true;
        case GGML_OP_CONV_TRANSPOSE_2D:
            return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) &&
                (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) &&
                op->src[1]->type == GGML_TYPE_F32 &&
                op->type == GGML_TYPE_F32;
        case GGML_OP_SUM:
            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
        case GGML_OP_TRI:
            return ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_SUM_ROWS:
        case GGML_OP_CUMSUM:
        case GGML_OP_MEAN:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_GROUP_NORM:
        case GGML_OP_L2_NORM:
            return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_COUNT_EQUAL:
            return has_simdgroup_reduction &&
                op->src[0]->type == GGML_TYPE_I32 &&
                op->src[1]->type == GGML_TYPE_I32 &&
                op->type == GGML_TYPE_I64;
        case GGML_OP_ARGMAX:
            return has_simdgroup_reduction;
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
            return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_IM2COL:
            return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
        case GGML_OP_CONV_2D:
            return ggml_is_contiguous(op->src[0]) &&
                   op->src[1]->type == GGML_TYPE_F32 &&
                   op->type == GGML_TYPE_F32 &&
                   (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
        case GGML_OP_UPSCALE:
            // NOTE(review): if the antialias flag shares op_params[0] with the mode, the equality
            //               test already excludes it, making the flag test redundant - confirm encoding
            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
        case GGML_OP_POOL_1D:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_POOL_2D:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_PAD:
            // TODO: add circular padding support for metal, see https://github.com/ggml-org/llama.cpp/pull/16985
            if (ggml_get_op_params_i32(op, 8) != 0) {
                return false;
            }

            // only "right-side" padding is supported (left pads must all be 0)
            return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
                   (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ARGSORT:
        case GGML_OP_TOP_K:
        case GGML_OP_ARANGE:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
            // for new head sizes, add checks here
            if (op->src[0]->ne[0] != 32 &&
                op->src[0]->ne[0] != 40 &&
                op->src[0]->ne[0] != 48 &&
                op->src[0]->ne[0] != 64 &&
                op->src[0]->ne[0] != 72 &&
                op->src[0]->ne[0] != 80 &&
                op->src[0]->ne[0] != 96 &&
                op->src[0]->ne[0] != 112 &&
                op->src[0]->ne[0] != 128 &&
                op->src[0]->ne[0] != 192 &&
                op->src[0]->ne[0] != 256 &&
                op->src[0]->ne[0] != 576) {
                return false;
            }
            // K and V must share a type
            if (op->src[1]->type != op->src[2]->type) {
                return false;
            }
            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
        case GGML_OP_SSM_CONV:
        case GGML_OP_SSM_SCAN:
            return has_simdgroup_reduction;
        case GGML_OP_RWKV_WKV6:
        case GGML_OP_RWKV_WKV7:
            return true;
        case GGML_OP_SOLVE_TRI:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            return has_simdgroup_reduction;
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
            {
                // supported src -> dst type conversions
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                            case GGML_TYPE_BF16:
                            case GGML_TYPE_Q8_0:
                            case GGML_TYPE_Q4_0:
                            case GGML_TYPE_Q4_1:
                            case GGML_TYPE_Q5_0:
                            case GGML_TYPE_Q5_1:
                            case GGML_TYPE_IQ4_NL:
                            case GGML_TYPE_I32:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_F16:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_BF16:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_BF16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_I32:
                        return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32;
                    default:
                        return false;
                };
            }
        case GGML_OP_GET_ROWS:
            return true;
        case GGML_OP_SET_ROWS:
            {
                if (op->src[0]->type != GGML_TYPE_F32) {
                    return false;
                }

                switch (op->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_F16:
                    case GGML_TYPE_BF16:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_IQ4_NL:
                        return true;
                    default:
                        return false;
                };
            }
        case GGML_OP_DIAG:
            return true;
        case GGML_OP_OPT_STEP_ADAMW:
        case GGML_OP_OPT_STEP_SGD:
            return has_simdgroup_reduction;
        default:
            return false;
    }
}
1249
// accessor for the device's cached capability/property struct
const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_device_t dev) {
    const struct ggml_metal_device_props * props = &dev->props;

    return props;
}
1253
1254//
1255// device buffers
1256//
1257
1258// max memory buffers that can be mapped to the device
1259#define GGML_METAL_MAX_BUFFERS 64
1260
// a single host-memory region together with the Metal buffer that wraps it
struct ggml_metal_buffer_wrapper {
    void * data; // host address of the region
    size_t size; // size of the region in bytes

    id<MTLBuffer> metal; // Metal buffer backing the region
};
1267
// a backend buffer, possibly composed of multiple Metal buffer views
struct ggml_metal_buffer {
    void * all_data; // base address of the whole buffer (host or virtual)
    size_t all_size; // total size in bytes

    // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
    bool is_shared;
    // if true, all_data is owned by this object and is deallocated on free
    bool owned;

    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
    int n_buffers;
    struct ggml_metal_buffer_wrapper buffers[GGML_METAL_MAX_BUFFERS];

    bool use_residency_sets;

    // optional MTLResidencySet
    // note: cannot use explicitly "id<MTLResidencySet>" here because it is not available on certain OSes
    id rset;

    // pointers to global device
    ggml_metal_device_t dev;
};
1289
// log a buffer allocation together with the device's current memory pressure
// compiled out when GGML_METAL_NDEBUG is defined or on unsupported platforms
static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
    if (@available(macOS 10.12, iOS 16.0, *)) {
        GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
                __func__,
                size_aligned / 1024.0 / 1024.0,
                device.currentAllocatedSize / 1024.0 / 1024.0,
                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

        // allocating beyond the recommended working set risks paging/perf cliffs
        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
            GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
        }
    } else {
        GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
                __func__,
                size_aligned / 1024.0 / 1024.0,
                device.currentAllocatedSize / 1024.0 / 1024.0);
    }
#endif
#endif
    GGML_UNUSED(device);
    GGML_UNUSED(size_aligned);
}
1314
1315// rset init
// rset init
//
// create a residency set containing all of the buffer's Metal buffers and request residency
// returns true on success, or when residency sets are disabled or unavailable on this OS
static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
    buf->rset = nil;

    if (!buf->use_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_metal";
        desc.initialCapacity = buf->n_buffers;

        // note: the error out-param is only guaranteed to be written on failure, so it
        //       must be initialized to nil and success must be judged by the returned
        //       object, not by the error (Cocoa error-handling convention)
        NSError * error = nil;
        buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];

        [desc release];

        if (buf->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
            return false;
        }

        for (int i = 0; i < buf->n_buffers; i++) {
            [buf->rset addAllocation:buf->buffers[i].metal];
        }

        [buf->rset commit];
        [buf->rset requestResidency];

        return true;
    }
#endif

    return true;
}
1352
1353// rset free
// rset free
//
// end residency and release the buffer's residency set, if one was created
// safe to call when buf->rset is nil
static void ggml_metal_buffer_rset_free(ggml_metal_buffer_t buf) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
        if (buf->rset) {
            [buf->rset endResidency];
            [buf->rset removeAllAllocations];
            [buf->rset release];
        }
    }
#else
    GGML_UNUSED(buf);
#endif
}
1367
// allocate page-aligned host memory suitable for no-copy Metal buffer creation
// on macOS this uses vm_allocate; elsewhere posix_memalign with the page size
// returns NULL on failure
static void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;

#if TARGET_OS_OSX
    // note: vm_allocate writes the resulting address into `data`
    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
    if (err != KERN_SUCCESS) {
        GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
        return NULL;
    }
#else
    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
        GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }
#endif

    return data;
}
1387
// allocate a new device buffer of the given size
// when `shared` is true and the device supports it, the memory is host-visible
// (allocated with ggml_metal_host_malloc and wrapped no-copy); otherwise the
// data lives in private GPU memory and only a virtual address is tracked host-side
// returns NULL on failure (no resources are leaked on the error paths)
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
    ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
    if (res == NULL) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer struct\n", __func__);
        return NULL;
    }

    res->dev = dev;

    // Metal no-copy buffers require page-aligned sizes
    const size_t size_page = sysconf(_SC_PAGESIZE);

    size_t size_aligned = size;
    if ((size_aligned % size_page) != 0) {
        size_aligned += (size_page - (size_aligned % size_page));
    }

    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

    shared = shared && props_dev->use_shared_buffers;

    // allocate shared buffer if the device supports it and it is required by the buffer type
    if (shared) {
        res->all_data  = ggml_metal_host_malloc(size_aligned);
        res->is_shared = true;
    } else {
        // use virtual address
        res->all_data  = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
        res->is_shared = false;
    }
    res->all_size = size_aligned;

    res->owned = true;

    res->n_buffers = 1;

    if (res->all_data != NULL) {
        res->buffers[0].size  = size;
        res->buffers[0].metal = nil;

        if (size_aligned > 0) {
            // `shared` already incorporates props_dev->use_shared_buffers (masked above)
            if (shared) {
                res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
                                                length:size_aligned
                                                options:MTLResourceStorageModeShared
                                                deallocator:nil];
            } else {
                res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
            }
        }

        res->buffers[0].data = res->all_data;
    }

    if (size_aligned > 0 && (res->all_data == NULL || res->buffers[0].metal == nil)) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
        // do not leak the host allocation when only the Metal buffer creation failed
        if (res->is_shared && res->all_data != NULL) {
#if TARGET_OS_OSX
            vm_deallocate((vm_map_t) mach_task_self(), (vm_address_t) res->all_data, res->all_size);
#else
            free(res->all_data);
#endif
        }
        free(res);
        return NULL;
    }

    res->use_residency_sets = props_dev->use_residency_sets;

    if (!ggml_metal_buffer_rset_init(res)) {
        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
        // release everything acquired so far ([nil release] is a safe no-op)
        [res->buffers[0].metal release];
        if (res->is_shared && res->all_data != NULL) {
#if TARGET_OS_OSX
            vm_deallocate((vm_map_t) mach_task_self(), (vm_address_t) res->all_data, res->all_size);
#else
            free(res->all_data);
#endif
        }
        free(res);
        return NULL;
    }

    ggml_metal_device_rsets_add(dev, res->rset);

    //ggml_metal_log_allocated_size(device, size_aligned);

    return res;
}
1457
// wrap an existing host pointer as a device buffer (zero-copy, not owned)
// mappings larger than the device's max buffer size are split into multiple
// overlapping views so that any tensor up to max_tensor_size fits entirely
// in at least one view
// returns NULL on failure (previously created views are released on the error paths)
ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
    if (res == NULL) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer struct\n", __func__);
        return NULL;
    }

    res->dev = dev;

    res->all_data = ptr;
    res->all_size = size;

    res->is_shared = true;
    res->owned     = false;

    res->n_buffers = 0;

    const size_t size_page = sysconf(_SC_PAGESIZE);

    // page-align the data ptr (required by newBufferWithBytesNoCopy)
    {
        const uintptr_t offs = (uintptr_t) ptr % size_page;
        ptr   = (void *) ((char *) ptr - offs);
        size += offs;
    }

    size_t size_aligned = size;
    if ((size_aligned % size_page) != 0) {
        size_aligned += (size_page - (size_aligned % size_page));
    }

    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

    // the buffer fits into the max buffer size allowed by the device
    if (size_aligned <= props_dev->max_buffer_size) {
        res->buffers[res->n_buffers].data  = ptr;
        res->buffers[res->n_buffers].size  = size;
        res->buffers[res->n_buffers].metal = nil;

        if (size_aligned > 0) {
            res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

            if (res->buffers[res->n_buffers].metal == nil) {
                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
                free(res);
                return NULL;
            }
        }

        ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);

        ++res->n_buffers;
    } else {
        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
        // one of the views
        const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
        const size_t size_step = props_dev->max_buffer_size - size_ovlp;
        const size_t size_view = props_dev->max_buffer_size;

        for (size_t i = 0; i < size; i += size_step) {
            // guard against overflowing the fixed-size views array
            if (res->n_buffers >= GGML_METAL_MAX_BUFFERS) {
                GGML_LOG_ERROR("%s: error: exceeded the maximum number of buffer views (%d)\n", __func__, GGML_METAL_MAX_BUFFERS);
                for (int j = 0; j < res->n_buffers; ++j) {
                    [res->buffers[j].metal release];
                }
                free(res);
                return NULL;
            }

            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);

            res->buffers[res->n_buffers].data  = (void *) ((uint8_t *) ptr + i);
            res->buffers[res->n_buffers].size  = size_step_aligned;
            res->buffers[res->n_buffers].metal = nil;

            if (size_step_aligned > 0) {
                res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

                if (res->buffers[res->n_buffers].metal == nil) {
                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
                    // release the views created so far before bailing out
                    for (int j = 0; j < res->n_buffers; ++j) {
                        [res->buffers[j].metal release];
                    }
                    free(res);
                    return NULL;
                }
            }

            ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);

            if (i + size_step < size) {
                GGML_LOG_INFO("\n");
            }

            ++res->n_buffers;
        }
    }

    res->use_residency_sets = props_dev->use_residency_sets;

    if (!ggml_metal_buffer_rset_init(res)) {
        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
        for (int j = 0; j < res->n_buffers; ++j) {
            [res->buffers[j].metal release];
        }
        free(res);
        return NULL;
    }

    ggml_metal_device_rsets_add(dev, res->rset);

    return res;
}
1552
// free a device buffer: detach it from the device's residency tracking,
// release all Metal buffer views, tear down the residency set, and finally
// deallocate the host memory if this buffer owns it
void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
    // detach from the device before releasing anything it may still reference
    ggml_metal_device_rsets_rm(buf->dev, buf->rset);

    for (int i = 0; i < buf->n_buffers; i++) {
        [buf->buffers[i].metal release];
    }

    ggml_metal_buffer_rset_free(buf);

    // only shared+owned buffers carry a host allocation (see ggml_metal_buffer_init)
    if (buf->is_shared && buf->owned) {
#if TARGET_OS_OSX
        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)buf->all_data, buf->all_size);
#else
        free(buf->all_data);
#endif
    }

    free(buf);
}
1572
// base address of the buffer (host pointer for shared buffers, virtual otherwise)
void * ggml_metal_buffer_get_base(ggml_metal_buffer_t buf) {
    void * base = buf->all_data;

    return base;
}
1576
// whether the buffer's memory is host-visible (shared) or private to the GPU
bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {
    const bool shared = buf->is_shared;

    return shared;
}
1580
// fill `size` bytes of the tensor, starting at `offset`, with `value`
// shared buffers are filled directly with memset; private buffers are filled
// on the GPU via a blit encoder and the call blocks until the fill completes
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    if (buf->is_shared) {
        memset((char *) tensor->data + offset, value, size);
        return;
    }

    @autoreleasepool {
        // dst
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
        bid_dst.offs += offset;

        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            // NSMakeRange takes (location, length) - the second argument is the number
            // of bytes to fill, not the end offset
            [encoder fillBuffer:bid_dst.metal
                          range:NSMakeRange(bid_dst.offs, size)
                          value:value];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
1608
// upload `size` bytes from host memory `data` into the tensor at `offset`
// shared buffers are written directly with memcpy; private buffers are written
// by wrapping the source in a temporary no-copy Metal buffer and blitting on
// the GPU, blocking until the copy completes
void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (buf->is_shared) {
        memcpy((char *) tensor->data + offset, data, size);
        return;
    }

    @autoreleasepool {
        // src
        void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
        id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
                                                                        length:size
                                                                       options:MTLResourceStorageModeShared
                                                                   deallocator:nil];

        GGML_ASSERT(buf_src);

        // dst
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
        bid_dst.offs += offset;

        // note: for experimentation purposes, here we use a semaphore to wait for the copy to complete
        //       this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
        dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);

        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:buf_src
                       sourceOffset:0
                           toBuffer:bid_dst.metal
                  destinationOffset:bid_dst.offs
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf addCompletedHandler:^(id<MTLCommandBuffer> cb) {
            // TODO: can check for errors here
            GGML_UNUSED(cb);

            dispatch_semaphore_signal(completion_semaphore);
        }];

        [cmd_buf commit];

        dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER);
        dispatch_release(completion_semaphore);

        // newBufferWithBytesNoCopy returns a +1 reference (MRC) - release it now
        // that the copy has completed, otherwise every upload leaks a MTLBuffer
        [buf_src release];

        //[cmd_buf waitUntilCompleted];
    }
}
1662
// download `size` bytes of the tensor, starting at `offset`, into host memory `data`
// shared buffers are read directly with memcpy; private buffers are read by
// blitting into a temporary no-copy Metal buffer wrapping the destination,
// blocking until the copy completes
void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    if (buf->is_shared) {
        memcpy(data, (const char *) tensor->data + offset, size);
        return;
    }

    @autoreleasepool {
        // src
        struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf, tensor);
        bid_src.offs += offset;

        // dst
        id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
                                                                        length:size
                                                                       options:MTLResourceStorageModeShared
                                                                   deallocator:nil];

        GGML_ASSERT(buf_dst);

        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:bid_src.metal
                       sourceOffset:bid_src.offs
                           toBuffer:buf_dst
                  destinationOffset:0
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];

        // newBufferWithBytesNoCopy returns a +1 reference (MRC) - release it now
        // that the copy has completed, otherwise every download leaks a MTLBuffer
        [buf_dst release];
    }
}
1700
// fill the entire buffer with a byte value
// shared buffers are cleared with memset; private buffers are cleared on the
// GPU via a blit fill and the call blocks until it completes
// note: only buffers[0] is filled here - private buffers are never split into
//       multiple views (splitting only happens in ggml_metal_buffer_map, which
//       always produces shared buffers)
void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
    if (buf->is_shared) {
        memset(buf->all_data, value, buf->all_size);
        return;
    }

    @autoreleasepool {
        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder fillBuffer:buf->buffers[0].metal
                          range:NSMakeRange(0, buf->buffers[0].size)
                          value:value];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
1724
// resolve the Metal buffer view (and byte offset within it) that fully
// contains tensor t; returns { nil, 0 } and logs an error if no view does
struct ggml_metal_buffer_id ggml_metal_buffer_get_id(ggml_metal_buffer_t buf, const struct ggml_tensor * t) {
    struct ggml_metal_buffer_id res = { nil, 0 };

    const int64_t tsize = ggml_nbytes(t);

    // find the view that contains the tensor fully
    for (int i = 0; i < buf->n_buffers; ++i) {
        const struct ggml_metal_buffer_wrapper * view = &buf->buffers[i];

        const int64_t ioffs = (int64_t) t->data - (int64_t) view->data;

        if (ioffs < 0 || ioffs + tsize > (int64_t) view->size) {
            continue;
        }

        res.metal = view->metal;
        res.offs  = (size_t) ioffs;

        return res;
    }

    GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);

    return res;
}