1#import "ggml-metal-device.h"
2
3#import "ggml-impl.h"
4
5#include <Foundation/Foundation.h>
6
7#include <Metal/Metal.h>
8
9#include <stdatomic.h>
10
11#ifndef TARGET_OS_VISION
12#define TARGET_OS_VISION 0
13#endif
14
15// create residency sets only on macOS >= 15.0
16#if !TARGET_CPU_X86_64 && TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 || \
17 TARGET_OS_IOS && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000 || \
18 TARGET_OS_TV && __TV_OS_VERSION_MAX_ALLOWED >= 180000 || \
19 TARGET_OS_VISION && __VISION_OS_VERSION_MAX_ALLOWED >= 200000
20#define GGML_METAL_HAS_RESIDENCY_SETS 1
21#endif
22
23// overload of MTLGPUFamilyMetalX (not available in some environments)
24static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
25static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
26
27#if !GGML_METAL_EMBED_LIBRARY
28// Here to assist with NSBundle Path Hack
29@interface GGMLMetalClass : NSObject
30@end
31@implementation GGMLMetalClass
32@end
33#endif
34
35//
36// MTLFunctionConstantValues wrapper
37//
38
39struct ggml_metal_cv {
40 MTLFunctionConstantValues * obj;
41};
42
43ggml_metal_cv_t ggml_metal_cv_init(void) {
44 ggml_metal_cv_t res = calloc(1, sizeof(struct ggml_metal_cv));
45
46 res->obj = [[MTLFunctionConstantValues alloc] init];
47
48 return res;
49}
50
51void ggml_metal_cv_free(ggml_metal_cv_t cv) {
52 [cv->obj release];
53 free(cv);
54}
55
56void ggml_metal_cv_set_int16(ggml_metal_cv_t cv, int16_t value, int32_t idx) {
57 [cv->obj setConstantValue:&value type:MTLDataTypeShort atIndex:idx];
58}
59
60void ggml_metal_cv_set_int32(ggml_metal_cv_t cv, int32_t value, int32_t idx) {
61 [cv->obj setConstantValue:&value type:MTLDataTypeInt atIndex:idx];
62}
63
64void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
65 [cv->obj setConstantValue:&value type:MTLDataTypeBool atIndex:idx];
66}
67
68//
69// MTLComputePipelineState wrapper
70//
71
72struct ggml_metal_pipeline {
73 id<MTLComputePipelineState> obj;
74};
75
76ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
77 ggml_metal_pipeline_t res = calloc(1, sizeof(struct ggml_metal_pipeline));
78
79 *res = (struct ggml_metal_pipeline) {
80 /*.obj =*/ nil,
81 };
82
83 return res;
84}
85
86void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
87 [pipeline->obj release];
88
89 free(pipeline);
90}
91
92int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline) {
93 return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
94}
95
96struct ggml_metal_library {
97 id<MTLLibrary> obj;
98 id<MTLDevice> device;
99
100 ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
101
102 NSLock * lock;
103};
104
105ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
106 id<MTLLibrary> library = nil;
107 id<MTLDevice> device = ggml_metal_device_get_obj(dev);
108
109 // load library
110 //
111 // - first check if the library is embedded
112 // - then check if the library is in the bundle
113 // - if not found, load the source and compile it
114 // - if that fails, return NULL
115 //
116 // TODO: move to a function
117 {
118 const int64_t t_start = ggml_time_us();
119
120 NSError * error = nil;
121 NSString * src = nil;
122
123#if GGML_METAL_EMBED_LIBRARY
124 GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
125
126 extern const char ggml_metallib_start[];
127 extern const char ggml_metallib_end[];
128
129 src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
130#else
131
132#ifdef SWIFT_PACKAGE
133 NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
134#else
135 NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
136#endif
137
138 NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
139 if (path_lib == nil) {
140 // Try to find the resource in the directory where the current binary located.
141 NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0];
142 NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent];
143
144 NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
145 if ([[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
146 GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]);
147
148 NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:path_lib_default error:&error];
149 if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
150 // Optionally, if this is a symlink, try to resolve it.
151 path_lib_default = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:path_lib_default error:&error];
152 if (path_lib_default && [path_lib_default length] > 0 && ![[path_lib_default substringToIndex:1] isEqualToString:@"/"]) {
153 // It is a relative path, adding the binary directory as directory prefix.
154 path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]];
155 }
156 if (!path_lib_default || ![[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
157 // Link to the resource could not be resolved.
158 path_lib_default = nil;
159 } else {
160 GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]);
161 }
162 }
163 } else {
164 // The resource couldn't be found in the binary's directory.
165 path_lib_default = nil;
166 }
167
168 path_lib = path_lib_default;
169 }
170
171 if (path_lib != nil) {
172 // pre-compiled library found
173 NSURL * libURL = [NSURL fileURLWithPath:path_lib];
174 GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
175
176 library = [device newLibraryWithURL:libURL error:&error];
177 if (error) {
178 GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
179 return nil;
180 }
181 } else {
182 GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
183
184 NSString * path_source;
185 NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
186
187 GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
188
189 if (path_resource) {
190 path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
191 } else {
192 path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
193 }
194
195 if (path_source == nil) {
196 GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
197 path_source = @"ggml-metal.metal";
198 }
199
200 GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
201
202 src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
203 if (error) {
204 GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
205 return nil;
206 }
207 }
208#endif
209
210 if (!library) {
211 @autoreleasepool {
212 // dictionary of preprocessor macros
213 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
214
215 if (ggml_metal_device_get_props(dev)->has_bfloat) {
216 [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
217 }
218
219 if (ggml_metal_device_get_props(dev)->has_tensor) {
220 [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
221 }
222
223#if GGML_METAL_EMBED_LIBRARY
224 [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
225#endif
226
227 MTLCompileOptions * options = [MTLCompileOptions new];
228 options.preprocessorMacros = prep;
229
230 //[options setFastMathEnabled:false];
231
232 library = [device newLibraryWithSource:src options:options error:&error];
233 if (error) {
234 GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
235 return nil;
236 }
237
238#if !__has_feature(objc_arc)
239 [options release];
240#endif
241 }
242 }
243
244#if GGML_METAL_EMBED_LIBRARY
245 [src release];
246#endif // GGML_METAL_EMBED_LIBRARY
247
248 GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
249 }
250
251 ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
252
253 res->obj = library;
254 res->device = device;
255 res->pipelines = ggml_metal_pipelines_init();
256 res->lock = [NSLock new];
257
258 return res;
259}
260
261ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
262 if (source == NULL) {
263 GGML_LOG_ERROR("%s: source is NULL\n", __func__);
264 return NULL;
265 }
266
267 id<MTLDevice> device = ggml_metal_device_get_obj(dev);
268 id<MTLLibrary> library = nil;
269 NSError * error = nil;
270
271 const int64_t t_start = ggml_time_us();
272
273 NSString * src = [[NSString alloc] initWithBytes:source
274 length:strlen(source)
275 encoding:NSUTF8StringEncoding];
276 if (!src) {
277 GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
278 return NULL;
279 }
280
281 @autoreleasepool {
282 NSMutableDictionary * prep = [NSMutableDictionary dictionary];
283
284 MTLCompileOptions * options = [MTLCompileOptions new];
285 options.preprocessorMacros = prep;
286
287 library = [device newLibraryWithSource:src options:options error:&error];
288 if (error) {
289 if (verbose) {
290 GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
291 } else {
292 GGML_LOG_ERROR("%s: error compiling source\n", __func__);
293 }
294 library = nil;
295 }
296
297 [options release];
298 }
299
300 [src release];
301
302 if (!library) {
303 if (verbose) {
304 GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
305 }
306
307 return NULL;
308 }
309
310 if (verbose) {
311 GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
312 }
313
314 ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
315 if (!res) {
316 GGML_LOG_ERROR("%s: calloc failed\n", __func__);
317 return NULL;
318 }
319
320 res->obj = library;
321 res->device = device;
322 res->pipelines = ggml_metal_pipelines_init();
323 res->lock = [NSLock new];
324
325 return res;
326}
327
328void ggml_metal_library_free(ggml_metal_library_t lib) {
329 if (!lib) {
330 return;
331 }
332
333 if (lib->obj) {
334 [lib->obj release];
335 }
336
337 ggml_metal_pipelines_free(lib->pipelines);
338
339 [lib->lock release];
340
341 free(lib);
342}
343
344struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
345 [lib->lock lock];
346
347 struct ggml_metal_pipeline_with_params res = {
348 /*.pipeline =*/ nil,
349 /*.nsg =*/ 0,
350 /*.nr0 =*/ 0,
351 /*.nr1 =*/ 0,
352 /*.smem =*/ 0,
353 /*.c4 =*/ false,
354 /*.cnt =*/ false,
355 };
356
357 res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
358
359 [lib->lock unlock];
360
361 return res;
362}
363
364struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
365 struct ggml_metal_pipeline_with_params res = {
366 /*.pipeline =*/ nil,
367 /*.nsg =*/ 0,
368 /*.nr0 =*/ 0,
369 /*.nr1 =*/ 0,
370 /*.smem =*/ 0,
371 /*.c4 =*/ false,
372 /*.cnt =*/ false,
373 };
374
375 [lib->lock lock];
376
377 res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
378 if (res.pipeline) {
379 [lib->lock unlock];
380
381 return res;
382 }
383
384 @autoreleasepool {
385 NSError * error = nil;
386
387 NSString * base_func = [NSString stringWithUTF8String:base];
388
389 GGML_LOG_DEBUG("%s: compiling pipeline: base = '%s', name = '%s'\n", __func__, base, name);
390
391 id<MTLFunction> mtl_function;
392 if (!cv) {
393 mtl_function = [lib->obj newFunctionWithName:base_func];
394 } else {
395 mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
396 }
397 if (!mtl_function) {
398 [lib->lock unlock];
399
400 GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
401 if (error) {
402 GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
403 }
404
405 return res;
406 }
407
408 id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
409
410 [mtl_function release];
411
412 if (!obj) {
413 [lib->lock unlock];
414
415 GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
416 if (error) {
417 GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
418 }
419
420 return res;
421 }
422
423 GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
424 (void *) obj,
425 (int) obj.maxTotalThreadsPerThreadgroup,
426 (int) obj.threadExecutionWidth);
427
428 if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
429 [obj release];
430
431 [lib->lock unlock];
432
433 GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
434
435 return res;
436 }
437
438 res.pipeline = ggml_metal_pipeline_init();
439 res.pipeline->obj = obj;
440
441 ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
442 }
443
444 [lib->lock unlock];
445
446 return res;
447}
448
449//
450// MTLComputeCommandEncoder wrapper
451//
452
453struct ggml_metal_encoder {
454 id<MTLComputeCommandEncoder> obj;
455};
456
457ggml_metal_encoder_t ggml_metal_encoder_init(ggml_metal_cmd_buf_t cmd_buf_raw, bool concurrent) {
458 ggml_metal_encoder_t res = calloc(1, sizeof(struct ggml_metal_encoder));
459
460 id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
461
462 if (concurrent) {
463 res->obj = [cmd_buf computeCommandEncoderWithDispatchType: MTLDispatchTypeConcurrent];
464 } else {
465 res->obj = [cmd_buf computeCommandEncoder];
466 }
467
468 [res->obj retain];
469
470 return res;
471}
472
473void ggml_metal_encoder_free(ggml_metal_encoder_t encoder) {
474 [encoder->obj release];
475 free(encoder);
476}
477
478void ggml_metal_encoder_debug_group_push(ggml_metal_encoder_t encoder, const char * name) {
479 [encoder->obj pushDebugGroup:[NSString stringWithCString:name encoding:NSUTF8StringEncoding]];
480}
481
482void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder) {
483 [encoder->obj popDebugGroup];
484}
485
486void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline) {
487 [encoder->obj setComputePipelineState:pipeline.pipeline->obj];
488}
489
490void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
491 [encoder->obj setBytes:data length:size atIndex:idx];
492}
493
494void ggml_metal_encoder_set_buffer(ggml_metal_encoder_t encoder, struct ggml_metal_buffer_id buffer, int idx) {
495 [encoder->obj setBuffer:buffer.metal offset:buffer.offs atIndex:idx];
496}
497
498void ggml_metal_encoder_set_threadgroup_memory_size(ggml_metal_encoder_t encoder, size_t size, int idx) {
499 [encoder->obj setThreadgroupMemoryLength:size atIndex:idx];
500}
501
502void ggml_metal_encoder_dispatch_threadgroups(ggml_metal_encoder_t encoder, int tg0, int tg1, int tg2, int tptg0, int tptg1, int tptg2) {
503 [encoder->obj dispatchThreadgroups:MTLSizeMake(tg0, tg1, tg2) threadsPerThreadgroup:MTLSizeMake(tptg0, tptg1, tptg2)];
504}
505
506void ggml_metal_encoder_memory_barrier(ggml_metal_encoder_t encoder) {
507 [encoder->obj memoryBarrierWithScope:MTLBarrierScopeBuffers];
508}
509
510void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder) {
511 [encoder->obj endEncoding];
512}
513
514struct ggml_metal_device {
515 id<MTLDevice> mtl_device;
516
517 // a single global queue shared by all Metal backends
518 // technically not needed for devices with unified memory, but enables discrete GPUs support
519 // ref: https://github.com/ggml-org/llama.cpp/pull/15906
520 id<MTLCommandQueue> mtl_queue;
521
522 ggml_metal_rsets_t rsets;
523
524 ggml_metal_library_t library;
525
526 struct ggml_metal_device_props props;
527
528 // virtual address for GPU memory allocations
529 atomic_uintptr_t addr_virt;
530};
531
532//
533// MTLResidenceSet wrapper
534//
535
536struct ggml_metal_rsets {
537 NSLock * lock;
538
539 NSMutableArray * data;
540
541 // number of seconds since the last graph computation
542 // keep the residency sets wired for that amount of time to avoid being collected by the OS
543 int keep_alive_s;
544
545 // background heartbeat thread to keep the residency sets alive
546 atomic_bool d_stop;
547 atomic_int d_loop;
548
549 dispatch_group_t d_group;
550};
551
552ggml_metal_rsets_t ggml_metal_rsets_init(void) {
553 ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
554
555 res->lock = [[NSLock alloc] init];
556 res->data = [[NSMutableArray alloc] init];
557
558 // by default keep the memory wired for 3 minutes
559 res->keep_alive_s = 3*60;
560
561 const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
562 if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
563 res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
564 }
565
566 if (res->keep_alive_s <= 0) {
567 res->keep_alive_s = 3*60;
568 }
569
570 GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
571
572 atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
573 atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
574
575 res->d_group = dispatch_group_create();
576
577 // start a background thread that periodically requests residency for all the currently active sets in the collection
578 // the requests stop after a certain amount of time (keep_alive_s) of inactivity
579 dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
580 dispatch_group_async(res->d_group, d_queue, ^{
581#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
582 if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
583 while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
584 if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
585 [res->lock lock];
586
587 for (int i = 0; i < (int) res->data.count; ++i) {
588 [res->data[i] requestResidency];
589 }
590
591 atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
592
593 [res->lock unlock];
594 }
595
596 // half a second
597 usleep(500 * 1000);
598 }
599 }
600#endif
601 });
602
603 return res;
604}
605
606void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
607 if (rsets == NULL) {
608 return;
609 }
610
611 // note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
612 GGML_ASSERT([rsets->data count] == 0);
613
614 atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
615
616 dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
617 dispatch_release(rsets->d_group);
618
619 [rsets->data release];
620 [rsets->lock release];
621
622 free(rsets);
623}
624
625ggml_metal_device_t ggml_metal_device_init(int device) {
626 ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
627
628 assert(dev != NULL);
629
630 if (dev->mtl_device == nil) {
631 dev->mtl_device = MTLCreateSystemDefaultDevice();
632
633 if (dev->mtl_device) {
634 dev->mtl_queue = [dev->mtl_device newCommandQueue];
635 if (dev->mtl_queue == nil) {
636 GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
637 }
638
639 dev->addr_virt = 0x000000400ULL;
640
641 dev->props.device = device;
642 dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
643 dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
644
645 dev->props.has_simdgroup_mm = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
646 dev->props.has_unified_memory = dev->mtl_device.hasUnifiedMemory;
647
648 dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
649 dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
650 if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
651 dev->props.has_bfloat = false;
652 }
653
654 dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
655 if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
656 dev->props.has_tensor = false;
657 }
658
659 // note: disable the tensor API by default for old chips because with the current implementation it is not useful
660 // - M2 Ultra: ~5% slower
661 // - M4, M4 Max: no significant difference
662 //
663 // TODO: try to update the tensor API kernels to at least match the simdgroup performance
664 if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
665 ![[dev->mtl_device name] containsString:@"M5"] &&
666 ![[dev->mtl_device name] containsString:@"M6"] &&
667 ![[dev->mtl_device name] containsString:@"A19"] &&
668 ![[dev->mtl_device name] containsString:@"A20"]) {
669 GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
670 dev->props.has_tensor = false;
671 }
672
673 // double-check that the tensor API compiles
674 if (dev->props.has_tensor) {
675 const char * src_tensor_f16 = "\n"
676 "#include <metal_stdlib> \n"
677 "#include <metal_tensor> \n"
678 "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
679 " \n"
680 "using namespace metal; \n"
681 "using namespace mpp::tensor_ops; \n"
682 " \n"
683 "kernel void dummy_kernel( \n"
684 " tensor<device half, dextents<int32_t, 2>> A [[buffer(0)]], \n"
685 " tensor<device half, dextents<int32_t, 2>> B [[buffer(1)]], \n"
686 " device float * C [[buffer(2)]], \n"
687 " uint2 tgid [[threadgroup_position_in_grid]]) \n"
688 "{ \n"
689 " auto tA = A.slice(0, (int)tgid.y); \n"
690 " auto tB = B.slice((int)tgid.x, 0); \n"
691 " \n"
692 " matmul2d< \n"
693 " matmul2d_descriptor(8, 8, dynamic_extent), \n"
694 " execution_simdgroups<4>> mm; \n"
695 " \n"
696 " auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
697 " \n"
698 " auto sA = tA.slice(0, 0); \n"
699 " auto sB = tB.slice(0, 0); \n"
700 " mm.run(sB, sA, cT); \n"
701 " \n"
702 " auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
703 " \n"
704 " cT.store(tC); \n"
705 "}";
706
707 GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
708 ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
709 if (lib == NULL) {
710 GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
711 dev->props.has_tensor = false;
712 } else {
713 struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
714 if (!ppl.pipeline) {
715 GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
716 dev->props.has_tensor = false;
717 }
718
719 ggml_metal_library_free(lib);
720 }
721 }
722
723 // try to compile a dummy kernel to determine if the tensor API is supported for bfloat
724 if (dev->props.has_tensor && dev->props.has_bfloat) {
725 const char * src_tensor_bf16 = "\n"
726 "#include <metal_stdlib> \n"
727 "#include <metal_tensor> \n"
728 "#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
729 " \n"
730 "using namespace metal; \n"
731 "using namespace mpp::tensor_ops; \n"
732 " \n"
733 "kernel void dummy_kernel( \n"
734 " tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
735 " tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
736 " device float * C [[buffer(2)]], \n"
737 " uint2 tgid [[threadgroup_position_in_grid]]) \n"
738 "{ \n"
739 " auto tA = A.slice(0, (int)tgid.y); \n"
740 " auto tB = B.slice((int)tgid.x, 0); \n"
741 " \n"
742 " matmul2d< \n"
743 " matmul2d_descriptor(8, 8, dynamic_extent), \n"
744 " execution_simdgroups<4>> mm; \n"
745 " \n"
746 " auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
747 " \n"
748 " auto sA = tA.slice(0, 0); \n"
749 " auto sB = tB.slice(0, 0); \n"
750 " mm.run(sB, sA, cT); \n"
751 " \n"
752 " auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
753 " \n"
754 " cT.store(tC); \n"
755 "}";
756
757 GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
758 ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
759 if (lib == NULL) {
760 GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
761 dev->props.has_bfloat = false;
762 } else {
763 struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
764 if (!ppl.pipeline) {
765 GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
766 dev->props.has_bfloat = false;
767 }
768
769 ggml_metal_library_free(lib);
770 }
771 }
772
773 dev->props.use_residency_sets = true;
774#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
775 dev->props.use_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil;
776#endif
777
778 dev->props.use_shared_buffers = dev->props.has_unified_memory;
779#if TARGET_OS_OSX
780 // In case of eGPU, shared memory may be preferable.
781 dev->props.use_shared_buffers |= [dev->mtl_device location] == MTLDeviceLocationExternal;
782#endif
783 if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
784 dev->props.use_shared_buffers = false;
785 }
786 if (getenv("GGML_METAL_SHARED_BUFFERS_ENABLE") != NULL) {
787 dev->props.use_shared_buffers = true;
788 }
789
790 dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
791
792 dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
793
794 dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;
795 dev->props.max_theadgroup_memory_size = dev->mtl_device.maxThreadgroupMemoryLength;
796 if (@available(macOS 10.12, iOS 16.0, *)) {
797 dev->props.max_working_set_size = dev->mtl_device.recommendedMaxWorkingSetSize;
798 } else {
799 dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
800 }
801
802 snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
803 snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);
804
805 dev->library = ggml_metal_library_init(dev);
806 if (!dev->library) {
807 GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
808 }
809
810 if (dev->props.use_residency_sets) {
811 dev->rsets = ggml_metal_rsets_init();
812 } else {
813 dev->rsets = nil;
814 }
815
816 // print MTL GPU family:
817 GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
818
819 // determine max supported GPU family
820 // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
821 // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
822 {
823 for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
824 if ([dev->mtl_device supportsFamily:i]) {
825 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i);
826 break;
827 }
828 }
829
830 for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) {
831 if ([dev->mtl_device supportsFamily:i]) {
832 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i);
833 break;
834 }
835 }
836
837 for (int i = MTLGPUFamilyMetal3_GGML + 5; i >= MTLGPUFamilyMetal3_GGML; --i) {
838 if ([dev->mtl_device supportsFamily:i]) {
839 GGML_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3_GGML + 3, i);
840 break;
841 }
842 }
843 }
844
845 GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, dev->props.has_simdgroup_reduction ? "true" : "false");
846 GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
847 GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
848 GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
849 GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
850 GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
851 GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
852
853#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
854 if (@available(macOS 10.12, iOS 16.0, *)) {
855 GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, dev->props.max_working_set_size / 1e6);
856 }
857#endif
858 }
859 }
860
861 return dev;
862}
863
864void ggml_metal_device_free(ggml_metal_device_t dev) {
865 assert(dev != NULL);
866
867 ggml_metal_rsets_free(dev->rsets);
868
869 ggml_metal_library_free(dev->library);
870 dev->library = NULL;
871
872 if (dev->mtl_queue) {
873 [dev->mtl_queue release];
874 dev->mtl_queue = nil;
875 }
876
877 if (dev->mtl_device) {
878 [dev->mtl_device release];
879 dev->mtl_device = nil;
880 }
881
882 free(dev);
883}
884
885void * ggml_metal_device_get_obj(ggml_metal_device_t dev) {
886 return dev->mtl_device;
887}
888
889void * ggml_metal_device_get_queue(ggml_metal_device_t dev) {
890 return dev->mtl_queue;
891}
892
893ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
894 return dev->library;
895}
896
897void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
898 if (rset == nil) {
899 return;
900 }
901
902 GGML_ASSERT(dev->rsets);
903
904 [dev->rsets->lock lock];
905
906 [dev->rsets->data addObject:rset];
907
908 [dev->rsets->lock unlock];
909}
910
911void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
912 if (rset == nil) {
913 return;
914 }
915
916 GGML_ASSERT(dev->rsets);
917
918 [dev->rsets->lock lock];
919
920 [dev->rsets->data removeObject:rset];
921
922 [dev->rsets->lock unlock];
923}
924
925void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
926 if (dev->rsets == NULL) {
927 return;
928 }
929
930 atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
931}
932
933struct ggml_metal_event {
934 void * obj; // id<MTLEvent>
935
936 atomic_int value;
937};
938
939void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
940 id<MTLEvent> event = (id<MTLEvent>)ev->obj;
941
942 id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
943
944 [cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
945}
946
947void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
948 id<MTLEvent> event = (id<MTLEvent>)ev->obj;
949
950 id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
951
952 [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
953}
954
955ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
956 id<MTLEvent> event = [dev->mtl_device newEvent];
957
958 ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
959
960 ev->obj = (__bridge void *)event;
961 ev->value = 0;
962
963 return ev;
964}
965
966void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
967 id<MTLEvent> event = ev->obj;
968 [event release];
969
970 free(ev);
971
972 GGML_UNUSED(dev);
973}
974
// block the host until all GPU work signaled on the event so far has completed
// implementation: submit an empty command buffer that waits for the current
// event value, then wait for that command buffer to complete
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
    @autoreleasepool {
        id<MTLEvent> event = ev->obj;

        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
        [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
985
// query estimates of free and total device memory
// total: the recommended max working set size reported by Metal
// free : total minus the currently allocated size, clamped to 0
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
    if (@available(macOS 10.12, iOS 16.0, *)) {
        *total = dev->mtl_device.recommendedMaxWorkingSetSize;

        // the allocated size can exceed the recommended working set size (this file
        // explicitly warns about that case in ggml_metal_log_allocated_size), so
        // clamp instead of letting the size_t subtraction underflow to a huge value
        const size_t used = dev->mtl_device.currentAllocatedSize;

        *free = used < *total ? *total - used : 0;
    } else {
        *free  = 0;
        *total = 0;
    }
}
995
// check whether the Metal backend can execute the given op on this device
//
// the decision is based on the device capabilities captured in dev->props
// (simdgroup matrix multiplication, simdgroup reductions, bfloat support)
// and on per-op constraints on tensor types and memory layout below;
// ops not explicitly listed are rejected
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op) {
    const bool has_simdgroup_mm = dev->props.has_simdgroup_mm;
    const bool has_simdgroup_reduction = dev->props.has_simdgroup_reduction;
    const bool has_bfloat = dev->props.has_bfloat;

    // without HW bfloat support, reject any op that touches BF16 data
    // (checks the destination and the first 3 sources)
    if (!has_bfloat) {
        if (op->type == GGML_TYPE_BF16) {
            return false;
        }

        for (size_t i = 0, n = 3; i < n; ++i) {
            if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
                return false;
            }
        }
    }

    // per-op constraints
    switch (op->op) {
        case GGML_OP_SCALE:
        case GGML_OP_FILL:
        case GGML_OP_CLAMP:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_LOG:
            return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
                case GGML_UNARY_OP_SOFTPLUS:
                case GGML_UNARY_OP_EXPM1:
                    return ggml_is_contiguous_rows(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
                default:
                    return false;
            }
        case GGML_OP_GLU:
            switch (ggml_get_glu_op(op)) {
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
                case GGML_GLU_OP_SWIGLU_OAI:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                default:
                    return false;
            }
        // ops that are pure metadata changes (or always implemented) are unconditionally supported
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
        case GGML_OP_CONCAT:
            return true;
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_ADD_ID:
            return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ACC:
        case GGML_OP_REPEAT:
        case GGML_OP_CONV_TRANSPOSE_1D:
            return true;
        case GGML_OP_CONV_TRANSPOSE_2D:
            return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) &&
                (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32) &&
                op->src[1]->type == GGML_TYPE_F32 &&
                op->type == GGML_TYPE_F32;
        case GGML_OP_SUM:
            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
        case GGML_OP_TRI:
            return ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_SUM_ROWS:
        case GGML_OP_CUMSUM:
        case GGML_OP_MEAN:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_GROUP_NORM:
        case GGML_OP_L2_NORM:
            return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_COUNT_EQUAL:
            return has_simdgroup_reduction &&
                op->src[0]->type == GGML_TYPE_I32 &&
                op->src[1]->type == GGML_TYPE_I32 &&
                op->type == GGML_TYPE_I64;
        case GGML_OP_ARGMAX:
            return has_simdgroup_reduction;
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
            return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_IM2COL:
            return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32);
        case GGML_OP_CONV_2D:
            return ggml_is_contiguous(op->src[0]) &&
                   op->src[1]->type == GGML_TYPE_F32 &&
                   op->type == GGML_TYPE_F32 &&
                   (op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32);
        case GGML_OP_UPSCALE:
            // NOTE(review): if the antialias flag shares op_params[0] with the mode, the equality
            //               test already excludes it, making the flag test redundant - confirm encoding
            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
        case GGML_OP_POOL_1D:
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_POOL_2D:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_PAD:
            // TODO: add circular padding support for metal, see https://github.com/ggml-org/llama.cpp/pull/16985
            if (ggml_get_op_params_i32(op, 8) != 0) {
                return false;
            }

            // only "right-side" padding is supported (left pads must all be 0)
            return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
                   (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_ARGSORT:
        case GGML_OP_TOP_K:
        case GGML_OP_ARANGE:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
            // for new head sizes, add checks here
            if (op->src[0]->ne[0] != 32 &&
                op->src[0]->ne[0] != 40 &&
                op->src[0]->ne[0] != 48 &&
                op->src[0]->ne[0] != 64 &&
                op->src[0]->ne[0] != 72 &&
                op->src[0]->ne[0] != 80 &&
                op->src[0]->ne[0] != 96 &&
                op->src[0]->ne[0] != 112 &&
                op->src[0]->ne[0] != 128 &&
                op->src[0]->ne[0] != 192 &&
                op->src[0]->ne[0] != 256 &&
                op->src[0]->ne[0] != 576) {
                return false;
            }
            // K and V must share a type
            if (op->src[1]->type != op->src[2]->type) {
                return false;
            }
            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
        case GGML_OP_SSM_CONV:
        case GGML_OP_SSM_SCAN:
            return has_simdgroup_reduction;
        case GGML_OP_RWKV_WKV6:
        case GGML_OP_RWKV_WKV7:
            return true;
        case GGML_OP_SOLVE_TRI:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            return has_simdgroup_reduction;
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
            {
                // supported src -> dst type conversions
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                            case GGML_TYPE_BF16:
                            case GGML_TYPE_Q8_0:
                            case GGML_TYPE_Q4_0:
                            case GGML_TYPE_Q4_1:
                            case GGML_TYPE_Q5_0:
                            case GGML_TYPE_Q5_1:
                            case GGML_TYPE_IQ4_NL:
                            case GGML_TYPE_I32:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_F16:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_BF16:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_BF16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                        switch (op->type) {
                            case GGML_TYPE_F32:
                            case GGML_TYPE_F16:
                                return true;
                            default:
                                return false;
                        }
                    case GGML_TYPE_I32:
                        return op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32;
                    default:
                        return false;
                };
            }
        case GGML_OP_GET_ROWS:
            return true;
        case GGML_OP_SET_ROWS:
            {
                if (op->src[0]->type != GGML_TYPE_F32) {
                    return false;
                }

                switch (op->type) {
                    case GGML_TYPE_F32:
                    case GGML_TYPE_F16:
                    case GGML_TYPE_BF16:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_Q4_0:
                    case GGML_TYPE_Q4_1:
                    case GGML_TYPE_Q5_0:
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_IQ4_NL:
                        return true;
                    default:
                        return false;
                };
            }
        case GGML_OP_DIAG:
            return true;
        case GGML_OP_OPT_STEP_ADAMW:
        case GGML_OP_OPT_STEP_SGD:
            return has_simdgroup_reduction;
        default:
            return false;
    }
}
1249
// accessor for the device's cached capability/property struct
const struct ggml_metal_device_props * ggml_metal_device_get_props(ggml_metal_device_t dev) {
    const struct ggml_metal_device_props * props = &dev->props;

    return props;
}
1253
1254//
1255// device buffers
1256//
1257
1258// max memory buffers that can be mapped to the device
1259#define GGML_METAL_MAX_BUFFERS 64
1260
// a single host-memory region together with the Metal buffer that wraps it
struct ggml_metal_buffer_wrapper {
    void * data; // host address of the region
    size_t size; // size of the region in bytes

    id<MTLBuffer> metal; // Metal buffer backing the region
};
1267
// a backend buffer, possibly composed of multiple Metal buffer views
struct ggml_metal_buffer {
    void * all_data; // base address of the whole buffer (host or virtual)
    size_t all_size; // total size in bytes

    // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
    bool is_shared;
    // if true, all_data is owned by this object and is deallocated on free
    bool owned;

    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
    int n_buffers;
    struct ggml_metal_buffer_wrapper buffers[GGML_METAL_MAX_BUFFERS];

    bool use_residency_sets;

    // optional MTLResidencySet
    // note: cannot use explicitly "id<MTLResidencySet>" here because it is not available on certain OSes
    id rset;

    // pointers to global device
    ggml_metal_device_t dev;
};
1289
// log a buffer allocation together with the device's current memory pressure
// compiled out when GGML_METAL_NDEBUG is defined or on unsupported platforms
static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
    if (@available(macOS 10.12, iOS 16.0, *)) {
        GGML_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
                __func__,
                size_aligned / 1024.0 / 1024.0,
                device.currentAllocatedSize / 1024.0 / 1024.0,
                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

        // allocating beyond the recommended working set risks paging/perf cliffs
        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
            GGML_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
        }
    } else {
        GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
                __func__,
                size_aligned / 1024.0 / 1024.0,
                device.currentAllocatedSize / 1024.0 / 1024.0);
    }
#endif
#endif
    GGML_UNUSED(device);
    GGML_UNUSED(size_aligned);
}
1314
1315// rset init
// rset init
//
// create a residency set containing all of the buffer's Metal buffers and request residency
// returns true on success, or when residency sets are disabled or unavailable on this OS
static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
    buf->rset = nil;

    if (!buf->use_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_metal";
        desc.initialCapacity = buf->n_buffers;

        // note: the error out-param is only guaranteed to be written on failure, so it
        //       must be initialized to nil and success must be judged by the returned
        //       object, not by the error (Cocoa error-handling convention)
        NSError * error = nil;
        buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];

        [desc release];

        if (buf->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
            return false;
        }

        for (int i = 0; i < buf->n_buffers; i++) {
            [buf->rset addAllocation:buf->buffers[i].metal];
        }

        [buf->rset commit];
        [buf->rset requestResidency];

        return true;
    }
#endif

    return true;
}
1352
1353// rset free
// rset free
//
// end residency and release the buffer's residency set, if one was created
// safe to call when buf->rset is nil
static void ggml_metal_buffer_rset_free(ggml_metal_buffer_t buf) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
        if (buf->rset) {
            [buf->rset endResidency];
            [buf->rset removeAllAllocations];
            [buf->rset release];
        }
    }
#else
    GGML_UNUSED(buf);
#endif
}
1367
// allocate page-aligned host memory suitable for no-copy Metal buffer creation
// on macOS this uses vm_allocate; elsewhere posix_memalign with the page size
// returns NULL on failure
static void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;

#if TARGET_OS_OSX
    // note: vm_allocate writes the resulting address into `data`
    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
    if (err != KERN_SUCCESS) {
        GGML_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
        return NULL;
    }
#else
    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
        GGML_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }
#endif

    return data;
}
1387
// allocate a new device buffer of the given size
// when `shared` is true and the device supports it, the memory is host-visible
// (allocated with ggml_metal_host_malloc and wrapped no-copy); otherwise the
// data lives in private GPU memory and only a virtual address is tracked host-side
// returns NULL on failure (no resources are leaked on the error paths)
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
    ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
    if (res == NULL) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer struct\n", __func__);
        return NULL;
    }

    res->dev = dev;

    // Metal no-copy buffers require page-aligned sizes
    const size_t size_page = sysconf(_SC_PAGESIZE);

    size_t size_aligned = size;
    if ((size_aligned % size_page) != 0) {
        size_aligned += (size_page - (size_aligned % size_page));
    }

    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

    shared = shared && props_dev->use_shared_buffers;

    // allocate shared buffer if the device supports it and it is required by the buffer type
    if (shared) {
        res->all_data  = ggml_metal_host_malloc(size_aligned);
        res->is_shared = true;
    } else {
        // use virtual address
        res->all_data  = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
        res->is_shared = false;
    }
    res->all_size = size_aligned;

    res->owned = true;

    res->n_buffers = 1;

    if (res->all_data != NULL) {
        res->buffers[0].size  = size;
        res->buffers[0].metal = nil;

        if (size_aligned > 0) {
            // `shared` already incorporates props_dev->use_shared_buffers (masked above)
            if (shared) {
                res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
                                                length:size_aligned
                                                options:MTLResourceStorageModeShared
                                                deallocator:nil];
            } else {
                res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
            }
        }

        res->buffers[0].data = res->all_data;
    }

    if (size_aligned > 0 && (res->all_data == NULL || res->buffers[0].metal == nil)) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
        // do not leak the host allocation when only the Metal buffer creation failed
        if (res->is_shared && res->all_data != NULL) {
#if TARGET_OS_OSX
            vm_deallocate((vm_map_t) mach_task_self(), (vm_address_t) res->all_data, res->all_size);
#else
            free(res->all_data);
#endif
        }
        free(res);
        return NULL;
    }

    res->use_residency_sets = props_dev->use_residency_sets;

    if (!ggml_metal_buffer_rset_init(res)) {
        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
        // release everything acquired so far ([nil release] is a safe no-op)
        [res->buffers[0].metal release];
        if (res->is_shared && res->all_data != NULL) {
#if TARGET_OS_OSX
            vm_deallocate((vm_map_t) mach_task_self(), (vm_address_t) res->all_data, res->all_size);
#else
            free(res->all_data);
#endif
        }
        free(res);
        return NULL;
    }

    ggml_metal_device_rsets_add(dev, res->rset);

    //ggml_metal_log_allocated_size(device, size_aligned);

    return res;
}
1457
// wrap an existing host pointer as a device buffer (zero-copy, not owned)
// mappings larger than the device's max buffer size are split into multiple
// overlapping views so that any tensor up to max_tensor_size fits entirely
// in at least one view
// returns NULL on failure (previously created views are released on the error paths)
ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
    if (res == NULL) {
        GGML_LOG_ERROR("%s: error: failed to allocate buffer struct\n", __func__);
        return NULL;
    }

    res->dev = dev;

    res->all_data = ptr;
    res->all_size = size;

    res->is_shared = true;
    res->owned     = false;

    res->n_buffers = 0;

    const size_t size_page = sysconf(_SC_PAGESIZE);

    // page-align the data ptr (required by newBufferWithBytesNoCopy)
    {
        const uintptr_t offs = (uintptr_t) ptr % size_page;
        ptr   = (void *) ((char *) ptr - offs);
        size += offs;
    }

    size_t size_aligned = size;
    if ((size_aligned % size_page) != 0) {
        size_aligned += (size_page - (size_aligned % size_page));
    }

    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);

    // the buffer fits into the max buffer size allowed by the device
    if (size_aligned <= props_dev->max_buffer_size) {
        res->buffers[res->n_buffers].data  = ptr;
        res->buffers[res->n_buffers].size  = size;
        res->buffers[res->n_buffers].metal = nil;

        if (size_aligned > 0) {
            res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

            if (res->buffers[res->n_buffers].metal == nil) {
                GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
                free(res);
                return NULL;
            }
        }

        ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);

        ++res->n_buffers;
    } else {
        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
        // one of the views
        const size_t size_ovlp = ((max_tensor_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
        const size_t size_step = props_dev->max_buffer_size - size_ovlp;
        const size_t size_view = props_dev->max_buffer_size;

        for (size_t i = 0; i < size; i += size_step) {
            // guard against overflowing the fixed-size views array
            if (res->n_buffers >= GGML_METAL_MAX_BUFFERS) {
                GGML_LOG_ERROR("%s: error: exceeded the maximum number of buffer views (%d)\n", __func__, GGML_METAL_MAX_BUFFERS);
                for (int j = 0; j < res->n_buffers; ++j) {
                    [res->buffers[j].metal release];
                }
                free(res);
                return NULL;
            }

            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);

            res->buffers[res->n_buffers].data  = (void *) ((uint8_t *) ptr + i);
            res->buffers[res->n_buffers].size  = size_step_aligned;
            res->buffers[res->n_buffers].metal = nil;

            if (size_step_aligned > 0) {
                res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

                if (res->buffers[res->n_buffers].metal == nil) {
                    GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
                    // release the views created so far before bailing out
                    for (int j = 0; j < res->n_buffers; ++j) {
                        [res->buffers[j].metal release];
                    }
                    free(res);
                    return NULL;
                }
            }

            ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);

            if (i + size_step < size) {
                GGML_LOG_INFO("\n");
            }

            ++res->n_buffers;
        }
    }

    res->use_residency_sets = props_dev->use_residency_sets;

    if (!ggml_metal_buffer_rset_init(res)) {
        GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
        for (int j = 0; j < res->n_buffers; ++j) {
            [res->buffers[j].metal release];
        }
        free(res);
        return NULL;
    }

    ggml_metal_device_rsets_add(dev, res->rset);

    return res;
}
1552
// free a device buffer: detach it from the device's residency tracking,
// release all Metal buffer views, tear down the residency set, and finally
// deallocate the host memory if this buffer owns it
void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
    // detach from the device before releasing anything it may still reference
    ggml_metal_device_rsets_rm(buf->dev, buf->rset);

    for (int i = 0; i < buf->n_buffers; i++) {
        [buf->buffers[i].metal release];
    }

    ggml_metal_buffer_rset_free(buf);

    // only shared+owned buffers carry a host allocation (see ggml_metal_buffer_init)
    if (buf->is_shared && buf->owned) {
#if TARGET_OS_OSX
        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)buf->all_data, buf->all_size);
#else
        free(buf->all_data);
#endif
    }

    free(buf);
}
1572
// base address of the buffer (host pointer for shared buffers, virtual otherwise)
void * ggml_metal_buffer_get_base(ggml_metal_buffer_t buf) {
    void * base = buf->all_data;

    return base;
}
1576
// whether the buffer's memory is host-visible (shared) or private to the GPU
bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {
    const bool shared = buf->is_shared;

    return shared;
}
1580
// fill `size` bytes of the tensor, starting at `offset`, with `value`
// shared buffers are filled directly with memset; private buffers are filled
// on the GPU via a blit encoder and the call blocks until the fill completes
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    if (buf->is_shared) {
        memset((char *) tensor->data + offset, value, size);
        return;
    }

    @autoreleasepool {
        // dst
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
        bid_dst.offs += offset;

        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            // NSMakeRange takes (location, length) - the second argument is the number
            // of bytes to fill, not the end offset
            [encoder fillBuffer:bid_dst.metal
                          range:NSMakeRange(bid_dst.offs, size)
                          value:value];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
1608
// upload `size` bytes from host memory `data` into the tensor at `offset`
// shared buffers are written directly with memcpy; private buffers are written
// by wrapping the source in a temporary no-copy Metal buffer and blitting on
// the GPU, blocking until the copy completes
void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (buf->is_shared) {
        memcpy((char *) tensor->data + offset, data, size);
        return;
    }

    @autoreleasepool {
        // src
        void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
        id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
                                                                        length:size
                                                                       options:MTLResourceStorageModeShared
                                                                   deallocator:nil];

        GGML_ASSERT(buf_src);

        // dst
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
        bid_dst.offs += offset;

        // note: for experimentation purposes, here we use a semaphore to wait for the copy to complete
        //       this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
        dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);

        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:buf_src
                       sourceOffset:0
                           toBuffer:bid_dst.metal
                  destinationOffset:bid_dst.offs
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf addCompletedHandler:^(id<MTLCommandBuffer> cb) {
            // TODO: can check for errors here
            GGML_UNUSED(cb);

            dispatch_semaphore_signal(completion_semaphore);
        }];

        [cmd_buf commit];

        dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER);
        dispatch_release(completion_semaphore);

        // newBufferWithBytesNoCopy returns a +1 reference (MRC) - release it now
        // that the copy has completed, otherwise every upload leaks a MTLBuffer
        [buf_src release];

        //[cmd_buf waitUntilCompleted];
    }
}
1662
// download `size` bytes of the tensor, starting at `offset`, into host memory `data`
// shared buffers are read directly with memcpy; private buffers are read by
// blitting into a temporary no-copy Metal buffer wrapping the destination,
// blocking until the copy completes
void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    if (buf->is_shared) {
        memcpy(data, (const char *) tensor->data + offset, size);
        return;
    }

    @autoreleasepool {
        // src
        struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf, tensor);
        bid_src.offs += offset;

        // dst
        id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
                                                                        length:size
                                                                       options:MTLResourceStorageModeShared
                                                                   deallocator:nil];

        GGML_ASSERT(buf_dst);

        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:bid_src.metal
                       sourceOffset:bid_src.offs
                           toBuffer:buf_dst
                  destinationOffset:0
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];

        // newBufferWithBytesNoCopy returns a +1 reference (MRC) - release it now
        // that the copy has completed, otherwise every download leaks a MTLBuffer
        [buf_dst release];
    }
}
1700
// fill the entire buffer with a byte value
// shared buffers are cleared with memset; private buffers are cleared on the
// GPU via a blit fill and the call blocks until it completes
// note: only buffers[0] is filled here - private buffers are never split into
//       multiple views (splitting only happens in ggml_metal_buffer_map, which
//       always produces shared buffers)
void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
    if (buf->is_shared) {
        memset(buf->all_data, value, buf->all_size);
        return;
    }

    @autoreleasepool {
        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder fillBuffer:buf->buffers[0].metal
                          range:NSMakeRange(0, buf->buffers[0].size)
                          value:value];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];
    }
}
1724
// resolve the Metal buffer view (and byte offset within it) that fully
// contains tensor t; returns { nil, 0 } and logs an error if no view does
struct ggml_metal_buffer_id ggml_metal_buffer_get_id(ggml_metal_buffer_t buf, const struct ggml_tensor * t) {
    struct ggml_metal_buffer_id res = { nil, 0 };

    const int64_t tsize = ggml_nbytes(t);

    // find the view that contains the tensor fully
    for (int i = 0; i < buf->n_buffers; ++i) {
        const struct ggml_metal_buffer_wrapper * view = &buf->buffers[i];

        const int64_t ioffs = (int64_t) t->data - (int64_t) view->data;

        if (ioffs < 0 || ioffs + tsize > (int64_t) view->size) {
            continue;
        }

        res.metal = view->metal;
        res.offs  = (size_t) ioffs;

        return res;
    }

    GGML_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);

    return res;
}