metal : Cache the Metal library at the device context level (llama/12265)

2025-08-20 11:52:10 +02:00 · 2025-03-11 19:45:02 +08:00
parent 776cdceb9e
commit 774c519433
1 changed file with 147 additions and 132 deletions
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -46,6 +46,7 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
 static struct ggml_backend_metal_device_context {
    id<MTLDevice> mtl_device;
    int           mtl_device_ref_count;
+    id<MTLLibrary> mtl_library;

    bool has_simdgroup_reduction;
    bool has_simdgroup_mm;
@@ -57,6 +58,7 @@ static struct ggml_backend_metal_device_context {
 } g_ggml_ctx_dev_main = {
    /*.mtl_device              =*/ nil,
    /*.mtl_device_ref_count    =*/ 0,
+    /*.mtl_library             =*/ nil,
    /*.has_simdgroup_reduction =*/ false,
    /*.has_simdgroup_mm        =*/ false,
    /*.has_residency_sets      =*/ false,
@@ -108,6 +110,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
    ctx->mtl_device_ref_count--;

    if (ctx->mtl_device_ref_count == 0) {
+        if (ctx->mtl_library) {
+            [ctx->mtl_library release];
+            ctx->mtl_library = nil;
+        }
+
        if (ctx->mtl_device) {
            [ctx->mtl_device release];
            ctx->mtl_device = nil;
@@ -495,42 +502,14 @@ static void * ggml_metal_host_malloc(size_t n) {
    return data;
 }

-static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
-    GGML_LOG_INFO("%s: allocating\n", __func__);
-
-#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
-    // Show all the Metal device instances in the system
-    NSArray * devices = MTLCopyAllDevices();
-    for (id<MTLDevice> device in devices) {
-        GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
-    }
-    [devices release]; // since it was created by a *Copy* C method
-#endif
-
-    // init context
-    struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
-    struct ggml_backend_metal_device_context * ctx_dev = dev->context;
-
-    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
-    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
-
-    ctx->queue  = [device newCommandQueue];
-    if (ctx->queue == nil) {
-        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
-        return NULL;
-    }
-
-    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
-
+// load library
+//
+// - first check if the library is embedded
+// - then check if the library is in the bundle
+// - if not found, load the source and compile it
+// - if that fails, return NULL
+static id<MTLLibrary> ggml_metal_load_library(id<MTLDevice> device, bool use_bfloat) {
    id<MTLLibrary> metal_library = nil;
-
-    // load library
-    //
-    // - first check if the library is embedded
-    // - then check if the library is in the bundle
-    // - if not found, load the source and compile it
-    // - if that fails, return NULL
-    {
    NSError * error = nil;
    NSString * src = nil;

@@ -624,7 +603,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
            // dictionary of preprocessor macros
            NSMutableDictionary * prep = [NSMutableDictionary dictionary];

-                if (ctx_dev->use_bfloat) {
+            if (use_bfloat) {
                [prep setObject:@"1" forKey:@"GGML_METAL_USE_BF16"];
            }

@@ -652,6 +631,45 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
 #if GGML_METAL_EMBED_LIBRARY
    [src release];
 #endif // GGML_METAL_EMBED_LIBRARY
+
+    return metal_library;
+}
+
+static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t dev) {
+    GGML_LOG_INFO("%s: allocating\n", __func__);
+
+#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
+    // Show all the Metal device instances in the system
+    NSArray * devices = MTLCopyAllDevices();
+    for (id<MTLDevice> device in devices) {
+        GGML_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
+    }
+    [devices release]; // since it was created by a *Copy* C method
+#endif
+
+    // init context
+    struct ggml_backend_metal_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_context));
+    struct ggml_backend_metal_device_context * ctx_dev = dev->context;
+
+    id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
+
+    ctx->queue  = [device newCommandQueue];
+    if (ctx->queue == nil) {
+        GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
+        return NULL;
+    }
+
+    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
+
+    // load library
+    if (ctx_dev->mtl_library == nil) {
+        ctx_dev->mtl_library = ggml_metal_load_library(device, ctx_dev->use_bfloat);
+    }
+    id<MTLLibrary> metal_library = ctx_dev->mtl_library;
+    if (metal_library == nil) {
+        GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
+        return NULL;
    }

    // print MTL GPU family:
@@ -725,7 +743,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
            [metal_function release]; \
            if (error) { \
                GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
-                [metal_library release]; \
                return NULL; \
            } \
        } else { \
@@ -1044,8 +1061,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,               pool_2d_max_f32,                true);
    }

-    [metal_library release];
-
    return ctx;
 }