From 6f6103f6d0034945a9377d16e29cf0d3ec2b4c35 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 5 May 2026 06:35:07 +0300 Subject: [PATCH] llama : add option to save memory in device buffers (llama/22679) * llama : add option to save memory in device buffers * tests : extend llama-save-load-state --- ggml/src/ggml-metal/ggml-metal-device.h | 1 + ggml/src/ggml-metal/ggml-metal-device.m | 42 +++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.cpp | 19 ++++++----- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index a6c1dab55..4718ca083 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -282,6 +282,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf); void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); void ggml_metal_buffer_set_tensor (ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void ggml_metal_buffer_get_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); +bool ggml_metal_buffer_cpy_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * src, struct ggml_tensor * dst); void ggml_metal_buffer_clear (ggml_metal_buffer_t buf, uint8_t value); // finds the Metal buffer that contains the tensor data on the GPU device diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index fe90aafe7..fab7891c0 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1,6 +1,7 @@ #import "ggml-metal-device.h" #import "ggml-impl.h" +#import "ggml-backend-impl.h" #include @@ -1737,6 +1738,47 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten } } +bool ggml_metal_buffer_cpy_tensor(ggml_metal_buffer_t 
buf_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_metal_buffer_t buf_src = (ggml_metal_buffer_t)src->buffer->context; + + const size_t size = ggml_nbytes(src); + + // copies the ggml_nbytes(src) bytes of src into dst - fast path: if both buffers are shared, we can use memcpy directly + if (buf_dst->is_shared && buf_src->is_shared) { + memcpy(dst->data, src->data, size); + return true; + } + + // otherwise at least one buffer is not shared: copy with a Metal blit command and wait for it to complete + @autoreleasepool { + struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf_src, src); + struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf_dst, dst); + + if (bid_src.metal == nil || bid_dst.metal == nil) { + return false; // could not resolve a Metal buffer for src or dst + } + + id cmd_buf = [buf_dst->dev->mtl_queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:bid_src.metal + sourceOffset:bid_src.offs + toBuffer:bid_dst.metal + destinationOffset:bid_dst.offs + size:size]; + + [encoder endEncoding]; + } + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } + + return true; +} + void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) { if (buf->is_shared) { memset(buf->all_data, value, buf->all_size); diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index cc329d675..357742549 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -17,6 +17,9 @@ // note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices static int g_devices = 1; +// forward declaration - needed by the buffer cpy_tensor functions below +static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer); + //////////////////////////////////////////////////////////////////////////////// // backend interface //////////////////////////////////////////////////////////////////////////////// @@ -68,11 +71,11 @@ static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t bu GGML_ASSERT(ggml_metal_buffer_is_shared(ctx)); - GGML_UNUSED(buffer); - GGML_UNUSED(src); 
- GGML_UNUSED(dst); + if (!ggml_backend_buffer_is_metal(src->buffer)) { + return false; /* src is not in a Metal buffer, so its buffer context cannot be used as a ggml_metal_buffer_t */ + } - return false; + return ggml_metal_buffer_cpy_tensor(ctx, src, dst); } static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -144,11 +147,11 @@ static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t b GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx)); - GGML_UNUSED(buffer); - GGML_UNUSED(src); - GGML_UNUSED(dst); + if (!ggml_backend_buffer_is_metal(src->buffer)) { + return false; /* src is not in a Metal buffer, so its buffer context cannot be used as a ggml_metal_buffer_t */ + } - return false; + return ggml_metal_buffer_cpy_tensor(ctx, src, dst); } static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) {