llama : add option to save memory in device buffers (llama/22679)

* llama : add option to save memory in device buffers

* tests : extend llama-save-load-state
This commit is contained in:
Georgi Gerganov 2026-05-05 06:35:07 +03:00
parent 4794432337
commit 6f6103f6d0
3 changed files with 54 additions and 8 deletions

View File

@ -282,6 +282,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf);
// tensor data helpers operating on a Metal buffer
// NOTE(review): the implementations of memset/set/get are not visible here - offset and size are
//               presumably byte counts relative to the start of the tensor data; confirm before relying on it
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void ggml_metal_buffer_set_tensor (ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_metal_buffer_get_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// copy the data of src (ggml_nbytes(src) bytes) into dst, where buf is the Metal buffer that holds dst
// src must be backed by a Metal buffer - the implementation reads src->buffer->context without checking
// returns true if the copy was performed, false otherwise
bool ggml_metal_buffer_cpy_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * src, struct ggml_tensor * dst);
// fill the buffer with value (for shared buffers this is a memset over the whole allocation)
void ggml_metal_buffer_clear (ggml_metal_buffer_t buf, uint8_t value);
// finds the Metal buffer that contains the tensor data on the GPU device

View File

@ -1,6 +1,7 @@
#import "ggml-metal-device.h"
#import "ggml-impl.h"
#import "ggml-backend-impl.h"
#include <Foundation/Foundation.h>
@ -1737,6 +1738,47 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
}
}
// Copy the data of src into dst, where dst is stored in the Metal buffer buf_dst.
//
// buf_dst - Metal buffer that holds dst
// src     - source tensor; its buffer context is cast to a Metal buffer without any check, so the
//           caller has to verify that src->buffer is a Metal buffer before calling
// dst     - destination tensor; ggml_nbytes(src) bytes are written to it
//
// Returns true when the data has been copied. Returns false when the copy was not performed:
// either tensor could not be resolved to a Metal buffer, or the blit command buffer did not
// reach the completed state.
bool ggml_metal_buffer_cpy_tensor(ggml_metal_buffer_t buf_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_metal_buffer_t buf_src = (ggml_metal_buffer_t)src->buffer->context;

    const size_t size = ggml_nbytes(src);

    // if both buffers are shared, we can use memcpy directly
    if (buf_dst->is_shared && buf_src->is_shared) {
        memcpy(dst->data, src->data, size);

        return true;
    }

    // at least one of the buffers is private, so the copy has to go through a Metal blit command
    @autoreleasepool {
        struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf_src, src);
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf_dst, dst);

        if (bid_src.metal == nil || bid_dst.metal == nil) {
            return false;
        }

        id<MTLCommandBuffer> cmd_buf = [buf_dst->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:bid_src.metal
                       sourceOffset:bid_src.offs
                           toBuffer:bid_dst.metal
                  destinationOffset:bid_dst.offs
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];

        // do not report success unless the GPU actually finished the blit
        // note: this also covers a nil command buffer - messaging nil yields status 0 (not enqueued)
        if ([cmd_buf status] != MTLCommandBufferStatusCompleted) {
            return false;
        }
    }

    return true;
}
void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
if (buf->is_shared) {
memset(buf->all_data, value, buf->all_size);

View File

@ -17,6 +17,9 @@
// defaults to 1
// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
static int g_devices = 1;
// forward declaration - used by the buffer cpy_tensor callbacks to reject sources that are not
// Metal buffers; the function is static, so its definition lives elsewhere in this file
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer);
////////////////////////////////////////////////////////////////////////////////
// backend interface
////////////////////////////////////////////////////////////////////////////////
@ -68,11 +71,11 @@ static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t bu
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
GGML_UNUSED(buffer);
GGML_UNUSED(src);
GGML_UNUSED(dst);
if (!ggml_backend_buffer_is_metal(src->buffer)) {
return false;
}
return false;
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
}
static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@ -144,11 +147,11 @@ static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t b
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
GGML_UNUSED(buffer);
GGML_UNUSED(src);
GGML_UNUSED(dst);
if (!ggml_backend_buffer_is_metal(src->buffer)) {
return false;
}
return false;
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
}
static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) {