llama : add option to save memory in device buffers (llama/22679)
* llama : add option to save memory in device buffers * tests : extend llama-save-load-state
This commit is contained in:
parent
4794432337
commit
6f6103f6d0
|
|
@ -282,6 +282,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf);
|
|||
void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void ggml_metal_buffer_set_tensor (ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void ggml_metal_buffer_get_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
// copies the data of tensor src (ggml_nbytes(src) bytes) into tensor dst
// - buf is the Metal buffer used to resolve dst
// - src must be backed by a Metal buffer: its buffer context is used as the source ggml_metal_buffer_t
// - returns true if the copy was performed, false if it could not be (e.g. no Metal buffer found for a tensor)
bool ggml_metal_buffer_cpy_tensor (ggml_metal_buffer_t buf, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void ggml_metal_buffer_clear (ggml_metal_buffer_t buf, uint8_t value);
|
||||
|
||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#import "ggml-metal-device.h"
|
||||
|
||||
#import "ggml-impl.h"
|
||||
#import "ggml-backend-impl.h"
|
||||
|
||||
#include <Foundation/Foundation.h>
|
||||
|
||||
|
|
@ -1737,6 +1738,47 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
|
|||
}
|
||||
}
|
||||
|
||||
// Copy the data of tensor `src` into tensor `dst`.
//
//   buf_dst - Metal buffer used to resolve `dst`
//   src     - source tensor; src->buffer->context is used as the source ggml_metal_buffer_t, so the
//             caller must make sure that `src` is backed by a Metal buffer before calling
//   dst     - destination tensor
//
// ggml_nbytes(src) bytes are copied. Returns true when the copy has been performed, false when
// either tensor cannot be resolved to a Metal buffer or the blit did not complete successfully.
// The call is synchronous: on the blit path it blocks until the command buffer has finished.
bool ggml_metal_buffer_cpy_tensor(ggml_metal_buffer_t buf_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_metal_buffer_t buf_src = (ggml_metal_buffer_t)src->buffer->context;

    const size_t size = ggml_nbytes(src);

    // if both buffers are shared, we can use memcpy directly
    if (buf_dst->is_shared && buf_src->is_shared) {
        memcpy(dst->data, src->data, size);
        return true;
    }

    // for private buffers, we need to use Metal blit commands
    @autoreleasepool {
        struct ggml_metal_buffer_id bid_src = ggml_metal_buffer_get_id(buf_src, src);
        struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf_dst, dst);

        if (bid_src.metal == nil || bid_dst.metal == nil) {
            return false;
        }

        id<MTLCommandBuffer> cmd_buf = [buf_dst->dev->mtl_queue commandBufferWithUnretainedReferences];

        {
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:bid_src.metal
                       sourceOffset:bid_src.offs
                           toBuffer:bid_dst.metal
                  destinationOffset:bid_dst.offs
                               size:size];

            [encoder endEncoding];
        }

        [cmd_buf commit];
        [cmd_buf waitUntilCompleted];

        // do not report success unless the GPU actually finished the copy:
        // - a failed command buffer ends in MTLCommandBufferStatusError
        // - a nil command buffer (creation failed) yields status 0, which is not "completed" either
        if ([cmd_buf status] != MTLCommandBufferStatusCompleted) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
|
||||
if (buf->is_shared) {
|
||||
memset(buf->all_data, value, buf->all_size);
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@
|
|||
// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
|
||||
static int g_devices = 1;
|
||||
|
||||
// forward declaration
|
||||
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// backend interface
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
@ -68,11 +71,11 @@ static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t bu
|
|||
|
||||
GGML_ASSERT(ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
if (!ggml_backend_buffer_is_metal(src->buffer)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
|
|
@ -144,11 +147,11 @@ static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t b
|
|||
|
||||
GGML_ASSERT(!ggml_metal_buffer_is_shared(ctx));
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
GGML_UNUSED(src);
|
||||
GGML_UNUSED(dst);
|
||||
if (!ggml_backend_buffer_is_metal(src->buffer)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
return ggml_metal_buffer_cpy_tensor(ctx, src, dst);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue