@@ -6845,7 +6845,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
     }
 }
 
-static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) {
+static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
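+    // dpitch is the row stride in the destination buffer; spitch remains the row stride of the source.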
     VK_LOG_DEBUG("ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")");
     // Check if src is pinned memory
     vk_buffer buf = nullptr;
@@ -6855,7 +6855,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
     if (buf != nullptr) {
         // Memory is pinned, use as staging buffer
         std::vector<vk::BufferCopy> slices(1);
-        if (width == spitch) {
+        if (width == spitch && width == dpitch) {
             // Only do single write if stride is equal
             slices[0].srcOffset = buf_offset;
             slices[0].dstOffset = offset;
@@ -6864,7 +6864,7 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
             slices.resize(height);
             for (size_t i = 0; i < height; i++) {
                 slices[i].srcOffset = buf_offset + i * spitch;
-                slices[i].dstOffset = offset + i * width;
+                slices[i].dstOffset = offset + i * dpitch;
                 slices[i].size = width;
             }
         }
@@ -6881,21 +6881,30 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
     }
 
     // Staging buffer required
-    const size_t copy_size = width*height;
-    ggml_vk_ensure_sync_staging_buffer(dst->device, copy_size);
+    const size_t staging_size = width * height;
+    ggml_vk_ensure_sync_staging_buffer(dst->device, staging_size);
 
     vk_buffer& staging_buffer = dst->device->sync_staging;
 
-    VkBufferCopy buf_copy = {
-        0,
-        offset,
-        copy_size};
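+    // Build the staging->dst copy regions: a single contiguous slice when rows are packed in dst, otherwise one slice per row.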
+    std::vector<vk::BufferCopy> slices(1);
+    if (width == dpitch) {
+        slices[0].srcOffset = 0;
+        slices[0].dstOffset = offset;
+        slices[0].size = staging_size;
+    } else {
+        slices.resize(height);
+        for (size_t i = 0; i < height; i++) {
+            slices[i].srcOffset = i * width;
+            slices[i].dstOffset = offset + i * dpitch;
+            slices[i].size = width;
+        }
+    }
 
     ggml_vk_sync_buffers(nullptr, subctx);
-    vkCmdCopyBuffer(subctx->s->buffer->buf, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy);
+    subctx->s->buffer->buf.copyBuffer((VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, slices);
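+    // The staging buffer itself is filled by the deferred memcpys below before this command buffer is submitted.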
 
     if (width == spitch) {
-        deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys);
+        deferred_memcpy((uint8_t *)staging_buffer->ptr, src, staging_size, &subctx->in_memcpys);
     } else {
         for (size_t i = 0; i < height; i++) {
             deferred_memcpy((uint8_t *)staging_buffer->ptr + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys);
@@ -6906,24 +6915,24 @@ static bool ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz
 static bool ggml_vk_buffer_write_async(vk_context subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_async(" << size << ")");
-    return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, 1, sync_staging);
+    return ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, size, size, size, 1, sync_staging);
 }
 
-static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) {
+static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t dpitch, size_t width, size_t height) {
     VK_LOG_DEBUG("ggml_vk_buffer_write_2d(" << width << ", " << height << ")");
     // Buffer is already mapped
     if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
 
         for (size_t i = 0; i < height; i++) {
-            memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
+            memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width);
         }
     } else {
         std::lock_guard<std::recursive_mutex> guard(dst->device->mutex);
 
         vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
         ggml_vk_ctx_begin(dst->device, subctx);
-        bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
+        bool ret = ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, dpitch, width, height, true);
         GGML_ASSERT(ret);
         ggml_vk_ctx_end(subctx);
@@ -6944,7 +6953,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
 
 static void ggml_vk_buffer_write(vk_buffer& dst, size_t offset, const void * src, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_write(" << size << ")");
-    ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1);
+    ggml_vk_buffer_write_2d(dst, offset, src, size, size, size, 1);
 }
 
 static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) {
@@ -6990,15 +6999,35 @@ static bool ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size
     }
 
     // Fall back to staging buffer
-    const size_t copy_size = dpitch * height;
-    ggml_vk_ensure_sync_staging_buffer(src->device, copy_size);
+    const size_t staging_size = width * height;
+    ggml_vk_ensure_sync_staging_buffer(src->device, staging_size);
 
     vk_buffer& staging_buffer = src->device->sync_staging;
 
-    ggml_vk_sync_buffers(nullptr, subctx);
-    subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, slices);
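+    // Gather the source rows (stride spitch) into tightly packed regions of the staging buffer.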
+    std::vector<vk::BufferCopy> staging_slices(1);
+    if (width == spitch) {
+        staging_slices[0].srcOffset = offset;
+        staging_slices[0].dstOffset = 0;
+        staging_slices[0].size = staging_size;
+    } else {
+        staging_slices.resize(height);
+        for (size_t i = 0; i < height; i++) {
+            staging_slices[i].srcOffset = offset + i * spitch;
+            staging_slices[i].dstOffset = i * width;
+            staging_slices[i].size = width;
+        }
+    }
 
-    deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys);
+    ggml_vk_sync_buffers(nullptr, subctx);
+    subctx->s->buffer->buf.copyBuffer(src->buffer, staging_buffer->buffer, staging_slices);
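+
+    // Scatter the packed staging rows into dst (stride dpitch) once the GPU copy has completed.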
+    if (width == dpitch) {
+        deferred_memcpy(dst, staging_buffer->ptr, staging_size, &subctx->out_memcpys);
+    } else {
+        for (size_t i = 0; i < height; i++) {
+            deferred_memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) staging_buffer->ptr + i * width, width, &subctx->out_memcpys);
+        }
+    }
     return true;
 }
@@ -7006,8 +7035,8 @@ static bool ggml_vk_buffer_read_async(vk_context subctx, vk_buffer& src, size_t
     return ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, size, size, size, 1, sync_staging);
 }
 
-static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
-    VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
+static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height) {
+    VK_LOG_DEBUG("ggml_vk_buffer_read_2d(" << src->buffer << ", " << offset << ", " << width << ", " << height << ")");
 
     // If the device is not an UMA device the memory is host-accessible through rebar. While writing
     // through PCIe is sufficient fast reading back data from PCIe is slower than going through
@@ -7015,18 +7044,20 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
     if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) {
         GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent);
 
-        memcpy(dst, (uint8_t *) src->ptr + offset, size);
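+        // Copy row by row, honoring the source (spitch) and destination (dpitch) strides.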
+        for (size_t i = 0; i < height; i++) {
+            memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width);
+        }
     } else {
         std::lock_guard<std::recursive_mutex> guard(src->device->mutex);
 
         vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
         ggml_vk_ctx_begin(src->device, subctx);
-        bool ret = ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
+        bool ret = ggml_vk_buffer_read_2d_async(subctx, src, offset, dst, spitch, dpitch, width, height, true);
         GGML_ASSERT(ret);
         ggml_vk_ctx_end(subctx);
 
         ggml_vk_submit(subctx, src->device->fence);
-        VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
+        VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read_2d waitForFences");
         src->device->device.resetFences({ src->device->fence });
         ggml_vk_queue_command_pools_cleanup(src->device);
@@ -7036,6 +7067,11 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
     }
 }
 
+static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_t size) {
+    VK_LOG_DEBUG("ggml_vk_buffer_read(" << src->buffer << ", " << offset << ", " << size << ")");
+    ggml_vk_buffer_read_2d(src, offset, dst, size, size, size, 1);
+}
+
 static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
     VK_LOG_DEBUG("ggml_vk_buffer_copy_async(" << size << ")");
     // Make sure both buffers are on same device
@@ -7067,7 +7103,7 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
         // Copy to src staging buffer
         ggml_vk_buffer_copy(src->device->sync_staging, 0, src, src_offset, size);
         // Copy to dst buffer
-        ggml_vk_buffer_write_2d(dst, dst_offset, src->device->sync_staging->ptr, 0, size, 1);
+        ggml_vk_buffer_write(dst, dst_offset, src->device->sync_staging->ptr, size);
     }
 }
@@ -13615,6 +13651,20 @@ static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml
     ggml_vk_buffer_write(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
+static void ggml_backend_vk_buffer_set_tensor_2d(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset,
+                                                 size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_set_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " <<
+        n_copies << ", " << stride_tensor << ", " << stride_data << ")");
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+    vk_buffer buf = buf_ctx->dev_buffer;
+
+    if (size == 0) {
+        return;
+    }
+
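+    // size bytes per copy, n_copies copies; stride_data is the pitch in the host data, stride_tensor the pitch in the tensor buffer.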
+    ggml_vk_buffer_write_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_data, stride_tensor, size, n_copies);
+}
+
 static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")");
     ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
@@ -13628,6 +13678,21 @@ static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, cons
     ggml_vk_buffer_read(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, size);
 }
 
+static void ggml_backend_vk_buffer_get_tensor_2d(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset,
+                                                 size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
+    VK_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor_2d(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ", " <<
+        n_copies << ", " << stride_tensor << ", " << stride_data << ")");
+    ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)buffer->context;
+
+    if (size == 0) {
+        return;
+    }
+
+    vk_buffer buf = buf_ctx->dev_buffer;
+
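+    // Note the argument order: stride_tensor is the source (device) pitch, stride_data the destination (host) pitch.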
+    ggml_vk_buffer_read_2d(buf, vk_tensor_offset(tensor) + tensor->view_offs + offset, data, stride_tensor, stride_data, size, n_copies);
+}
+
 static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_nbytes(src) == 0) {
         return true;
@@ -13662,8 +13727,8 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
     /* .memset_tensor = */ ggml_backend_vk_buffer_memset_tensor,
     /* .set_tensor    = */ ggml_backend_vk_buffer_set_tensor,
     /* .get_tensor    = */ ggml_backend_vk_buffer_get_tensor,
-    /* .set_tensor_2d = */ NULL,
-    /* .get_tensor_2d = */ NULL,
+    /* .set_tensor_2d = */ ggml_backend_vk_buffer_set_tensor_2d,
+    /* .get_tensor_2d = */ ggml_backend_vk_buffer_get_tensor_2d,
     /* .cpy_tensor    = */ ggml_backend_vk_buffer_cpy_tensor,
     /* .clear         = */ ggml_backend_vk_buffer_clear,
     /* .reset         = */ NULL,
@@ -13819,8 +13884,9 @@ static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_b
     return &ctx->device->buffer_type;
 }
 
-static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
+static void ggml_backend_vk_set_tensor_2d_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset,
+                                                size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
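+    // Writes n_copies blocks of size bytes; stride_tensor is the pitch in the tensor buffer, stride_data the pitch in the host data.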
+    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_2d_async(" << size << ", " << n_copies << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
@@ -13834,7 +13900,6 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
-
     if (ctx->device->async_use_transfer_queue) {
         if (ctx->transfer_ctx.expired()) {
             // Initialize new transfer context
            cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
             ctx->transfer_ctx = cpy_ctx;
             ggml_vk_ctx_begin(ctx->device, cpy_ctx);
@@ -13849,25 +13914,48 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
 
     auto dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
 
-    bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size);
+    bool ret = ggml_vk_buffer_write_2d_async(cpy_ctx, buf, dst_offset, data, stride_data, stride_tensor, size, n_copies);
 
     if (!ret) {
-        ggml_vk_ensure_sync_staging_buffer(ctx, size);
+        const size_t staging_size = size * n_copies;
+        ggml_vk_ensure_sync_staging_buffer(ctx, staging_size);
         ggml_vk_sync_buffers(nullptr, cpy_ctx);
 
-        vk::BufferCopy buffer_cpy;
-        buffer_cpy.srcOffset = 0;
-        buffer_cpy.dstOffset = dst_offset;
-        buffer_cpy.size = size;
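+        // A single contiguous copy suffices when the destination blocks are packed (size == stride_tensor); otherwise one region per block.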
+        std::vector<vk::BufferCopy> slices(1);
+        if (size == stride_tensor) {
+            slices[0].srcOffset = 0;
+            slices[0].dstOffset = dst_offset;
+            slices[0].size = staging_size;
+        } else {
+            slices.resize(n_copies);
+            for (size_t i = 0; i < n_copies; i++) {
+                slices[i].srcOffset = i * size;
+                slices[i].dstOffset = dst_offset + i * stride_tensor;
+                slices[i].size = size;
+            }
+        }
 
-        cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy });
-        deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys);
+        cpy_ctx->s->buffer->buf.copyBuffer(ctx->sync_staging->buffer, buf->buffer, slices);
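+
+        // Queue the host->staging memcpys; deferred in_memcpys are flushed before the command buffer executes on the GPU.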
+        if (size == stride_data) {
+            deferred_memcpy(ctx->sync_staging->ptr, data, staging_size, &cpy_ctx->in_memcpys);
+        } else {
+            for (size_t i = 0; i < n_copies; i++) {
+                deferred_memcpy((uint8_t *)ctx->sync_staging->ptr + i * size, (const uint8_t *)data + i * stride_data, size, &cpy_ctx->in_memcpys);
+            }
+        }
         ggml_vk_synchronize(ctx);
     }
 }
 
-static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
+static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    VK_LOG_DEBUG("ggml_backend_vk_set_tensor_async(" << size << ")");
+    ggml_backend_vk_set_tensor_2d_async(backend, tensor, data, offset, size, 1, size, size);
+}
+
+static void ggml_backend_vk_get_tensor_2d_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset,
+                                                size_t size, size_t n_copies, size_t stride_tensor, size_t stride_data) {
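+    // Reads n_copies blocks of size bytes from the tensor (pitch stride_tensor) into data (pitch stride_data).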
+    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_2d_async(" << size << ", " << n_copies << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
@@ -13882,24 +13970,45 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
     vk_buffer buf = buf_ctx->dev_buffer;
 
     auto src_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset;
-    bool ret = ggml_vk_buffer_read_async(compute_ctx, buf, src_offset, data, size);
+    bool ret = ggml_vk_buffer_read_2d_async(compute_ctx, buf, src_offset, data, stride_tensor, stride_data, size, n_copies);
 
     // If that failed, copy synchronously through a staging buffer
     if (!ret) {
-        ggml_vk_ensure_sync_staging_buffer(ctx, size);
+        const size_t staging_size = size * n_copies;
+        ggml_vk_ensure_sync_staging_buffer(ctx, staging_size);
         ggml_vk_sync_buffers(nullptr, compute_ctx);
 
-        vk::BufferCopy buffer_cpy;
-        buffer_cpy.srcOffset = src_offset;
-        buffer_cpy.dstOffset = 0;
-        buffer_cpy.size = size;
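+        // Mirror of the write path: gather the tensor blocks (pitch stride_tensor) into packed staging regions.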
+        std::vector<vk::BufferCopy> slices(1);
+        if (size == stride_tensor) {
+            slices[0].srcOffset = src_offset;
+            slices[0].dstOffset = 0;
+            slices[0].size = staging_size;
+        } else {
+            slices.resize(n_copies);
+            for (size_t i = 0; i < n_copies; i++) {
+                slices[i].srcOffset = src_offset + i * stride_tensor;
+                slices[i].dstOffset = i * size;
+                slices[i].size = size;
+            }
+        }
 
-        compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, { buffer_cpy });
-        deferred_memcpy(data, ctx->sync_staging->ptr, size, &compute_ctx->out_memcpys);
+        compute_ctx->s->buffer->buf.copyBuffer(buf->buffer, ctx->sync_staging->buffer, slices);
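+
+        // Staging->host memcpys are deferred out_memcpys, flushed after the GPU copy completes (see ggml_vk_synchronize below).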
+        if (size == stride_data) {
+            deferred_memcpy(data, ctx->sync_staging->ptr, staging_size, &compute_ctx->out_memcpys);
+        } else {
+            for (size_t i = 0; i < n_copies; i++) {
+                deferred_memcpy((uint8_t *)data + i * stride_data, (const uint8_t *)ctx->sync_staging->ptr + i * size, size, &compute_ctx->out_memcpys);
+            }
+        }
         ggml_vk_synchronize(ctx);
     }
 }
 
+static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    VK_LOG_DEBUG("ggml_backend_vk_get_tensor_async(" << size << ")");
+    ggml_backend_vk_get_tensor_2d_async(backend, tensor, data, offset, size, 1, size, size);
+}
+
 static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
     VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async(" << src << " -> " << dst << ", size=" << ggml_nbytes(src) << ")");
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context;
@@ -15123,8 +15232,8 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .free                 = */ ggml_backend_vk_free,
     /* .set_tensor_async     = */ ggml_backend_vk_set_tensor_async,
     /* .get_tensor_async     = */ ggml_backend_vk_get_tensor_async,
-    /* .get_tensor_2d_async  = */ NULL,
-    /* .set_tensor_2d_async  = */ NULL,
+    /* .set_tensor_2d_async  = */ ggml_backend_vk_set_tensor_2d_async,
+    /* .get_tensor_2d_async  = */ ggml_backend_vk_get_tensor_2d_async,
     /* .cpy_tensor_async     = */ ggml_backend_vk_cpy_tensor_async,
     /* .synchronize          = */ ggml_backend_vk_synchronize,
     /* .graph_plan_create    = */ NULL,