talk-llama : sync llama.cpp

This commit is contained in:
Georgi Gerganov 2026-05-14 11:53:43 +03:00
parent 69500f5502
commit 46ca43d639
2 changed files with 24 additions and 5 deletions

View File

@@ -2475,11 +2475,29 @@ public:
}
if (need_alloc) {
mbuf_cur = std::move(mbuf);
if (!mbuf_cur.buf || mbuf_cur.total_size != mbuf.total_size) {
mbuf_cur = std::move(mbuf);
mbuf_cur.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(mbuf_cur.ctx.get(), buft));
mbuf_cur.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(mbuf_cur.ctx.get(), buft));
LLAMA_LOG_INFO("%s: allocated '%s' buffer %.3f MiB\n", __func__, ggml_backend_buft_name(buft), mbuf.total_size/1024.0/1024.0);
LLAMA_LOG_INFO("%s: allocated '%s' buffer %.3f MiB\n", __func__, ggml_backend_buft_name(buft), mbuf.total_size/1024.0/1024.0);
} else {
//LLAMA_LOG_INFO("%s: reallocating tensors in '%s' buffer %.3f MiB\n", __func__, ggml_backend_buft_name(buft), mbuf.total_size/1024.0/1024.0);
// save the old buffer and allocate the new tensors in it
auto buf = std::move(mbuf_cur.buf);
mbuf_cur = std::move(mbuf);
ggml_tallocr talloc = ggml_tallocr_new(buf.get());
for (size_t i = 0; i < mbuf_cur.org.size(); ++i) {
ggml_backend_view_init(mbuf_cur.org[i]);
ggml_tallocr_alloc(&talloc, mbuf_cur.cpy[i]);
}
mbuf_cur.buf = std::move(buf);
}
}
for (size_t i = 0; i < mbuf_cur.org.size(); ++i) {
@@ -2559,8 +2577,7 @@ public:
mbuf.org.push_back(ggml_view_1d(mbuf.ctx.get(), rinfo.tensor, n, rinfo.offset));
auto & view = mbuf.org.back();
view->buffer = rinfo.tensor->buffer;
ggml_backend_view_init(mbuf.org.back());
}
for (auto & [buft, mbuf] : mbufs_new) {

View File

@@ -858,6 +858,8 @@ extern "C" {
size_t n_token_capacity,
size_t * n_token_count_out);
#define LLAMA_STATE_SEQ_FLAGS_NONE 0
// for backwards-compat
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1