talk-llama : sync llama.cpp

This commit is contained in:
Georgi Gerganov 2026-05-25 12:18:31 +03:00
parent 0a62a579cc
commit 865ec171aa
129 changed files with 1593 additions and 395 deletions

View File

@ -757,14 +757,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
// These tensors only exist in the last layer(s) and are treated as output tensors
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
// NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
// last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
// the model loader doesn't fault on the block index.
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
// Nemotron 3 Super
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@ -877,6 +878,16 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
}
}
bool llm_arch_supports_rs_rollback(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_QWEN35:
case LLM_ARCH_QWEN35MOE:
return true;
default:
return false;
}
}
bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
switch (arch) {
case LLM_ARCH_GROK:

View File

@ -637,3 +637,4 @@ bool llm_arch_is_recurrent (const llm_arch & arch);
bool llm_arch_is_hybrid (const llm_arch & arch);
bool llm_arch_is_diffusion (const llm_arch & arch);
bool llm_arch_supports_sm_tensor(const llm_arch & arch);
bool llm_arch_supports_rs_rollback(const llm_arch & arch);

View File

@ -73,7 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
{ "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_begin▁of▁sentence>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
return LLM_CHAT_TEMPLATE_HUNYUAN_VL;
} else if (tmpl_contains("<hy_Assistant>") && tmpl_contains("<hy_place▁holder▁no▁3>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@ -825,8 +825,8 @@ int32_t llm_chat_apply_template(
ss << "<hy_User>" << chat[i]->content << "<hy_Assistant>";
}
}
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
// tencent/HunyuanOCR
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) {
// tencent/HunyuanOCR & tencent/HunyuanVL
ss << "<hy_begin▁of▁sentence>";
for (size_t i = 0; i < chat.size(); i++) {
std::string role(chat[i]->role);

View File

@ -53,7 +53,7 @@ enum llm_chat_template {
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
LLM_CHAT_TEMPLATE_HUNYUAN_VL,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,

View File

@ -2,6 +2,7 @@
#include "ggml.h"
#include "llama-arch.h"
#include "llama-graph.h"
#include "llama-impl.h"
#include "llama-batch.h"
#include "llama-io.h"
@ -21,6 +22,14 @@
// llama_context
//
static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) {
switch (ctx_type) {
case LLAMA_CONTEXT_TYPE_DEFAULT: return LLM_GRAPH_TYPE_DEFAULT;
case LLAMA_CONTEXT_TYPE_MTP : return LLM_GRAPH_TYPE_DECODER_MTP;
}
throw std::runtime_error("Unsupported ctx type");
}
llama_context::llama_context(
const llama_model & model,
llama_context_params params) :
@ -42,13 +51,22 @@ llama_context::llama_context(
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
}
cparams.n_rs_seq = params.n_rs_seq;
if (cparams.n_rs_seq > 0 && !llm_arch_supports_rs_rollback(model.arch)) {
LLAMA_LOG_DEBUG("%s: n_rs_seq=%u requested but model arch does not support recurrent partial rollback; clamping to 0\n",
__func__, cparams.n_rs_seq);
cparams.n_rs_seq = 0;
}
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;
cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
cparams.embeddings = params.embeddings;
cparams.embeddings = params.embeddings;
cparams.embeddings_pre_norm = false;
cparams.embeddings_pre_norm_masked = false;
cparams.offload_kqv = params.offload_kqv;
cparams.no_perf = params.no_perf;
cparams.pooling_type = params.pooling_type;
@ -65,6 +83,8 @@ llama_context::llama_context(
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.ctx_type = params.ctx_type;
// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
@ -206,6 +226,7 @@ llama_context::llama_context(
LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
LLAMA_LOG_INFO("%s: n_rs_seq = %u\n", __func__, cparams.n_rs_seq);
if (cparams.n_ctx_seq < hparams.n_ctx_train) {
LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
@ -278,6 +299,7 @@ llama_context::llama_context(
/*.type_k =*/ params.type_k,
/*.type_v =*/ params.type_v,
/*.swa_full =*/ params.swa_full,
/*.ctx_type= */ cparams.ctx_type,
};
memory.reset(model.create_memory(params_mem, cparams));
@ -860,6 +882,42 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
return it->second.data();
}
float * llama_context::get_embeddings_pre_norm() {
output_reorder();
return embd_pre_norm.data;
}
float * llama_context::get_embeddings_pre_norm_ith(int32_t i) {
output_reorder();
try {
if (embd_pre_norm.data == nullptr) {
throw std::runtime_error("no pre-norm embeddings");
}
const uint32_t n_embd = model.hparams.n_embd;
if (!cparams.embeddings_pre_norm_masked) {
// unmasked: pre-norm rows are stored densely, indexed by raw token position.
if (i < 0 || (size_t)(i + 1) * n_embd > embd_pre_norm.size) {
throw std::runtime_error(format("out of range [0, %zu)", embd_pre_norm.size / n_embd));
}
return embd_pre_norm.data + (size_t) i * n_embd;
}
const int64_t j = output_resolve_row(i);
return embd_pre_norm.data + j*n_embd;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid pre-norm embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
GGML_ABORT("fatal error");
#else
return nullptr;
#endif
}
}
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
output_reorder();
@ -1040,6 +1098,13 @@ void llama_context::set_embeddings(bool value) {
//sched_need_reserve = true;
}
void llama_context::set_embeddings_pre_norm(bool value, bool masked) {
LLAMA_LOG_DEBUG("%s: value = %d, masked = %d\n", __func__, value, masked);
cparams.embeddings_pre_norm = value;
cparams.embeddings_pre_norm_masked = masked;
}
void llama_context::set_causal_attn(bool value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
@ -1072,6 +1137,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
static bool warned = false;
if (!warned) {
LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__);
warned = true;
}
if (sampling.samplers.count(seq_id) > 0) {
sched_need_reserve = true;
}
sampling.samplers.erase(seq_id);
return false;
}
const bool can_offload =
sampler &&
sampler->iface->backend_init &&
@ -1241,7 +1319,9 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
}
int llama_context::encode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
// MTP hook batches carry both token (next-token id) and embd (h_pre_norm row),
// so accept either present rather than requiring exactly one.
GGML_ASSERT(batch_inp.token || batch_inp.embd);
if (batch_inp.n_tokens == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@ -1312,8 +1392,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}
auto * t_logits = res->get_logits();
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
auto * t_logits = res->get_logits();
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr;
// extract logits
if (logits.data && t_logits) {
@ -1379,6 +1460,16 @@ int llama_context::encode(const llama_batch & batch_inp) {
}
}
// extract pre-norm embeddings (hidden state before the final output norm)
if (embd_pre_norm.data && t_h_pre_norm && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
GGML_ASSERT(backend_h != nullptr);
const uint32_t n_embd = hparams.n_embd;
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_pre_norm.size);
ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm.data, 0, n_tokens*n_embd*sizeof(float));
}
// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd;
@ -1531,7 +1622,9 @@ static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_s
}
int llama_context::decode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
// MTP hook batches carry both token (next-token id) and embd (h_pre_norm row),
// so accept either present rather than requiring exactly one.
GGML_ASSERT(batch_inp.token || batch_inp.embd);
if (!memory) {
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
@ -1668,6 +1761,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
};
int64_t n_outputs_prev = 0;
int64_t n_tokens_prev = 0;
do {
const auto & ubatch = mctx->get_ubatch();
@ -1689,7 +1783,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
ggml_status status;
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
const auto * res = process_ubatch(ubatch, ctx_type_to_graph_type(cparams.ctx_type), mctx.get(), status);
if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
@ -1727,8 +1822,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
auto * t_logits = res->get_logits();
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
auto * t_logits = res->get_logits();
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr;
if (t_embd && res->get_embd_pooled()) {
t_embd = res->get_embd_pooled();
@ -1809,6 +1905,25 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
}
// extract pre-norm embeddings (hidden state before the final output norm)
// only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
{
const bool masked = cparams.embeddings_pre_norm_masked;
const int64_t n_rows = masked ? n_outputs : (int64_t) ubatch.n_tokens;
const int64_t offset = masked ? n_outputs_prev : n_tokens_prev;
if (embd_pre_norm.data && t_h_pre_norm && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
GGML_ASSERT(backend_h != nullptr);
const uint32_t n_embd = hparams.n_embd;
float * embd_pre_norm_out = embd_pre_norm.data + offset*n_embd;
GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_pre_norm.size);
ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_rows*n_embd*sizeof(float));
}
}
// Copy backend sampling output if this ubatch produced any sampling tensors.
if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) {
const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
@ -1823,6 +1938,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
n_outputs_prev += n_outputs;
n_tokens_prev += ubatch.n_tokens;
} while (mctx->next());
// set to total number of outputs in the batch, for use in llama_get_logits_ith
@ -1893,10 +2009,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_batch = cparams.n_batch;
const auto n_vocab = vocab.n_tokens();
const auto n_embd = hparams.n_embd;
const auto n_embd_out = hparams.n_embd_out();
bool has_logits = true;
bool has_embd = cparams.embeddings;
bool has_logits = true;
bool has_embd = cparams.embeddings;
bool has_embd_pre_norm = cparams.embeddings_pre_norm;
// TODO: hacky enc-dec support
if (model.arch == LLM_ARCH_T5) {
@ -1908,8 +2026,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
size_t backend_float_count = 0;
size_t backend_token_count = 0;
logits.size = has_logits ? n_vocab*n_outputs_max : 0;
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
logits.size = has_logits ? n_vocab*n_outputs_max : 0;
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max : 0;
if (has_embd_pre_norm && !cparams.embeddings_pre_norm_masked) {
// unmasked: pre-norm row exists for every token in the batch, not just
// those flagged via batch.logits[i] -> size by token count instead.
embd_pre_norm.size = (size_t) n_embd * n_batch;
}
// Allocate backend sampling output buffers if there are backend samplers configured.
const bool has_sampling = !sampling.samplers.empty();
@ -1925,8 +2050,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
const size_t new_size =
(logits.size + embd.size + backend_float_count) * sizeof(float) +
( backend_token_count) * sizeof(llama_token);
(logits.size + embd.size + embd_pre_norm.size + backend_float_count) * sizeof(float) +
( backend_token_count) * sizeof(llama_token);
// alloc only when more than the current capacity is required
// TODO: also consider shrinking the buffer
@ -1942,6 +2067,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
buf_output = nullptr;
logits.data = nullptr;
embd.data = nullptr;
embd_pre_norm.data = nullptr;
}
auto * buft = ggml_backend_cpu_buffer_type();
@ -1970,6 +2096,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
embd = has_embd ? buffer_view<float>{(float *) (base + offset), embd.size} : buffer_view<float>{nullptr, 0};
offset += embd.size * sizeof(float);
embd_pre_norm = has_embd_pre_norm ? buffer_view<float>{(float *) (base + offset), embd_pre_norm.size} : buffer_view<float>{nullptr, 0};
offset += embd_pre_norm.size * sizeof(float);
if (has_sampling) {
sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
offset += sampling.logits.size * sizeof(float);
@ -2034,6 +2163,12 @@ void llama_context::output_reorder() {
}
}
if (embd_pre_norm.size > 0) {
for (uint64_t k = 0; k < n_embd; k++) {
std::swap(embd_pre_norm.data[i0*n_embd + k], embd_pre_norm.data[i1*n_embd + k]);
}
}
if (!sampling.samplers.empty()) {
assert(sampling.logits.size > 0);
assert(sampling.probs.size > 0);
@ -2121,7 +2256,7 @@ ggml_cgraph * llama_context::graph_reserve(
auto * res = gf_res_reserve.get();
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
const auto gparams = graph_params(res, ubatch, mctx, ctx_type_to_graph_type(cparams.ctx_type));
res->reset();
@ -3100,7 +3235,7 @@ void llama_context::opt_epoch_iter(
auto * res = gf_res_prev.get();
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
const auto gparams = graph_params(res, ubatch, mctx.get(), ctx_type_to_graph_type(cparams.ctx_type));
res->reset();
@ -3201,8 +3336,10 @@ llama_context_params llama_context_default_params() {
/*.n_batch =*/ 2048,
/*.n_ubatch =*/ 512,
/*.n_seq_max =*/ 1,
/*.n_rs_seq =*/ 0,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
/*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT,
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
@ -3306,6 +3443,13 @@ llama_context * llama_init_from_model(
model->hparams.pooling_type, params.pooling_type);
}
if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
model->hparams.nextn_predict_layers == 0) {
LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
return nullptr;
}
try {
auto * ctx = new llama_context(*model, params);
return ctx;
@ -3347,6 +3491,10 @@ uint32_t llama_n_seq_max(const llama_context * ctx) {
return ctx->n_seq_max();
}
uint32_t llama_n_rs_seq(const llama_context * ctx) {
return ctx->get_cparams().n_rs_seq;
}
const llama_model * llama_get_model(const llama_context * ctx) {
return &ctx->get_model();
}
@ -3436,6 +3584,22 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
return ctx->get_embeddings_seq(seq_id);
}
void llama_set_embeddings_pre_norm(llama_context * ctx, bool value, bool masked) {
ctx->set_embeddings_pre_norm(value, masked);
}
float * llama_get_embeddings_pre_norm(llama_context * ctx) {
ctx->synchronize();
return ctx->get_embeddings_pre_norm();
}
float * llama_get_embeddings_pre_norm_ith(llama_context * ctx, int32_t i) {
ctx->synchronize();
return ctx->get_embeddings_pre_norm_ith(i);
}
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
return ctx->set_sampler(seq_id, smpl);
}

View File

@ -84,6 +84,9 @@ struct llama_context {
float * get_embeddings_ith(int32_t i);
float * get_embeddings_seq(llama_seq_id seq_id);
float * get_embeddings_pre_norm();
float * get_embeddings_pre_norm_ith(int32_t i);
llama_token * get_sampled_tokens() const;
llama_token get_sampled_token_ith(int32_t idx);
@ -107,6 +110,7 @@ struct llama_context {
void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);
void set_embeddings (bool value);
void set_embeddings_pre_norm(bool value, bool masked);
void set_causal_attn(bool value);
void set_warmup(bool value);
@ -278,6 +282,11 @@ private:
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
buffer_view<float> embd = {nullptr, 0};
// hidden state before the final output norm (2-dimensional array: [n_outputs][n_embd])
// populated only when cparams.embeddings_pre_norm is enabled and the model graph
// sets llm_graph_result::t_h_pre_norm
buffer_view<float> embd_pre_norm = {nullptr, 0};
struct sampling_info {
// !samplers.empty() to check if any samplers are active
std::map<llama_seq_id, llama_sampler *> samplers;

View File

@ -12,6 +12,7 @@ struct llama_cparams {
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
@ -27,6 +28,8 @@ struct llama_cparams {
float yarn_beta_slow;
bool embeddings;
bool embeddings_pre_norm; // also extract the hidden state before the final output norm
bool embeddings_pre_norm_masked; // extract for only rows where batch.logits != 0
bool causal_attn;
bool offload_kqv;
bool flash_attn;
@ -40,6 +43,7 @@ struct llama_cparams {
bool kv_unified;
bool pipeline_parallel;
enum llama_context_type ctx_type;
enum llama_pooling_type pooling_type;
ggml_backend_sched_eval_callback cb_eval;

View File

@ -88,3 +88,19 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
//
// pre-norm embeddings (hidden state before the final output norm)
//
// Set whether the context outputs pre-norm embeddings or not
// If masked == true, output the embeddings only for the tokens with batch.logits != 0
// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value, bool masked);
// mirrors:
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
LLAMA_API float * llama_get_embeddings_pre_norm (struct llama_context * ctx);
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i);

View File

@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
}
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
// base tensors may not be allocated if there are no non-SWA attention layers
if (self_k_idxs && self_k_idxs->buffer) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (self_k_rot) {
mctx->get_base()->set_input_k_rot(self_k_rot);
@ -534,14 +540,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
bool res = true;
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
// base tensors may not be allocated if there are no non-SWA attention layers
if (self_k_idxs && self_k_idxs->buffer) {
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
}
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
}
return res;
}
@ -848,6 +861,9 @@ void llm_graph_result::set_outputs() {
if (t_embd_pooled != nullptr) {
ggml_set_output(t_embd_pooled);
}
if (t_h_pre_norm != nullptr) {
ggml_set_output(t_h_pre_norm);
}
for (auto & [seq_id, t] : t_sampled) {
if (t != nullptr) {
ggml_set_output(t);
@ -2528,7 +2544,8 @@ ggml_tensor * llm_graph_context::build_rs(
int32_t rs_zero,
const llm_graph_get_rows_fn & get_state_rows) const {
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
GGML_UNUSED(rs_size);
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, s->ne[1]);
// Clear a single state which will then be copied to the other cleared states.
// Note that this is a no-op when the view is zero-sized.

View File

@ -32,6 +32,7 @@ enum llm_graph_type {
LLM_GRAPH_TYPE_DEFAULT,
LLM_GRAPH_TYPE_ENCODER,
LLM_GRAPH_TYPE_DECODER,
LLM_GRAPH_TYPE_DECODER_MTP,
};
enum llm_ffn_op_type {
@ -580,7 +581,8 @@ struct llm_graph_params {
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
(
(!ubatch.token && !other.ubatch.token) ||
(!ubatch.embd && !other.ubatch.embd)
(!ubatch.embd && !other.ubatch.embd) ||
(ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd)
);
// when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
@ -644,6 +646,7 @@ public:
ggml_tensor * get_logits() const { return t_logits; }
ggml_tensor * get_embd() const { return t_embd; }
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; }
ggml_cgraph * get_gf() const { return gf; }
ggml_context * get_ctx() const { return ctx_compute.get(); }
@ -672,6 +675,7 @@ public:
ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr;
ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
std::map<llama_seq_id, ggml_tensor*> t_candidates;

View File

@ -229,6 +229,12 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
}
bool llama_hparams::has_kv(uint32_t il) const {
if (kv_only_nextn) {
// MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
// the leading trunk blocks are not executed in this graph.
return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
}
if (n_layer_kv_from_start >= 0) {
if (il < (uint32_t) n_layer_kv_from_start) {
return true;

View File

@ -92,6 +92,8 @@ struct llama_hparams {
uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;
bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
float f_norm_eps;
float f_norm_rms_eps;
float f_norm_group_eps;

View File

@ -24,6 +24,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
uint32_t n_rs_seq,
bool offload,
bool unified,
/* layer filters */
@ -54,6 +55,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
offload,
rs_size,
n_seq_max,
n_rs_seq,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
@ -73,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
// if all tokens are output, split by sequence
ubatch = balloc.split_seq(n_ubatch);
} else {
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
ubatch = balloc.split_equal(n_ubatch, !unified);
if (mem_recr->n_rs_seq > 0) {
// [TAG_RECURRENT_ROLLBACK_SPLITS]
// TODO: recurrent state rollback does not support equal splits
ubatch = balloc.split_seq(n_ubatch);
} else {
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
ubatch = balloc.split_equal(n_ubatch, !unified);
}
}
if (ubatch.n_tokens == 0) {

View File

@ -34,6 +34,7 @@ public:
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
uint32_t n_rs_seq,
bool offload,
bool unified,
/* layer filters */

View File

@ -24,6 +24,7 @@ llama_memory_hybrid::llama_memory_hybrid(
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
uint32_t n_rs_seq,
bool offload,
bool unified,
/* layer filters */
@ -54,6 +55,7 @@ llama_memory_hybrid::llama_memory_hybrid(
offload,
rs_size,
n_seq_max,
n_rs_seq,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr
@ -73,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
// if all tokens are output, split by sequence
ubatch = balloc.split_seq(n_ubatch);
} else {
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
const bool unified = (mem_attn->get_n_stream() == 1);
ubatch = balloc.split_equal(n_ubatch, !unified);
if (mem_recr->n_rs_seq > 0) {
// [TAG_RECURRENT_ROLLBACK_SPLITS]
// TODO: recurrent state rollback does not support equal splits
ubatch = balloc.split_seq(n_ubatch);
} else {
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
const bool unified = (mem_attn->get_n_stream() == 1);
ubatch = balloc.split_equal(n_ubatch, !unified);
}
}
if (ubatch.n_tokens == 0) {

View File

@ -34,6 +34,7 @@ public:
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
uint32_t n_rs_seq,
bool offload,
bool unified,
/* layer filters */

View File

@ -24,6 +24,7 @@ llama_memory_recurrent::llama_memory_recurrent(
bool offload,
uint32_t mem_size,
uint32_t n_seq_max,
uint32_t n_rs_seq,
const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
const int32_t n_layer = hparams.n_layer;
@ -31,6 +32,9 @@ llama_memory_recurrent::llama_memory_recurrent(
size = mem_size;
used = 0;
this->n_rs_seq = n_rs_seq;
rs_idx.assign(n_seq_max, 0);
cells.clear();
cells.resize(mem_size);
@ -92,8 +96,9 @@ llama_memory_recurrent::llama_memory_recurrent(
throw std::runtime_error("failed to create ggml context for rs cache");
}
ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), mem_size);
ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), mem_size);
const uint32_t n_rows = mem_size * (1 + n_rs_seq);
ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), n_rows);
ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), n_rows);
ggml_format_name(r, "cache_r_l%d", i);
ggml_format_name(s, "cache_s_l%d", i);
r_l[i] = r;
@ -115,8 +120,8 @@ llama_memory_recurrent::llama_memory_recurrent(
const size_t memory_size_r = size_r_bytes();
const size_t memory_size_s = size_s_bytes();
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
(float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs %2u rs_seq), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
(float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max, n_rs_seq,
ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
}
@ -138,10 +143,11 @@ void llama_memory_recurrent::clear(bool data) {
ggml_backend_buffer_clear(buf.get(), 0);
}
}
std::fill(rs_idx.begin(), rs_idx.end(), 0);
}
bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
//printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
uint32_t new_head = size;
if (p0 < 0) {
@ -152,6 +158,15 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
p1 = std::numeric_limits<llama_pos>::max();
}
const bool rm_all = p0 == 0 && p1 == std::numeric_limits<llama_pos>::max();
if (rm_all) {
if (seq_id >= 0) {
set_rs_idx(seq_id, 0);
} else {
std::fill(rs_idx.begin(), rs_idx.end(), 0);
}
}
// models like Mamba or RWKV can't have a state partially erased at the end
// of the sequence because their state isn't preserved for previous tokens
if (seq_id >= (int64_t) size) {
@ -161,10 +176,16 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
if (0 <= seq_id) {
int32_t & tail_id = cells[seq_id].tail;
if (tail_id >= 0) {
const auto & cell = cells[tail_id];
// partial intersection is invalid if it includes the final pos
auto & cell = cells[tail_id];
// partial rollback via per-token snapshot index (bounded by n_rs_seq)
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
const llama_pos rollback = cell.pos - (p0 - 1);
if (rollback >= 1 && rollback <= (llama_pos) n_rs_seq) {
set_rs_idx(seq_id, (uint32_t) rollback);
cell.pos = p0 - 1;
return true;
}
return false;
}
// invalidate tails which will be cleared
@ -368,6 +389,13 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
return result;
}
void llama_memory_recurrent::set_rs_idx(llama_seq_id seq_id, uint32_t idx) {
if (seq_id < 0 || (size_t) seq_id >= rs_idx.size()) {
return;
}
rs_idx[seq_id] = (idx > n_rs_seq) ? n_rs_seq : idx;
}
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [_, buf] : ctxs_bufs) {
@ -388,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
// if all tokens are output, split by sequence
ubatch = balloc.split_seq(n_ubatch);
} else {
// TODO: non-sequential equal split can be done if using unified KV cache
// for simplicity, we always use sequential equal split for now
ubatch = balloc.split_equal(n_ubatch, true);
if (n_rs_seq > 0) {
// [TAG_RECURRENT_ROLLBACK_SPLITS]
// TODO: recurrent state rollback does not support equal splits
ubatch = balloc.split_seq(n_ubatch);
} else {
// TODO: non-sequential equal split can be done if using unified KV cache
// for simplicity, we always use sequential equal split for now
ubatch = balloc.split_equal(n_ubatch, true);
}
}
if (ubatch.n_tokens == 0) {
@ -703,6 +737,7 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
GGML_UNUSED(flags);
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges_data; // logical source row ranges
uint32_t cell_count = 0;
// Count the number of cells with the specified seq_id
@ -712,6 +747,35 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
const auto & cell = cells[i];
if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
++cell_count;
uint32_t rs_idx_cur = 0;
if (n_rs_seq != 0) {
if (seq_id != -1) {
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < rs_idx.size());
rs_idx_cur = rs_idx[seq_id];
} else {
bool has_rs_idx = false;
for (const llama_seq_id cell_seq_id : cell.seq_id) {
GGML_ASSERT(cell_seq_id >= 0 && (size_t) cell_seq_id < rs_idx.size());
const uint32_t seq_rs_idx = rs_idx[cell_seq_id];
if (!has_rs_idx) {
rs_idx_cur = seq_rs_idx;
has_rs_idx = true;
} else if (rs_idx_cur != seq_rs_idx) {
GGML_ABORT("cannot write shared recurrent state with different rollback indices");
}
}
}
}
const uint32_t cell_id = rs_idx_cur * size + (cell.src >= 0 ? cell.src : (int32_t) i);
if (cell_ranges_data.empty() || cell_ranges_data.back().second != cell_id) {
cell_ranges_data.emplace_back(cell_id, cell_id + 1);
} else {
cell_ranges_data.back().second++;
}
if (cell_range_begin == size) {
cell_range_begin = i;
}
@ -726,7 +790,7 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
cell_ranges.emplace_back(cell_range_begin, size);
}
if (flags % LLAMA_STATE_SEQ_FLAGS_ON_DEVICE && cell_ranges.size() > 1) {
if ((flags & LLAMA_STATE_SEQ_FLAGS_ON_DEVICE) && cell_ranges.size() > 1) {
GGML_ABORT("cannot save/load multiple ranges of cells to/from device memory\n");
}
@ -737,10 +801,16 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
}
GGML_ASSERT(cell_count == cell_count_check);
cell_count_check = 0;
for (const auto & range : cell_ranges_data) {
cell_count_check += range.second - range.first;
}
GGML_ASSERT(cell_count == cell_count_check);
io.write(&cell_count, sizeof(cell_count));
state_write_meta(io, cell_ranges, seq_id);
state_write_data(io, cell_ranges);
state_write_data(io, cell_ranges_data);
}
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
@ -762,6 +832,14 @@ void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_i
}
throw std::runtime_error("failed to restore kv cache");
}
if (n_rs_seq != 0) {
if (seq_id == -1) {
std::fill(rs_idx.begin(), rs_idx.end(), 0);
} else {
set_rs_idx(seq_id, 0);
}
}
}
void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
@ -804,7 +882,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
io.write(&r_size_row, sizeof(r_size_row));
// Write each range of cells of r_size_row length
// Write each logical cell row range. With pending recurrent rollback,
// the logical current state may live in a rollback snapshot plane.
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * r_size_row;
@ -825,7 +904,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
io.write(&s_size_row, sizeof(s_size_row));
// Write each range of S tensor rows
// Write each logical cell row range. With pending recurrent rollback,
// the logical current state may live in a rollback snapshot plane.
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * s_size_row;
@ -852,9 +932,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
// Write GQA embedding size
io.write(&n_embd_s, sizeof(n_embd_s));
// For each row, we get the element values of each cell
// For each row, we get the element values of each logical cell
for (uint32_t j = 0; j < n_embd_s; ++j) {
// Write each range of cells of s_size_el length
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * mem_size) * s_size_el;
@ -1163,5 +1242,21 @@ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
}
int32_t llama_memory_recurrent_context::s_copy(int i) const {
return mem->cells[i + mem->head].src0;
const uint32_t cell_idx = i + mem->head;
const int32_t src0 = mem->cells[cell_idx].src0;
if (mem->n_rs_seq == 0) {
return src0;
}
uint32_t idx = 0;
if (!mem->cells[cell_idx].seq_id.empty()) {
const llama_seq_id seq = *mem->cells[cell_idx].seq_id.begin();
if (seq >= 0 && (size_t) seq < mem->rs_idx.size()) {
idx = mem->rs_idx[seq];
// reset rollback idx
mem->rs_idx[seq] = 0;
}
}
return (int32_t)(idx * mem->size) + src0;
}

View File

@ -23,6 +23,7 @@ public:
bool offload,
uint32_t mem_size,
uint32_t n_seq_max,
uint32_t n_rs_seq,
const layer_filter_cb & filter);
~llama_memory_recurrent() = default;
@ -69,6 +70,14 @@ public:
uint32_t size = 0; // total number of cells, shared across all sequences
uint32_t used = 0; // used cells (i.e. at least one seq_id)
// number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups
uint32_t n_rs_seq = 0;
// per-seq rollback index
std::vector<uint32_t> rs_idx;
void set_rs_idx(llama_seq_id seq_id, uint32_t idx);
// computed before each graph build
uint32_t n = 0;

View File

@ -1,6 +1,7 @@
#pragma once
#include "llama.h"
#include "llama-graph.h"
#include <map>
#include <memory>
@ -20,6 +21,8 @@ struct llama_memory_params {
// use full-size SWA cache
bool swa_full;
llama_context_type ctx_type;
};
enum llama_memory_status {

View File

@ -1312,9 +1312,16 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte
return tensor;
}
void llama_model_loader::done_getting_tensors() const {
if (n_created != n_tensors) {
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
void llama_model_loader::done_getting_tensors(bool partial) const {
if (n_created > n_tensors) {
throw std::runtime_error(format("%s: too many tensors created; expected %d, got %d", __func__, n_tensors, n_created));
}
if (n_created < n_tensors) {
if (!partial) {
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
}
LLAMA_LOG_INFO("%s: partial load — used %d of %d tensors in the file (rest belong to a sibling model on the same .gguf)\n",
__func__, n_created, n_tensors);
}
if (n_tensors_moved > 0) {
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",

View File

@ -184,7 +184,7 @@ struct llama_model_loader {
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
void done_getting_tensors() const;
void done_getting_tensors(bool partial = false) const;
void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);

View File

@ -393,6 +393,8 @@ void llama_model_saver::add_tensors_from_model() {
add_tensor(model->output);
add_tensor(model->output_b);
add_tensor(model->output_norm_enc);
add_tensor(model->output_s);
add_tensor(model->output_in_s);
add_tensor(model->cls);
add_tensor(model->cls_b);
add_tensor(model->cls_out);

View File

@ -1334,6 +1334,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
if (!layer.ssm_beta_s && layer.ssm_beta) {
layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) {
layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) {
layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
// input scales
if (!layer.wq_in_s && layer.wq) {
@ -1393,11 +1399,30 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
if (!layer.ssm_beta_in_s && layer.ssm_beta) {
layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) {
layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) {
layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
}
}
// output scales
if (output && output->type == GGML_TYPE_NVFP4) {
// weight scale
if (!output_s) {
output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED);
}
// input scale
if (!output_in_s) {
output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED);
}
}
}
ml.done_getting_tensors();
GGML_ASSERT(!(output && tok_embd &&
strcmp(output->name, tok_embd->name) == 0 &&
output->type == GGML_TYPE_NVFP4));
// populate tensors_by_name
for (auto & [_, ctx_ptr] : ml.ctx_map) {
for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
@ -1934,6 +1959,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
// checks
default:
{
// The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain
// attention KV cache for the MTP context instead of the hybrid wrapper.
const bool mtp_on_hybrid_qwen35 =
params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
(arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE);
if (llm_arch_is_recurrent(arch)) {
res = new llama_memory_recurrent(
*this,
@ -1942,8 +1973,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
cparams.offload_kqv,
std::max((uint32_t) 1, cparams.n_seq_max),
cparams.n_seq_max,
cparams.n_rs_seq,
nullptr);
} else if (llm_arch_is_hybrid(arch)) {
} else if (llm_arch_is_hybrid(arch) && !mtp_on_hybrid_qwen35) {
// The main difference between hybrid architectures is the
// layer filters, so pick the right one here
llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
@ -1958,6 +1990,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
filter_recr = [&](int32_t il) {
return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
};
} else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
filter_attn = [&, n_main](int32_t il) {
return (uint32_t)il < n_main && !hparams.is_recurrent(il);
};
filter_recr = [&, n_main](int32_t il) {
return (uint32_t)il < n_main && hparams.is_recurrent(il);
};
}
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
@ -1975,6 +2015,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
/* recurrent_type_s */ GGML_TYPE_F32,
/* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
/* n_seq_max */ cparams.n_seq_max,
/* n_rs_seq */ cparams.n_rs_seq,
/* offload */ cparams.offload_kqv,
/* unified */ cparams.kv_unified,
/* filter_attn */ std::move(filter_attn),
@ -1993,6 +2034,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
/* recurrent_type_v */ GGML_TYPE_F32,
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
/* n_seq_max */ cparams.n_seq_max,
/* n_rs_seq */ cparams.n_rs_seq,
/* offload */ cparams.offload_kqv,
/* unified */ cparams.kv_unified,
/* filter_attn */ std::move(filter_attn),
@ -2000,6 +2042,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
}
} else {
llama_memory_i::layer_reuse_cb reuse = nullptr;
llama_kv_cache::layer_filter_cb filter = nullptr;
if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
reuse = [&](int32_t il) {
@ -2011,6 +2054,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
};
}
if (mtp_on_hybrid_qwen35) {
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
}
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(hparams.is_swa_any());
@ -2026,7 +2074,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
cparams.n_seq_max,
cparams.n_ubatch,
1,
nullptr,
filter,
reuse);
} else {
GGML_ASSERT(!hparams.is_swa_any());
@ -2043,7 +2091,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
1,
hparams.n_swa,
hparams.swa_type,
nullptr,
filter,
nullptr);
}
}
@ -2146,6 +2194,7 @@ int32_t llama_model_n_swa(const llama_model * model) {
return model->hparams.n_swa;
}
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
return model->hparams.n_cls_out;
}

View File

@ -202,12 +202,16 @@ struct llama_layer_shortconv {
};
struct llama_layer_nextn {
struct ggml_tensor * eh_proj = nullptr;
struct ggml_tensor * embed_tokens = nullptr;
struct ggml_tensor * enorm = nullptr;
struct ggml_tensor * hnorm = nullptr;
struct ggml_tensor * shared_head_head = nullptr;
struct ggml_tensor * shared_head_norm = nullptr;
struct ggml_tensor * eh_proj = nullptr;
struct ggml_tensor * eh_proj_s = nullptr;
struct ggml_tensor * eh_proj_in_s = nullptr;
struct ggml_tensor * embed_tokens = nullptr;
struct ggml_tensor * enorm = nullptr;
struct ggml_tensor * hnorm = nullptr;
struct ggml_tensor * shared_head_head = nullptr;
struct ggml_tensor * shared_head_head_s = nullptr;
struct ggml_tensor * shared_head_head_in_s = nullptr;
struct ggml_tensor * shared_head_norm = nullptr;
};
struct llama_layer {
@ -533,6 +537,11 @@ struct llama_model {
struct ggml_tensor * output_b = nullptr;
struct ggml_tensor * output_norm_enc = nullptr;
// NVFP4 per-tensor scale2, input_scale for LM head
struct ggml_tensor * output_s = nullptr;
struct ggml_tensor * output_in_s = nullptr;
// classifier
struct ggml_tensor * cls = nullptr;
struct ggml_tensor * cls_b = nullptr;

View File

@ -530,6 +530,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
struct llm_tokenizer_bpe_session {
llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
virtual ~llm_tokenizer_bpe_session() = default;
static void append(const llama_token token_id, std::vector<llama_token> & output) {
output.push_back(token_id);
}
@ -567,7 +569,7 @@ struct llm_tokenizer_bpe_session {
}
}
void tokenize(const std::string & text, std::vector<llama_token> & output) {
virtual void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
@ -1579,6 +1581,88 @@ private:
const llm_tokenizer_plamo2 & tokenizer;
};
// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
// text at load
static const std::string dna_kmer_marker = "\xee\x80\x80";
struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
static const std::string open_tag = "<dna>";
static const std::string close_tag = "</dna>";
const auto dna_begin_id = vocab.text_to_token(open_tag);
const auto dna_end_id = vocab.text_to_token(close_tag);
const auto dna_oov_id = vocab.text_to_token("<oov>");
// Fall back to plain BPE if the DNA pieces aren't in the vocab.
if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) {
llm_tokenizer_bpe_session::tokenize(text, output);
return;
}
const size_t k = 6;
size_t pos = 0;
while (pos < text.size()) {
const size_t start = text.find(open_tag, pos);
if (start == std::string::npos) {
if (pos < text.size()) {
llm_tokenizer_bpe_session::tokenize(text.substr(pos), output);
}
break;
}
if (start > pos) {
llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output);
}
output.push_back(dna_begin_id);
const size_t content_start = start + open_tag.size();
const size_t end = text.find(close_tag, content_start);
const size_t content_end = (end == std::string::npos) ? text.size() : end;
emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output);
if (end == std::string::npos) {
break;
}
output.push_back(dna_end_id);
pos = end + close_tag.size();
}
}
private:
void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector<llama_token> & output) {
std::string seq = raw;
for (char & c : seq) {
if (c >= 'a' && c <= 'z') {
c = char(c - 32);
}
}
// k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
// isn't in the vocab and falls back to <oov>
auto kmer_token = [&](const std::string & kmer) {
const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
};
size_t i = 0;
for (; i + k <= seq.size(); i += k) {
output.push_back(kmer_token(seq.substr(i, k)));
}
if (i < seq.size()) {
std::string kmer = seq.substr(i);
kmer.append(k - kmer.size(), 'A');
output.push_back(kmer_token(kmer));
}
}
const llama_vocab & vocab;
};
//
// impl
//
@ -1808,7 +1892,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_mask_id = 103;
add_sep = true;
} else if (tokenizer_model == "gpt2") {
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
type = LLAMA_VOCAB_TYPE_BPE;
// read bpe merges and populate bpe ranks
@ -2266,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}
GGML_ASSERT(id_to_token.size() == token_to_id.size());
// hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
// it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
// k-mers are the block right after <oov>, so only scan from there.
if (tokenizer_model == "hybriddna") {
const auto idx = token_to_id.find("<oov>");
if (idx != token_to_id.end()) {
auto it = id_to_token.begin() + idx->second + 1;
for (; it != id_to_token.end(); ++it) {
std::string & text = it->text;
if (text.size() > dna_kmer_marker.size()
&& text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
text.erase(text.size() - dna_kmer_marker.size());
}
}
}
}
init_tokenizer(type);
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
@ -3144,11 +3245,19 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
} break;
case LLAMA_VOCAB_TYPE_BPE:
{
llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
// it calls some other methods that are not exist in llm_tokenizer,
// here just cast it to bpe tokenizer object
const llm_tokenizer_bpe * tok_bpe = static_cast<const llm_tokenizer_bpe *>(tokenizer.get());
std::unique_ptr<llm_tokenizer_bpe_session> session;
if (vocab.get_tokenizer_model() == "hybriddna") {
session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
} else {
session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
}
if (add_special) {
session.append_bos(output);
session->append_bos(output);
}
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
@ -3161,15 +3270,15 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
session.tokenize(text, output);
session->tokenize(text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
session.append(fragment.token, output);
session->append(fragment.token, output);
}
}
if (add_special) {
session.append_eos(output);
session.check_double_bos_eos(output);
session->append_eos(output);
session->check_double_bos_eos(output);
}
} break;
case LLAMA_VOCAB_TYPE_WPM:

View File

@ -198,6 +198,11 @@ extern "C" {
LLAMA_SPLIT_MODE_TENSOR = 3,
};
enum llama_context_type {
LLAMA_CONTEXT_TYPE_DEFAULT = 0,
LLAMA_CONTEXT_TYPE_MTP = 1,
};
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
typedef struct llama_token_data {
llama_token id; // token id
@ -333,9 +338,11 @@ extern "C" {
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_context_type ctx_type; // set the context type (e.g. MTP)
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
@ -530,6 +537,7 @@ extern "C" {
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx);
DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
@ -866,7 +874,8 @@ extern "C" {
// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load)
// Keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load).
// Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag.
#define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
typedef uint32_t llama_state_seq_flags;

View File

@ -277,7 +277,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -148,7 +148,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -171,7 +171,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -193,7 +193,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -146,7 +146,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -171,7 +171,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -210,7 +210,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -142,7 +142,7 @@ llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_param
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -181,7 +181,7 @@ llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output_with_img_logits", -1);
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.

View File

@ -151,7 +151,7 @@ llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -143,7 +143,7 @@ llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_p
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -150,7 +150,7 @@ llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);

View File

@ -146,7 +146,7 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);

View File

@ -131,7 +131,7 @@ llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
if (f_logit_scale) {
cur = ggml_scale(ctx0, cur, f_logit_scale);

View File

@ -145,7 +145,7 @@ llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -181,7 +181,7 @@ llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -185,7 +185,7 @@ llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -1,6 +1,7 @@
#include "models.h"
#include "llama-impl.h"
#include "llama-memory-recurrent.h"
// utility to get one slice from the third dimension
// input dim: [x, y, c, b]
@ -397,7 +398,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
// K=1 (final state only): reshape to 3D (S_v*S_v*H_v, 1, n_seqs) for ggml_gated_delta_net.
ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, S_v * S_v * H_v, 1, n_seqs);
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d);
if (n_tokens == 1) {
cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
} else {
@ -443,3 +446,162 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
return build_delta_net_chunking(q, k, v, g, b, s, il);
}
ggml_tensor * llm_build_delta_net_base::build_conv_state(
llm_graph_input_rs * inp,
ggml_tensor * conv_states_all,
ggml_tensor * qkv_mixed,
int64_t conv_kernel_size,
int64_t conv_channels,
int il) {
const auto * mctx_cur = inp->mctx;
const auto kv_head = mctx_cur->get_head();
const auto mem_size = mctx_cur->get_size();
const int64_t n_seqs = ubatch.n_seqs;
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il);
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il);
qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
cb(qkv_mixed, "qkv_mixed_transposed", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il);
const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
const size_t row_size = ggml_row_size(conv_states_all->type, row_count);
if (cparams.n_rs_seq == 0) {
const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0];
const int64_t s_slot = 0;
ggml_tensor * conv_state_last =
ggml_view_3d(ctx0, conv_input,
conv_kernel_size - 1, conv_channels, n_seqs,
conv_input->nb[1], conv_input->nb[2],
ggml_row_size(conv_input->type, s_idx));
cb(conv_state_last, "conv_state_last", il);
ggml_tensor * conv_state_update =
ggml_view_2d(ctx0, conv_states_all,
row_count, n_seqs, conv_states_all->nb[1],
(s_slot * mem_size + kv_head) * row_size);
cb(conv_state_update, "conv_state_update", il);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
} else {
// [TAG_RECURRENT_ROLLBACK_SPLITS]
// TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are
// inside the same ubatch. currently with `split_equal()` this is not correct
const int64_t K = (int64_t) cparams.n_rs_seq + 1;
for (int64_t t = 1; t <= K; ++t) {
const int64_t s_idx = std::max<int64_t>(0, conv_input->ne[0] - conv_states->ne[0] - K + t);
const int64_t s_slot = K - t;
ggml_tensor * conv_state_last =
ggml_view_3d(ctx0, conv_input,
conv_kernel_size - 1, conv_channels, n_seqs,
conv_input->nb[1], conv_input->nb[2],
ggml_row_size(conv_input->type, s_idx));
ggml_tensor * conv_state_update =
ggml_view_2d(ctx0,
conv_states_all, row_count, n_seqs,
conv_states_all->nb[1],
(s_slot * mem_size + kv_head) * row_size);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
}
}
return conv_input;
}
ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
llm_graph_input_rs * inp,
ggml_tensor * ssm_states_all,
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * b,
ggml_tensor * s,
int il) {
const auto * mctx_cur = inp->mctx;
const auto kv_head = mctx_cur->get_head();
const uint32_t mem_size = mctx_cur->get_size();
const int64_t S_v = s->ne[0];
const int64_t H_v = s->ne[2];
const int64_t n_seqs = s->ne[3];
const int64_t n_seq_tokens = q->ne[2];
const bool keep = cparams.n_rs_seq > 0;
if (!keep) {
auto attn_out = build_delta_net(q, k, v, g, b, s, il);
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
cb(output, "attn_output", il);
cb(new_state, "new_state", il);
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state,
ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
return output;
}
const int64_t D = S_v * S_v * H_v;
const int64_t K = cparams.n_rs_seq + 1;
// TODO: remove pad + simplify
ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0);
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad);
if (n_seq_tokens > 1) {
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
} else {
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il);
}
const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs;
const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs;
ggml_tensor * output = ggml_view_4d(ctx0, gdn_out,
S_v, H_v, n_seq_tokens, n_seqs,
ggml_row_size(gdn_out->type, S_v),
ggml_row_size(gdn_out->type, S_v * H_v),
ggml_row_size(gdn_out->type, S_v * H_v * n_seq_tokens),
0);
cb(output, "attn_output", il);
const size_t row_size = hparams.n_embd_s() * ggml_element_size(ssm_states_all);
for (int64_t k_i = 0; k_i < K; ++k_i) {
const uint32_t cache_slot = (uint32_t) (K - 1 - k_i);
ggml_tensor * src = ggml_view_4d(ctx0, gdn_out,
S_v, S_v, H_v, n_seqs,
ggml_row_size(gdn_out->type, S_v),
ggml_row_size(gdn_out->type, S_v * S_v),
ggml_row_size(gdn_out->type, S_v * S_v * H_v),
ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap));
ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all,
hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
((size_t) cache_slot * mem_size + kv_head) * row_size);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
}
return output;
}

View File

@ -183,7 +183,7 @@ llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -128,7 +128,7 @@ llama_model_dream::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -124,7 +124,7 @@ llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_grap
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -155,7 +155,7 @@ llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -237,7 +237,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -127,7 +127,7 @@ llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -163,7 +163,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -200,7 +200,7 @@ llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -152,7 +152,7 @@ llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_para
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -130,7 +130,7 @@ llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -163,7 +163,7 @@ llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
// final logit soft-capping
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);

View File

@ -207,7 +207,7 @@ llama_model_gemma3::graph<iswa>::graph(const llama_model & model, const llm_grap
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
if (hparams.f_final_logit_softcapping) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);

View File

@ -296,7 +296,7 @@ llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
{
// final logit soft-capping

View File

@ -380,7 +380,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
if (hparams.f_final_logit_softcapping) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);

View File

@ -275,7 +275,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -185,7 +185,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;
// Output projection
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -138,7 +138,7 @@ llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -209,7 +209,7 @@ llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -186,7 +186,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
// For Granite architectures - scale logits
if (hparams.f_logit_scale) {

View File

@ -145,7 +145,7 @@ llama_model_granite::graph::graph(
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
// For Granite architectures - scale logits
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);

View File

@ -206,7 +206,7 @@ llama_model_grok::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);

View File

@ -184,7 +184,7 @@ llama_model_grovemoe::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -179,7 +179,7 @@ llama_model_hunyuan_moe::graph::graph(const llama_model & model, const llm_graph
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -181,7 +181,7 @@ llama_model_hunyuan_vl::graph::graph(const llama_model & model, const llm_graph_
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -129,7 +129,7 @@ llama_model_internlm2::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -123,7 +123,7 @@ llama_model_jais::graph::graph(const llama_model & model, const llm_graph_params
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -152,7 +152,7 @@ llama_model_jais2::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// Output projection
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -189,7 +189,7 @@ llama_model_jamba::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -262,7 +262,7 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -153,7 +153,7 @@ llama_model_llada_moe::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -147,7 +147,7 @@ llama_model_llada::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -235,7 +235,7 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
if constexpr (!embed) {
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -260,7 +260,7 @@ llama_model_llama4::graph<iswa>::graph(const llama_model & model, const llm_grap
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -141,7 +141,7 @@ llama_model_maincoder::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -128,7 +128,7 @@ llama_model_mamba::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -231,7 +231,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -251,7 +251,7 @@ llama_model_minicpm3::graph::graph(const llama_model & model, const llm_graph_pa
cb(cur, "lmhead_scaling", -1);
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -158,7 +158,7 @@ llama_model_minimax_m2::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -222,7 +222,7 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -46,7 +46,7 @@ struct llm_build_delta_net_base : public llm_graph_context {
ggml_tensor * s,
int il);
// use the ggml_gated_delta_net fused operator
// use the ggml_gated_delta_net fused operator (K=1; state has shape (D, 1, n_seqs))
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
ggml_tensor * q,
ggml_tensor * k,
@ -65,6 +65,29 @@ struct llm_build_delta_net_base : public llm_graph_context {
ggml_tensor * b,
ggml_tensor * s,
int il);
// read conv state from cache, concat with qkv_mixed, write back (single slot or per-token)
// qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs)
ggml_tensor * build_conv_state(
llm_graph_input_rs * inp,
ggml_tensor * conv_states_all,
ggml_tensor * qkv_mixed,
int64_t conv_kernel_size,
int64_t conv_channels,
int il);
// run delta-net attention and write the new recurrent state(s) back to ssm_states_all
// s: (head_v_dim, head_v_dim, num_v_heads, n_seqs); returns output: (head_v_dim, num_v_heads, n_seq_tokens, n_seqs)
ggml_tensor * build_recurrent_attn(
llm_graph_input_rs * inp,
ggml_tensor * ssm_states_all,
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * b,
ggml_tensor * s,
int il);
};
struct llm_build_rwkv6_base : public llm_graph_context {
@ -1739,6 +1762,10 @@ struct llama_model_qwen35 : public llama_model_base {
const llama_model & model;
};
struct graph_mtp : public llm_graph_context {
graph_mtp(const llama_model & model, const llm_graph_params & params);
};
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
};
@ -1781,6 +1808,10 @@ struct llama_model_qwen35moe : public llama_model_base {
const llama_model & model;
};
struct graph_mtp : public llm_graph_context {
graph_mtp(const llama_model & model, const llm_graph_params & params);
};
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
};

View File

@ -161,7 +161,7 @@ llama_model_mpt::graph::graph(const llama_model & model, const llm_graph_params
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -174,7 +174,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -140,7 +140,7 @@ llama_model_nemotron::graph::graph(const llama_model & model, const llm_graph_pa
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -133,7 +133,7 @@ llama_model_olmo::graph::graph(const llama_model & model, const llm_graph_params
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -198,7 +198,7 @@ llama_model_olmo2::graph<iswa>::graph(const llama_model & model, const llm_graph
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -164,7 +164,7 @@ llama_model_olmoe::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -160,7 +160,7 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -162,7 +162,7 @@ llama_model_openelm::graph::graph(const llama_model & model, const llm_graph_par
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -132,7 +132,7 @@ llama_model_orion::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -98,7 +98,7 @@ llama_model_paddleocr::graph::graph(const llama_model & model, const llm_graph_p
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -148,7 +148,7 @@ llama_model_pangu_embed::graph::graph(const llama_model & model, const llm_graph
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
if (model.output_b != nullptr) {
cur = ggml_add(ctx0, cur, model.output_b);

View File

@ -130,7 +130,7 @@ llama_model_phi2::graph::graph(const llama_model & model, const llm_graph_params
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output_no_bias", -1);
cur = ggml_add(ctx0, cur, model.output_b);

View File

@ -179,7 +179,7 @@ llama_model_phi3::graph<iswa>::graph(const llama_model & model, const llm_graph_
cb(cur, "result_norm", -1);
res->t_embd = cur;
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
if (model.output_b != nullptr) {
cb(cur, "result_output_no_bias", -1);

View File

@ -127,7 +127,7 @@ llama_model_plamo::graph::graph(const llama_model & model, const llm_graph_param
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
res->t_logits = cur;

View File

@ -185,7 +185,7 @@ llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_para
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cur = build_lora_mm(model.output, cur, model.output_s);
cb(cur, "result_output", -1);
// Explicitly mark as output tensor to ensure proper backend assignment

Some files were not shown because too many files have changed in this diff Show More