talk-llama : sync llama.cpp
This commit is contained in:
parent
0a62a579cc
commit
865ec171aa
|
|
@ -757,14 +757,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|||
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
|
||||
// These tensors only exist in the last layer(s) and are treated as output tensors
|
||||
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||
// NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
|
||||
// last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
|
||||
// the model loader doesn't fault on the block index.
|
||||
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
|
||||
{LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
// Nemotron 3 Super
|
||||
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
|
|
@ -877,6 +878,16 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
|
|||
}
|
||||
}
|
||||
|
||||
bool llm_arch_supports_rs_rollback(const llm_arch & arch) {
|
||||
switch (arch) {
|
||||
case LLM_ARCH_QWEN35:
|
||||
case LLM_ARCH_QWEN35MOE:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
|
||||
switch (arch) {
|
||||
case LLM_ARCH_GROK:
|
||||
|
|
|
|||
|
|
@ -637,3 +637,4 @@ bool llm_arch_is_recurrent (const llm_arch & arch);
|
|||
bool llm_arch_is_hybrid (const llm_arch & arch);
|
||||
bool llm_arch_is_diffusion (const llm_arch & arch);
|
||||
bool llm_arch_supports_sm_tensor(const llm_arch & arch);
|
||||
bool llm_arch_supports_rs_rollback(const llm_arch & arch);
|
||||
|
|
|
|||
|
|
@ -73,7 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
|||
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
|
||||
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
|
||||
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
|
||||
{ "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
|
||||
{ "hunyuan-vl", LLM_CHAT_TEMPLATE_HUNYUAN_VL },
|
||||
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
|
||||
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
|
||||
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
|
||||
|
|
@ -218,7 +218,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
|||
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
|
||||
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_VL;
|
||||
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
|
||||
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
|
||||
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
|
||||
|
|
@ -825,8 +825,8 @@ int32_t llm_chat_apply_template(
|
|||
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
|
||||
// tencent/HunyuanOCR
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_VL) {
|
||||
// tencent/HunyuanOCR & tencent/HunyuanVL
|
||||
ss << "<|hy_begin▁of▁sentence|>";
|
||||
for (size_t i = 0; i < chat.size(); i++) {
|
||||
std::string role(chat[i]->role);
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ enum llm_chat_template {
|
|||
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
|
||||
LLM_CHAT_TEMPLATE_OPENAI_MOE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
|
||||
LLM_CHAT_TEMPLATE_HUNYUAN_VL,
|
||||
LLM_CHAT_TEMPLATE_KIMI_K2,
|
||||
LLM_CHAT_TEMPLATE_SEED_OSS,
|
||||
LLM_CHAT_TEMPLATE_GROK_2,
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#include "ggml.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-graph.h"
|
||||
#include "llama-impl.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-io.h"
|
||||
|
|
@ -21,6 +22,14 @@
|
|||
// llama_context
|
||||
//
|
||||
|
||||
// Translates a context type into the matching graph type.
//
// The switch intentionally has no `default` label: control only reaches the
// throw below when `ctx_type` holds a value that none of the cases handle.
//
// throws std::runtime_error if `ctx_type` is not handled
static llm_graph_type ctx_type_to_graph_type(llama_context_type ctx_type) {
    switch (ctx_type) {
        case LLAMA_CONTEXT_TYPE_DEFAULT:
            return LLM_GRAPH_TYPE_DEFAULT;

        case LLAMA_CONTEXT_TYPE_MTP:
            return LLM_GRAPH_TYPE_DECODER_MTP;
    }

    throw std::runtime_error("Unsupported ctx type");
}
|
||||
|
||||
llama_context::llama_context(
|
||||
const llama_model & model,
|
||||
llama_context_params params) :
|
||||
|
|
@ -42,13 +51,22 @@ llama_context::llama_context(
|
|||
throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ));
|
||||
}
|
||||
|
||||
cparams.n_rs_seq = params.n_rs_seq;
|
||||
if (cparams.n_rs_seq > 0 && !llm_arch_supports_rs_rollback(model.arch)) {
|
||||
LLAMA_LOG_DEBUG("%s: n_rs_seq=%u requested but model arch does not support recurrent partial rollback; clamping to 0\n",
|
||||
__func__, cparams.n_rs_seq);
|
||||
cparams.n_rs_seq = 0;
|
||||
}
|
||||
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch;
|
||||
cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
|
||||
cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
|
||||
cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
|
||||
cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
|
||||
cparams.embeddings = params.embeddings;
|
||||
cparams.embeddings = params.embeddings;
|
||||
cparams.embeddings_pre_norm = false;
|
||||
cparams.embeddings_pre_norm_masked = false;
|
||||
cparams.offload_kqv = params.offload_kqv;
|
||||
cparams.no_perf = params.no_perf;
|
||||
cparams.pooling_type = params.pooling_type;
|
||||
|
|
@ -65,6 +83,8 @@ llama_context::llama_context(
|
|||
cparams.cb_eval = params.cb_eval;
|
||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||
|
||||
cparams.ctx_type = params.ctx_type;
|
||||
|
||||
// Initialize backend samplers here so they are part of the sampling graph
|
||||
// before the reserve passes run later in this function. This avoids a later
|
||||
// re-reserve when graph nodes change.
|
||||
|
|
@ -206,6 +226,7 @@ llama_context::llama_context(
|
|||
LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
|
||||
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
|
||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
|
||||
LLAMA_LOG_INFO("%s: n_rs_seq = %u\n", __func__, cparams.n_rs_seq);
|
||||
|
||||
if (cparams.n_ctx_seq < hparams.n_ctx_train) {
|
||||
LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
|
||||
|
|
@ -278,6 +299,7 @@ llama_context::llama_context(
|
|||
/*.type_k =*/ params.type_k,
|
||||
/*.type_v =*/ params.type_v,
|
||||
/*.swa_full =*/ params.swa_full,
|
||||
/*.ctx_type= */ cparams.ctx_type,
|
||||
};
|
||||
|
||||
memory.reset(model.create_memory(params_mem, cparams));
|
||||
|
|
@ -860,6 +882,42 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
|
|||
return it->second.data();
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_pre_norm() {
|
||||
output_reorder();
|
||||
|
||||
return embd_pre_norm.data;
|
||||
}
|
||||
|
||||
float * llama_context::get_embeddings_pre_norm_ith(int32_t i) {
|
||||
output_reorder();
|
||||
|
||||
try {
|
||||
if (embd_pre_norm.data == nullptr) {
|
||||
throw std::runtime_error("no pre-norm embeddings");
|
||||
}
|
||||
|
||||
const uint32_t n_embd = model.hparams.n_embd;
|
||||
|
||||
if (!cparams.embeddings_pre_norm_masked) {
|
||||
// unmasked: pre-norm rows are stored densely, indexed by raw token position.
|
||||
if (i < 0 || (size_t)(i + 1) * n_embd > embd_pre_norm.size) {
|
||||
throw std::runtime_error(format("out of range [0, %zu)", embd_pre_norm.size / n_embd));
|
||||
}
|
||||
return embd_pre_norm.data + (size_t) i * n_embd;
|
||||
}
|
||||
|
||||
const int64_t j = output_resolve_row(i);
|
||||
return embd_pre_norm.data + j*n_embd;
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid pre-norm embeddings id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
GGML_ABORT("fatal error");
|
||||
#else
|
||||
return nullptr;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
|
||||
output_reorder();
|
||||
|
||||
|
|
@ -1040,6 +1098,13 @@ void llama_context::set_embeddings(bool value) {
|
|||
//sched_need_reserve = true;
|
||||
}
|
||||
|
||||
// Records the pre-norm embeddings settings in cparams and logs the new values.
//   value  - stored in cparams.embeddings_pre_norm
//   masked - stored in cparams.embeddings_pre_norm_masked
// Only the two flags are updated here; no buffers are touched.
void llama_context::set_embeddings_pre_norm(bool value, bool masked) {
    LLAMA_LOG_DEBUG("%s: value = %d, masked = %d\n", __func__, value, masked);

    cparams.embeddings_pre_norm_masked = masked;
    cparams.embeddings_pre_norm        = value;
}
|
||||
|
||||
void llama_context::set_causal_attn(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
|
|
@ -1072,6 +1137,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
|
|||
|
||||
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
|
||||
|
||||
if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
|
||||
static bool warned = false;
|
||||
if (!warned) {
|
||||
LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__);
|
||||
warned = true;
|
||||
}
|
||||
if (sampling.samplers.count(seq_id) > 0) {
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
sampling.samplers.erase(seq_id);
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool can_offload =
|
||||
sampler &&
|
||||
sampler->iface->backend_init &&
|
||||
|
|
@ -1241,7 +1319,9 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
|||
}
|
||||
|
||||
int llama_context::encode(const llama_batch & batch_inp) {
|
||||
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
|
||||
// MTP hook batches carry both token (next-token id) and embd (h_pre_norm row),
|
||||
// so require at least one of them to be present rather than exactly one.
|
||||
GGML_ASSERT(batch_inp.token || batch_inp.embd);
|
||||
|
||||
if (batch_inp.n_tokens == 0) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
|
||||
|
|
@ -1312,8 +1392,9 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|||
}
|
||||
}
|
||||
|
||||
auto * t_logits = res->get_logits();
|
||||
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
|
||||
auto * t_logits = res->get_logits();
|
||||
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
|
||||
auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr;
|
||||
|
||||
// extract logits
|
||||
if (logits.data && t_logits) {
|
||||
|
|
@ -1379,6 +1460,16 @@ int llama_context::encode(const llama_batch & batch_inp) {
|
|||
}
|
||||
}
|
||||
|
||||
// extract pre-norm embeddings (hidden state before the final output norm)
|
||||
if (embd_pre_norm.data && t_h_pre_norm && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
|
||||
GGML_ASSERT(backend_h != nullptr);
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_pre_norm.size);
|
||||
ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm.data, 0, n_tokens*n_embd*sizeof(float));
|
||||
}
|
||||
|
||||
// TODO: hacky solution
|
||||
if (model.arch == LLM_ARCH_T5 && t_embd) {
|
||||
//cross.t_embd = t_embd;
|
||||
|
|
@ -1531,7 +1622,9 @@ static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_s
|
|||
}
|
||||
|
||||
int llama_context::decode(const llama_batch & batch_inp) {
|
||||
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT
|
||||
// MTP hook batches carry both token (next-token id) and embd (h_pre_norm row),
|
||||
// so require at least one of them to be present rather than exactly one.
|
||||
GGML_ASSERT(batch_inp.token || batch_inp.embd);
|
||||
|
||||
if (!memory) {
|
||||
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
|
||||
|
|
@ -1668,6 +1761,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|||
};
|
||||
|
||||
int64_t n_outputs_prev = 0;
|
||||
int64_t n_tokens_prev = 0;
|
||||
|
||||
do {
|
||||
const auto & ubatch = mctx->get_ubatch();
|
||||
|
|
@ -1689,7 +1783,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|||
}
|
||||
|
||||
ggml_status status;
|
||||
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
|
||||
|
||||
const auto * res = process_ubatch(ubatch, ctx_type_to_graph_type(cparams.ctx_type), mctx.get(), status);
|
||||
|
||||
if (!res) {
|
||||
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module
|
||||
|
|
@ -1727,8 +1822,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||
//}
|
||||
|
||||
auto * t_logits = res->get_logits();
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
auto * t_logits = res->get_logits();
|
||||
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
|
||||
auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr;
|
||||
|
||||
if (t_embd && res->get_embd_pooled()) {
|
||||
t_embd = res->get_embd_pooled();
|
||||
|
|
@ -1809,6 +1905,25 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|||
}
|
||||
}
|
||||
|
||||
// extract pre-norm embeddings (hidden state before the final output norm)
|
||||
// only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
|
||||
{
|
||||
const bool masked = cparams.embeddings_pre_norm_masked;
|
||||
const int64_t n_rows = masked ? n_outputs : (int64_t) ubatch.n_tokens;
|
||||
const int64_t offset = masked ? n_outputs_prev : n_tokens_prev;
|
||||
|
||||
if (embd_pre_norm.data && t_h_pre_norm && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm);
|
||||
GGML_ASSERT(backend_h != nullptr);
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
float * embd_pre_norm_out = embd_pre_norm.data + offset*n_embd;
|
||||
|
||||
GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_pre_norm.size);
|
||||
ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_rows*n_embd*sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
// Copy backend sampling output if this ubatch produced any sampling tensors.
|
||||
if (has_samplers && (!res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty())) {
|
||||
const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
|
||||
|
|
@ -1823,6 +1938,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
|
|||
}
|
||||
|
||||
n_outputs_prev += n_outputs;
|
||||
n_tokens_prev += ubatch.n_tokens;
|
||||
} while (mctx->next());
|
||||
|
||||
// set to total number of outputs in the batch, for use in llama_get_logits_ith
|
||||
|
|
@ -1893,10 +2009,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
|
||||
const auto n_batch = cparams.n_batch;
|
||||
const auto n_vocab = vocab.n_tokens();
|
||||
const auto n_embd = hparams.n_embd;
|
||||
const auto n_embd_out = hparams.n_embd_out();
|
||||
|
||||
bool has_logits = true;
|
||||
bool has_embd = cparams.embeddings;
|
||||
bool has_logits = true;
|
||||
bool has_embd = cparams.embeddings;
|
||||
bool has_embd_pre_norm = cparams.embeddings_pre_norm;
|
||||
|
||||
// TODO: hacky enc-dec support
|
||||
if (model.arch == LLM_ARCH_T5) {
|
||||
|
|
@ -1908,8 +2026,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
size_t backend_float_count = 0;
|
||||
size_t backend_token_count = 0;
|
||||
|
||||
logits.size = has_logits ? n_vocab*n_outputs_max : 0;
|
||||
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
|
||||
logits.size = has_logits ? n_vocab*n_outputs_max : 0;
|
||||
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
|
||||
embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max : 0;
|
||||
|
||||
if (has_embd_pre_norm && !cparams.embeddings_pre_norm_masked) {
|
||||
// unmasked: pre-norm row exists for every token in the batch, not just
|
||||
// those flagged via batch.logits[i] -> size by token count instead.
|
||||
embd_pre_norm.size = (size_t) n_embd * n_batch;
|
||||
}
|
||||
|
||||
// Allocate backend sampling output buffers if there are backend samplers configured.
|
||||
const bool has_sampling = !sampling.samplers.empty();
|
||||
|
|
@ -1925,8 +2050,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
|
||||
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
|
||||
const size_t new_size =
|
||||
(logits.size + embd.size + backend_float_count) * sizeof(float) +
|
||||
( backend_token_count) * sizeof(llama_token);
|
||||
(logits.size + embd.size + embd_pre_norm.size + backend_float_count) * sizeof(float) +
|
||||
( backend_token_count) * sizeof(llama_token);
|
||||
|
||||
// alloc only when more than the current capacity is required
|
||||
// TODO: also consider shrinking the buffer
|
||||
|
|
@ -1942,6 +2067,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
buf_output = nullptr;
|
||||
logits.data = nullptr;
|
||||
embd.data = nullptr;
|
||||
embd_pre_norm.data = nullptr;
|
||||
}
|
||||
|
||||
auto * buft = ggml_backend_cpu_buffer_type();
|
||||
|
|
@ -1970,6 +2096,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
|
|||
embd = has_embd ? buffer_view<float>{(float *) (base + offset), embd.size} : buffer_view<float>{nullptr, 0};
|
||||
offset += embd.size * sizeof(float);
|
||||
|
||||
embd_pre_norm = has_embd_pre_norm ? buffer_view<float>{(float *) (base + offset), embd_pre_norm.size} : buffer_view<float>{nullptr, 0};
|
||||
offset += embd_pre_norm.size * sizeof(float);
|
||||
|
||||
if (has_sampling) {
|
||||
sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
|
||||
offset += sampling.logits.size * sizeof(float);
|
||||
|
|
@ -2034,6 +2163,12 @@ void llama_context::output_reorder() {
|
|||
}
|
||||
}
|
||||
|
||||
if (embd_pre_norm.size > 0) {
|
||||
for (uint64_t k = 0; k < n_embd; k++) {
|
||||
std::swap(embd_pre_norm.data[i0*n_embd + k], embd_pre_norm.data[i1*n_embd + k]);
|
||||
}
|
||||
}
|
||||
|
||||
if (!sampling.samplers.empty()) {
|
||||
assert(sampling.logits.size > 0);
|
||||
assert(sampling.probs.size > 0);
|
||||
|
|
@ -2121,7 +2256,7 @@ ggml_cgraph * llama_context::graph_reserve(
|
|||
|
||||
auto * res = gf_res_reserve.get();
|
||||
|
||||
const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);
|
||||
const auto gparams = graph_params(res, ubatch, mctx, ctx_type_to_graph_type(cparams.ctx_type));
|
||||
|
||||
res->reset();
|
||||
|
||||
|
|
@ -3100,7 +3235,7 @@ void llama_context::opt_epoch_iter(
|
|||
|
||||
auto * res = gf_res_prev.get();
|
||||
|
||||
const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);
|
||||
const auto gparams = graph_params(res, ubatch, mctx.get(), ctx_type_to_graph_type(cparams.ctx_type));
|
||||
|
||||
res->reset();
|
||||
|
||||
|
|
@ -3201,8 +3336,10 @@ llama_context_params llama_context_default_params() {
|
|||
/*.n_batch =*/ 2048,
|
||||
/*.n_ubatch =*/ 512,
|
||||
/*.n_seq_max =*/ 1,
|
||||
/*.n_rs_seq =*/ 0,
|
||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
||||
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
||||
/*.ctx_type =*/ LLAMA_CONTEXT_TYPE_DEFAULT,
|
||||
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
|
||||
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
|
||||
/*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
|
||||
|
|
@ -3306,6 +3443,13 @@ llama_context * llama_init_from_model(
|
|||
model->hparams.pooling_type, params.pooling_type);
|
||||
}
|
||||
|
||||
if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
|
||||
model->hparams.nextn_predict_layers == 0) {
|
||||
LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
try {
|
||||
auto * ctx = new llama_context(*model, params);
|
||||
return ctx;
|
||||
|
|
@ -3347,6 +3491,10 @@ uint32_t llama_n_seq_max(const llama_context * ctx) {
|
|||
return ctx->n_seq_max();
|
||||
}
|
||||
|
||||
uint32_t llama_n_rs_seq(const llama_context * ctx) {
|
||||
return ctx->get_cparams().n_rs_seq;
|
||||
}
|
||||
|
||||
// Returns a pointer to the model that `ctx` reports through get_model().
const llama_model * llama_get_model(const llama_context * ctx) {
    const auto & ctx_model = ctx->get_model();

    return &ctx_model;
}
|
||||
|
|
@ -3436,6 +3584,22 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
|
|||
return ctx->get_embeddings_seq(seq_id);
|
||||
}
|
||||
|
||||
// C API wrapper: forwards both flags unchanged to llama_context::set_embeddings_pre_norm().
void llama_set_embeddings_pre_norm(
        llama_context * ctx,
                 bool   value,
                 bool   masked) {
    ctx->set_embeddings_pre_norm(value, masked);
}
|
||||
|
||||
// C API wrapper: synchronizes the context first, then returns the pointer
// produced by llama_context::get_embeddings_pre_norm().
float * llama_get_embeddings_pre_norm(llama_context * ctx) {
    ctx->synchronize();

    float * const rows = ctx->get_embeddings_pre_norm();

    return rows;
}
|
||||
|
||||
// C API wrapper: synchronizes the context first, then returns the pointer
// produced by llama_context::get_embeddings_pre_norm_ith() for index `i`.
float * llama_get_embeddings_pre_norm_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    float * const row = ctx->get_embeddings_pre_norm_ith(i);

    return row;
}
|
||||
|
||||
// C API wrapper: forwards to llama_context::set_sampler() and returns its result.
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
    const bool res = ctx->set_sampler(seq_id, smpl);

    return res;
}
|
||||
|
|
|
|||
|
|
@ -84,6 +84,9 @@ struct llama_context {
|
|||
float * get_embeddings_ith(int32_t i);
|
||||
float * get_embeddings_seq(llama_seq_id seq_id);
|
||||
|
||||
float * get_embeddings_pre_norm();
|
||||
float * get_embeddings_pre_norm_ith(int32_t i);
|
||||
|
||||
llama_token * get_sampled_tokens() const;
|
||||
llama_token get_sampled_token_ith(int32_t idx);
|
||||
|
||||
|
|
@ -107,6 +110,7 @@ struct llama_context {
|
|||
void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);
|
||||
|
||||
void set_embeddings (bool value);
|
||||
void set_embeddings_pre_norm(bool value, bool masked);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
|
||||
|
|
@ -278,6 +282,11 @@ private:
|
|||
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
|
||||
buffer_view<float> embd = {nullptr, 0};
|
||||
|
||||
// hidden state before the final output norm (2-dimensional array: [n_outputs][n_embd])
|
||||
// populated only when cparams.embeddings_pre_norm is enabled and the model graph
|
||||
// sets llm_graph_result::t_h_pre_norm
|
||||
buffer_view<float> embd_pre_norm = {nullptr, 0};
|
||||
|
||||
struct sampling_info {
|
||||
// !samplers.empty() to check if any samplers are active
|
||||
std::map<llama_seq_id, llama_sampler *> samplers;
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ struct llama_cparams {
|
|||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
|
|
@ -27,6 +28,8 @@ struct llama_cparams {
|
|||
float yarn_beta_slow;
|
||||
|
||||
bool embeddings;
|
||||
bool embeddings_pre_norm; // also extract the hidden state before the final output norm
|
||||
bool embeddings_pre_norm_masked; // extract for only rows where batch.logits != 0
|
||||
bool causal_attn;
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
|
|
@ -40,6 +43,7 @@ struct llama_cparams {
|
|||
bool kv_unified;
|
||||
bool pipeline_parallel;
|
||||
|
||||
enum llama_context_type ctx_type;
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
|
|
|
|||
|
|
@ -88,3 +88,19 @@ LLAMA_API int32_t llama_model_n_devices(const struct llama_model * model);
|
|||
LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * model, int i);
|
||||
|
||||
LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// pre-norm embeddings (hidden state before the final output norm)
|
||||
//
|
||||
|
||||
// Set whether the context outputs pre-norm embeddings or not
|
||||
// If masked == true, output the embeddings only for the tokens with batch.logits != 0
|
||||
// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
|
||||
LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value, bool masked);
|
||||
|
||||
// mirrors:
|
||||
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
LLAMA_API float * llama_get_embeddings_pre_norm (struct llama_context * ctx);
|
||||
|
||||
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i);
|
||||
|
|
|
|||
|
|
@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
|
|||
}
|
||||
|
||||
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
|
||||
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
// base tensors may not be allocated if there are no non-SWA attention layers
|
||||
if (self_k_idxs && self_k_idxs->buffer) {
|
||||
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
|
||||
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
|
||||
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
|
||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||
// swa tensors may not be allocated if there are no SWA attention layers
|
||||
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
|
||||
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
|
||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
if (self_k_rot) {
|
||||
mctx->get_base()->set_input_k_rot(self_k_rot);
|
||||
|
|
@ -534,14 +540,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
|||
|
||||
bool res = true;
|
||||
|
||||
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
// base tensors may not be allocated if there are no non-SWA attention layers
|
||||
if (self_k_idxs && self_k_idxs->buffer) {
|
||||
res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
|
||||
}
|
||||
|
||||
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
|
||||
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
|
||||
// swa tensors may not be allocated if there are no SWA attention layers
|
||||
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
|
||||
res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
|
||||
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
|
||||
|
||||
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
@ -848,6 +861,9 @@ void llm_graph_result::set_outputs() {
|
|||
if (t_embd_pooled != nullptr) {
|
||||
ggml_set_output(t_embd_pooled);
|
||||
}
|
||||
if (t_h_pre_norm != nullptr) {
|
||||
ggml_set_output(t_h_pre_norm);
|
||||
}
|
||||
for (auto & [seq_id, t] : t_sampled) {
|
||||
if (t != nullptr) {
|
||||
ggml_set_output(t);
|
||||
|
|
@ -2528,7 +2544,8 @@ ggml_tensor * llm_graph_context::build_rs(
|
|||
int32_t rs_zero,
|
||||
const llm_graph_get_rows_fn & get_state_rows) const {
|
||||
|
||||
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, rs_size);
|
||||
GGML_UNUSED(rs_size);
|
||||
ggml_tensor * states = ggml_reshape_2d(ctx0, s, state_size, s->ne[1]);
|
||||
|
||||
// Clear a single state which will then be copied to the other cleared states.
|
||||
// Note that this is a no-op when the view is zero-sized.
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ enum llm_graph_type {
|
|||
LLM_GRAPH_TYPE_DEFAULT,
|
||||
LLM_GRAPH_TYPE_ENCODER,
|
||||
LLM_GRAPH_TYPE_DECODER,
|
||||
LLM_GRAPH_TYPE_DECODER_MTP,
|
||||
};
|
||||
|
||||
enum llm_ffn_op_type {
|
||||
|
|
@ -580,7 +581,8 @@ struct llm_graph_params {
|
|||
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
|
||||
(
|
||||
(!ubatch.token && !other.ubatch.token) ||
|
||||
(!ubatch.embd && !other.ubatch.embd)
|
||||
(!ubatch.embd && !other.ubatch.embd) ||
|
||||
(ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd)
|
||||
);
|
||||
|
||||
// when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
|
||||
|
|
@ -644,6 +646,7 @@ public:
|
|||
ggml_tensor * get_logits() const { return t_logits; }
|
||||
ggml_tensor * get_embd() const { return t_embd; }
|
||||
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
|
||||
ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; }
|
||||
|
||||
ggml_cgraph * get_gf() const { return gf; }
|
||||
ggml_context * get_ctx() const { return ctx_compute.get(); }
|
||||
|
|
@ -672,6 +675,7 @@ public:
|
|||
ggml_tensor * t_logits = nullptr;
|
||||
ggml_tensor * t_embd = nullptr;
|
||||
ggml_tensor * t_embd_pooled = nullptr;
|
||||
ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm
|
||||
|
||||
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
|
||||
std::map<llama_seq_id, ggml_tensor*> t_candidates;
|
||||
|
|
|
|||
|
|
@ -229,6 +229,12 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
|
|||
}
|
||||
|
||||
bool llama_hparams::has_kv(uint32_t il) const {
|
||||
if (kv_only_nextn) {
|
||||
// MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
|
||||
// the leading trunk blocks are not executed in this graph.
|
||||
return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
|
||||
}
|
||||
|
||||
if (n_layer_kv_from_start >= 0) {
|
||||
if (il < (uint32_t) n_layer_kv_from_start) {
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -92,6 +92,8 @@ struct llama_hparams {
|
|||
uint32_t moe_latent_size = 0;
|
||||
uint32_t nextn_predict_layers = 0;
|
||||
|
||||
bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
float f_norm_group_eps;
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
|
|||
uint32_t rs_size,
|
||||
/* common */
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_rs_seq,
|
||||
bool offload,
|
||||
bool unified,
|
||||
/* layer filters */
|
||||
|
|
@ -54,6 +55,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
|
|||
offload,
|
||||
rs_size,
|
||||
n_seq_max,
|
||||
n_rs_seq,
|
||||
filter_recr == nullptr ?
|
||||
[&](int32_t il) { return hparams.is_recurrent(il); }
|
||||
: filter_recr
|
||||
|
|
@ -73,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
|
|||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
if (mem_recr->n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ public:
|
|||
uint32_t rs_size,
|
||||
/* common */
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_rs_seq,
|
||||
bool offload,
|
||||
bool unified,
|
||||
/* layer filters */
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ llama_memory_hybrid::llama_memory_hybrid(
|
|||
uint32_t rs_size,
|
||||
/* common */
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_rs_seq,
|
||||
bool offload,
|
||||
bool unified,
|
||||
/* layer filters */
|
||||
|
|
@ -54,6 +55,7 @@ llama_memory_hybrid::llama_memory_hybrid(
|
|||
offload,
|
||||
rs_size,
|
||||
n_seq_max,
|
||||
n_rs_seq,
|
||||
filter_recr == nullptr ?
|
||||
[&](int32_t il) { return hparams.is_recurrent(il); }
|
||||
: filter_recr
|
||||
|
|
@ -73,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
|||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
if (mem_recr->n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ public:
|
|||
uint32_t rs_size,
|
||||
/* common */
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_rs_seq,
|
||||
bool offload,
|
||||
bool unified,
|
||||
/* layer filters */
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|||
bool offload,
|
||||
uint32_t mem_size,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_rs_seq,
|
||||
const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
|
||||
const int32_t n_layer = hparams.n_layer;
|
||||
|
||||
|
|
@ -31,6 +32,9 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|||
size = mem_size;
|
||||
used = 0;
|
||||
|
||||
this->n_rs_seq = n_rs_seq;
|
||||
rs_idx.assign(n_seq_max, 0);
|
||||
|
||||
cells.clear();
|
||||
cells.resize(mem_size);
|
||||
|
||||
|
|
@ -92,8 +96,9 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|||
throw std::runtime_error("failed to create ggml context for rs cache");
|
||||
}
|
||||
|
||||
ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), mem_size);
|
||||
ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), mem_size);
|
||||
const uint32_t n_rows = mem_size * (1 + n_rs_seq);
|
||||
ggml_tensor * r = ggml_new_tensor_2d(ctx, type_r, hparams.n_embd_r(), n_rows);
|
||||
ggml_tensor * s = ggml_new_tensor_2d(ctx, type_s, hparams.n_embd_s(), n_rows);
|
||||
ggml_format_name(r, "cache_r_l%d", i);
|
||||
ggml_format_name(s, "cache_s_l%d", i);
|
||||
r_l[i] = r;
|
||||
|
|
@ -115,8 +120,8 @@ llama_memory_recurrent::llama_memory_recurrent(
|
|||
const size_t memory_size_r = size_r_bytes();
|
||||
const size_t memory_size_s = size_s_bytes();
|
||||
|
||||
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
|
||||
(float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max,
|
||||
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs %2u rs_seq), R (%s): %7.2f MiB, S (%s): %7.2f MiB\n", __func__,
|
||||
(float)(memory_size_r + memory_size_s) / (1024.0f * 1024.0f), mem_size, n_layer, n_seq_max, n_rs_seq,
|
||||
ggml_type_name(type_r), (float)memory_size_r / (1024.0f * 1024.0f),
|
||||
ggml_type_name(type_s), (float)memory_size_s / (1024.0f * 1024.0f));
|
||||
}
|
||||
|
|
@ -138,10 +143,11 @@ void llama_memory_recurrent::clear(bool data) {
|
|||
ggml_backend_buffer_clear(buf.get(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
std::fill(rs_idx.begin(), rs_idx.end(), 0);
|
||||
}
|
||||
|
||||
bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
//printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
|
||||
uint32_t new_head = size;
|
||||
|
||||
if (p0 < 0) {
|
||||
|
|
@ -152,6 +158,15 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|||
p1 = std::numeric_limits<llama_pos>::max();
|
||||
}
|
||||
|
||||
const bool rm_all = p0 == 0 && p1 == std::numeric_limits<llama_pos>::max();
|
||||
if (rm_all) {
|
||||
if (seq_id >= 0) {
|
||||
set_rs_idx(seq_id, 0);
|
||||
} else {
|
||||
std::fill(rs_idx.begin(), rs_idx.end(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
// models like Mamba or RWKV can't have a state partially erased at the end
|
||||
// of the sequence because their state isn't preserved for previous tokens
|
||||
if (seq_id >= (int64_t) size) {
|
||||
|
|
@ -161,10 +176,16 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
|
|||
if (0 <= seq_id) {
|
||||
int32_t & tail_id = cells[seq_id].tail;
|
||||
if (tail_id >= 0) {
|
||||
const auto & cell = cells[tail_id];
|
||||
// partial intersection is invalid if it includes the final pos
|
||||
auto & cell = cells[tail_id];
|
||||
|
||||
// partial rollback via per-token snapshot index (bounded by n_rs_seq)
|
||||
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
|
||||
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
|
||||
const llama_pos rollback = cell.pos - (p0 - 1);
|
||||
if (rollback >= 1 && rollback <= (llama_pos) n_rs_seq) {
|
||||
set_rs_idx(seq_id, (uint32_t) rollback);
|
||||
cell.pos = p0 - 1;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// invalidate tails which will be cleared
|
||||
|
|
@ -368,6 +389,13 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
|
|||
return result;
|
||||
}
|
||||
|
||||
// Store the rollback snapshot index for a sequence.
// Out-of-range sequence ids are ignored; the index is clamped to n_rs_seq.
void llama_memory_recurrent::set_rs_idx(llama_seq_id seq_id, uint32_t idx) {
    const bool seq_in_range = seq_id >= 0 && (size_t) seq_id < rs_idx.size();

    if (seq_in_range) {
        uint32_t clamped = idx;
        if (clamped > n_rs_seq) {
            clamped = n_rs_seq;
        }
        rs_idx[seq_id] = clamped;
    }
}
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, size_t> ret;
|
||||
for (const auto & [_, buf] : ctxs_bufs) {
|
||||
|
|
@ -388,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
|
|||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
if (n_rs_seq > 0) {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: recurrent state rollback does not support equal splits
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
|
@ -703,6 +737,7 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
|
|||
GGML_UNUSED(flags);
|
||||
|
||||
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
||||
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges_data; // logical source row ranges
|
||||
uint32_t cell_count = 0;
|
||||
|
||||
// Count the number of cells with the specified seq_id
|
||||
|
|
@ -712,6 +747,35 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
|
|||
const auto & cell = cells[i];
|
||||
if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
|
||||
++cell_count;
|
||||
uint32_t rs_idx_cur = 0;
|
||||
|
||||
if (n_rs_seq != 0) {
|
||||
if (seq_id != -1) {
|
||||
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < rs_idx.size());
|
||||
rs_idx_cur = rs_idx[seq_id];
|
||||
} else {
|
||||
bool has_rs_idx = false;
|
||||
for (const llama_seq_id cell_seq_id : cell.seq_id) {
|
||||
GGML_ASSERT(cell_seq_id >= 0 && (size_t) cell_seq_id < rs_idx.size());
|
||||
|
||||
const uint32_t seq_rs_idx = rs_idx[cell_seq_id];
|
||||
if (!has_rs_idx) {
|
||||
rs_idx_cur = seq_rs_idx;
|
||||
has_rs_idx = true;
|
||||
} else if (rs_idx_cur != seq_rs_idx) {
|
||||
GGML_ABORT("cannot write shared recurrent state with different rollback indices");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t cell_id = rs_idx_cur * size + (cell.src >= 0 ? cell.src : (int32_t) i);
|
||||
if (cell_ranges_data.empty() || cell_ranges_data.back().second != cell_id) {
|
||||
cell_ranges_data.emplace_back(cell_id, cell_id + 1);
|
||||
} else {
|
||||
cell_ranges_data.back().second++;
|
||||
}
|
||||
|
||||
if (cell_range_begin == size) {
|
||||
cell_range_begin = i;
|
||||
}
|
||||
|
|
@ -726,7 +790,7 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
|
|||
cell_ranges.emplace_back(cell_range_begin, size);
|
||||
}
|
||||
|
||||
if (flags % LLAMA_STATE_SEQ_FLAGS_ON_DEVICE && cell_ranges.size() > 1) {
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_ON_DEVICE) && cell_ranges.size() > 1) {
|
||||
GGML_ABORT("cannot save/load multiple ranges of cells to/from device memory\n");
|
||||
}
|
||||
|
||||
|
|
@ -737,10 +801,16 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
|
|||
}
|
||||
GGML_ASSERT(cell_count == cell_count_check);
|
||||
|
||||
cell_count_check = 0;
|
||||
for (const auto & range : cell_ranges_data) {
|
||||
cell_count_check += range.second - range.first;
|
||||
}
|
||||
GGML_ASSERT(cell_count == cell_count_check);
|
||||
|
||||
io.write(&cell_count, sizeof(cell_count));
|
||||
|
||||
state_write_meta(io, cell_ranges, seq_id);
|
||||
state_write_data(io, cell_ranges);
|
||||
state_write_data(io, cell_ranges_data);
|
||||
}
|
||||
|
||||
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
|
|
@ -762,6 +832,14 @@ void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_i
|
|||
}
|
||||
throw std::runtime_error("failed to restore kv cache");
|
||||
}
|
||||
|
||||
if (n_rs_seq != 0) {
|
||||
if (seq_id == -1) {
|
||||
std::fill(rs_idx.begin(), rs_idx.end(), 0);
|
||||
} else {
|
||||
set_rs_idx(seq_id, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
|
||||
|
|
@ -804,7 +882,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|||
const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
|
||||
io.write(&r_size_row, sizeof(r_size_row));
|
||||
|
||||
// Write each range of cells of r_size_row length
|
||||
// Write each logical cell row range. With pending recurrent rollback,
|
||||
// the logical current state may live in a rollback snapshot plane.
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * r_size_row;
|
||||
|
|
@ -825,7 +904,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|||
const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
|
||||
io.write(&s_size_row, sizeof(s_size_row));
|
||||
|
||||
// Write each range of S tensor rows
|
||||
// Write each logical cell row range. With pending recurrent rollback,
|
||||
// the logical current state may live in a rollback snapshot plane.
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t buf_size = range_size * s_size_row;
|
||||
|
|
@ -852,9 +932,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
|||
// Write GQA embedding size
|
||||
io.write(&n_embd_s, sizeof(n_embd_s));
|
||||
|
||||
// For each row, we get the element values of each cell
|
||||
// For each row, we get the element values of each logical cell
|
||||
for (uint32_t j = 0; j < n_embd_s; ++j) {
|
||||
// Write each range of cells of s_size_el length
|
||||
for (const auto & range : cell_ranges) {
|
||||
const size_t range_size = range.second - range.first;
|
||||
const size_t src_offset = (range.first + j * mem_size) * s_size_el;
|
||||
|
|
@ -1163,5 +1242,21 @@ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
|
|||
}
|
||||
|
||||
int32_t llama_memory_recurrent_context::s_copy(int i) const {
|
||||
return mem->cells[i + mem->head].src0;
|
||||
const uint32_t cell_idx = i + mem->head;
|
||||
const int32_t src0 = mem->cells[cell_idx].src0;
|
||||
|
||||
if (mem->n_rs_seq == 0) {
|
||||
return src0;
|
||||
}
|
||||
|
||||
uint32_t idx = 0;
|
||||
if (!mem->cells[cell_idx].seq_id.empty()) {
|
||||
const llama_seq_id seq = *mem->cells[cell_idx].seq_id.begin();
|
||||
if (seq >= 0 && (size_t) seq < mem->rs_idx.size()) {
|
||||
idx = mem->rs_idx[seq];
|
||||
// reset rollback idx
|
||||
mem->rs_idx[seq] = 0;
|
||||
}
|
||||
}
|
||||
return (int32_t)(idx * mem->size) + src0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ public:
|
|||
bool offload,
|
||||
uint32_t mem_size,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_rs_seq,
|
||||
const layer_filter_cb & filter);
|
||||
|
||||
~llama_memory_recurrent() = default;
|
||||
|
|
@ -69,6 +70,14 @@ public:
|
|||
uint32_t size = 0; // total number of cells, shared across all sequences
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
||||
|
||||
// number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups
|
||||
uint32_t n_rs_seq = 0;
|
||||
|
||||
// per-seq rollback index
|
||||
std::vector<uint32_t> rs_idx;
|
||||
|
||||
void set_rs_idx(llama_seq_id seq_id, uint32_t idx);
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-graph.h"
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
|
@ -20,6 +21,8 @@ struct llama_memory_params {
|
|||
|
||||
// use full-size SWA cache
|
||||
bool swa_full;
|
||||
|
||||
llama_context_type ctx_type;
|
||||
};
|
||||
|
||||
enum llama_memory_status {
|
||||
|
|
|
|||
|
|
@ -1312,9 +1312,16 @@ struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_conte
|
|||
return tensor;
|
||||
}
|
||||
|
||||
void llama_model_loader::done_getting_tensors() const {
|
||||
if (n_created != n_tensors) {
|
||||
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
||||
void llama_model_loader::done_getting_tensors(bool partial) const {
|
||||
if (n_created > n_tensors) {
|
||||
throw std::runtime_error(format("%s: too many tensors created; expected %d, got %d", __func__, n_tensors, n_created));
|
||||
}
|
||||
if (n_created < n_tensors) {
|
||||
if (!partial) {
|
||||
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: partial load — used %d of %d tensors in the file (rest belong to a sibling model on the same .gguf)\n",
|
||||
__func__, n_created, n_tensors);
|
||||
}
|
||||
if (n_tensors_moved > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
|
||||
|
|
|
|||
|
|
@ -184,7 +184,7 @@ struct llama_model_loader {
|
|||
|
||||
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
|
||||
|
||||
void done_getting_tensors() const;
|
||||
void done_getting_tensors(bool partial = false) const;
|
||||
|
||||
void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);
|
||||
|
||||
|
|
|
|||
|
|
@ -393,6 +393,8 @@ void llama_model_saver::add_tensors_from_model() {
|
|||
add_tensor(model->output);
|
||||
add_tensor(model->output_b);
|
||||
add_tensor(model->output_norm_enc);
|
||||
add_tensor(model->output_s);
|
||||
add_tensor(model->output_in_s);
|
||||
add_tensor(model->cls);
|
||||
add_tensor(model->cls_b);
|
||||
add_tensor(model->cls_out);
|
||||
|
|
|
|||
|
|
@ -1334,6 +1334,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
|||
if (!layer.ssm_beta_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) {
|
||||
layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) {
|
||||
layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// input scales
|
||||
if (!layer.wq_in_s && layer.wq) {
|
||||
|
|
@ -1393,11 +1399,30 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
|||
if (!layer.ssm_beta_in_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) {
|
||||
layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) {
|
||||
layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
// output scales
|
||||
if (output && output->type == GGML_TYPE_NVFP4) {
|
||||
// weight scale
|
||||
if (!output_s) {
|
||||
output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
// input scale
|
||||
if (!output_in_s) {
|
||||
output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ml.done_getting_tensors();
|
||||
|
||||
GGML_ASSERT(!(output && tok_embd &&
|
||||
strcmp(output->name, tok_embd->name) == 0 &&
|
||||
output->type == GGML_TYPE_NVFP4));
|
||||
// populate tensors_by_name
|
||||
for (auto & [_, ctx_ptr] : ml.ctx_map) {
|
||||
for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) {
|
||||
|
|
@ -1934,6 +1959,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
// checks
|
||||
default:
|
||||
{
|
||||
// The MTP head is dense-attention only on hybrid Qwen3.5/3.6, so use a plain
|
||||
// attention KV cache for the MTP context instead of the hybrid wrapper.
|
||||
const bool mtp_on_hybrid_qwen35 =
|
||||
params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
|
||||
(arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE);
|
||||
|
||||
if (llm_arch_is_recurrent(arch)) {
|
||||
res = new llama_memory_recurrent(
|
||||
*this,
|
||||
|
|
@ -1942,8 +1973,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
cparams.offload_kqv,
|
||||
std::max((uint32_t) 1, cparams.n_seq_max),
|
||||
cparams.n_seq_max,
|
||||
cparams.n_rs_seq,
|
||||
nullptr);
|
||||
} else if (llm_arch_is_hybrid(arch)) {
|
||||
} else if (llm_arch_is_hybrid(arch) && !mtp_on_hybrid_qwen35) {
|
||||
// The main difference between hybrid architectures is the
|
||||
// layer filters, so pick the right one here
|
||||
llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
|
||||
|
|
@ -1958,6 +1990,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
filter_recr = [&](int32_t il) {
|
||||
return hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
|
||||
};
|
||||
} else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
filter_attn = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && !hparams.is_recurrent(il);
|
||||
};
|
||||
filter_recr = [&, n_main](int32_t il) {
|
||||
return (uint32_t)il < n_main && hparams.is_recurrent(il);
|
||||
};
|
||||
}
|
||||
|
||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
|
|
@ -1975,6 +2015,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
/* recurrent_type_s */ GGML_TYPE_F32,
|
||||
/* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
||||
/* n_seq_max */ cparams.n_seq_max,
|
||||
/* n_rs_seq */ cparams.n_rs_seq,
|
||||
/* offload */ cparams.offload_kqv,
|
||||
/* unified */ cparams.kv_unified,
|
||||
/* filter_attn */ std::move(filter_attn),
|
||||
|
|
@ -1993,6 +2034,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
/* recurrent_type_v */ GGML_TYPE_F32,
|
||||
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
|
||||
/* n_seq_max */ cparams.n_seq_max,
|
||||
/* n_rs_seq */ cparams.n_rs_seq,
|
||||
/* offload */ cparams.offload_kqv,
|
||||
/* unified */ cparams.kv_unified,
|
||||
/* filter_attn */ std::move(filter_attn),
|
||||
|
|
@ -2000,6 +2042,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
}
|
||||
} else {
|
||||
llama_memory_i::layer_reuse_cb reuse = nullptr;
|
||||
llama_kv_cache::layer_filter_cb filter = nullptr;
|
||||
|
||||
if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) {
|
||||
reuse = [&](int32_t il) {
|
||||
|
|
@ -2011,6 +2054,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
};
|
||||
}
|
||||
|
||||
if (mtp_on_hybrid_qwen35) {
|
||||
const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers;
|
||||
filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; };
|
||||
}
|
||||
|
||||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
GGML_ASSERT(hparams.is_swa_any());
|
||||
|
||||
|
|
@ -2026,7 +2074,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
cparams.n_seq_max,
|
||||
cparams.n_ubatch,
|
||||
1,
|
||||
nullptr,
|
||||
filter,
|
||||
reuse);
|
||||
} else {
|
||||
GGML_ASSERT(!hparams.is_swa_any());
|
||||
|
|
@ -2043,7 +2091,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
|
|||
1,
|
||||
hparams.n_swa,
|
||||
hparams.swa_type,
|
||||
nullptr,
|
||||
filter,
|
||||
nullptr);
|
||||
}
|
||||
}
|
||||
|
|
@ -2146,6 +2194,7 @@ int32_t llama_model_n_swa(const llama_model * model) {
|
|||
return model->hparams.n_swa;
|
||||
}
|
||||
|
||||
|
||||
uint32_t llama_model_n_cls_out(const struct llama_model * model) {
|
||||
return model->hparams.n_cls_out;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -202,12 +202,16 @@ struct llama_layer_shortconv {
|
|||
};
|
||||
|
||||
// Per-layer NextN/MTP tensors. All pointers default to nullptr.
// The `_s` / `_in_s` members hold the optional "scale" / "input_scale"
// tensors that accompany the corresponding projection weight.
struct llama_layer_nextn {
    struct ggml_tensor * eh_proj               = nullptr;
    struct ggml_tensor * eh_proj_s             = nullptr; // "scale" for eh_proj
    struct ggml_tensor * eh_proj_in_s          = nullptr; // "input_scale" for eh_proj
    struct ggml_tensor * embed_tokens          = nullptr;
    struct ggml_tensor * enorm                 = nullptr;
    struct ggml_tensor * hnorm                 = nullptr;
    struct ggml_tensor * shared_head_head      = nullptr;
    struct ggml_tensor * shared_head_head_s    = nullptr; // "scale" for shared_head_head
    struct ggml_tensor * shared_head_head_in_s = nullptr; // "input_scale" for shared_head_head
    struct ggml_tensor * shared_head_norm      = nullptr;
};
|
||||
|
||||
struct llama_layer {
|
||||
|
|
@ -533,6 +537,11 @@ struct llama_model {
|
|||
struct ggml_tensor * output_b = nullptr;
|
||||
struct ggml_tensor * output_norm_enc = nullptr;
|
||||
|
||||
|
||||
// NVFP4 per-tensor scale2, input_scale for LM head
|
||||
struct ggml_tensor * output_s = nullptr;
|
||||
struct ggml_tensor * output_in_s = nullptr;
|
||||
|
||||
// classifier
|
||||
struct ggml_tensor * cls = nullptr;
|
||||
struct ggml_tensor * cls_b = nullptr;
|
||||
|
|
|
|||
|
|
@ -530,6 +530,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||
struct llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
|
||||
|
||||
virtual ~llm_tokenizer_bpe_session() = default;
|
||||
|
||||
static void append(const llama_token token_id, std::vector<llama_token> & output) {
|
||||
output.push_back(token_id);
|
||||
}
|
||||
|
|
@ -567,7 +569,7 @@ struct llm_tokenizer_bpe_session {
|
|||
}
|
||||
}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
virtual void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
||||
int final_prev_index = -1;
|
||||
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
|
||||
|
||||
|
|
@ -1579,6 +1581,88 @@ private:
|
|||
const llm_tokenizer_plamo2 & tokenizer;
|
||||
};
|
||||
|
||||
// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
|
||||
// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
|
||||
// text at load
|
||||
static const std::string dna_kmer_marker = "\xee\x80\x80";
|
||||
|
||||
struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
|
||||
|
||||
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
|
||||
static const std::string open_tag = "<dna>";
|
||||
static const std::string close_tag = "</dna>";
|
||||
|
||||
const auto dna_begin_id = vocab.text_to_token(open_tag);
|
||||
const auto dna_end_id = vocab.text_to_token(close_tag);
|
||||
const auto dna_oov_id = vocab.text_to_token("<oov>");
|
||||
|
||||
// Fall back to plain BPE if the DNA pieces aren't in the vocab.
|
||||
if (dna_begin_id == LLAMA_TOKEN_NULL || dna_end_id == LLAMA_TOKEN_NULL || dna_oov_id == LLAMA_TOKEN_NULL) {
|
||||
llm_tokenizer_bpe_session::tokenize(text, output);
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t k = 6;
|
||||
size_t pos = 0;
|
||||
|
||||
while (pos < text.size()) {
|
||||
const size_t start = text.find(open_tag, pos);
|
||||
if (start == std::string::npos) {
|
||||
if (pos < text.size()) {
|
||||
llm_tokenizer_bpe_session::tokenize(text.substr(pos), output);
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (start > pos) {
|
||||
llm_tokenizer_bpe_session::tokenize(text.substr(pos, start - pos), output);
|
||||
}
|
||||
output.push_back(dna_begin_id);
|
||||
|
||||
const size_t content_start = start + open_tag.size();
|
||||
const size_t end = text.find(close_tag, content_start);
|
||||
const size_t content_end = (end == std::string::npos) ? text.size() : end;
|
||||
|
||||
emit_dna_kmers(text.substr(content_start, content_end - content_start), k, dna_oov_id, output);
|
||||
|
||||
if (end == std::string::npos) {
|
||||
break;
|
||||
}
|
||||
output.push_back(dna_end_id);
|
||||
pos = end + close_tag.size();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void emit_dna_kmers(const std::string & raw, size_t k, llama_token oov_id, std::vector<llama_token> & output) {
|
||||
std::string seq = raw;
|
||||
for (char & c : seq) {
|
||||
if (c >= 'a' && c <= 'z') {
|
||||
c = char(c - 32);
|
||||
}
|
||||
}
|
||||
|
||||
// k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
|
||||
// isn't in the vocab and falls back to <oov>
|
||||
auto kmer_token = [&](const std::string & kmer) {
|
||||
const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
|
||||
return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
for (; i + k <= seq.size(); i += k) {
|
||||
output.push_back(kmer_token(seq.substr(i, k)));
|
||||
}
|
||||
if (i < seq.size()) {
|
||||
std::string kmer = seq.substr(i);
|
||||
kmer.append(k - kmer.size(), 'A');
|
||||
output.push_back(kmer_token(kmer));
|
||||
}
|
||||
}
|
||||
|
||||
const llama_vocab & vocab;
|
||||
};
|
||||
|
||||
//
|
||||
// impl
|
||||
//
|
||||
|
|
@ -1808,7 +1892,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
special_mask_id = 103;
|
||||
|
||||
add_sep = true;
|
||||
} else if (tokenizer_model == "gpt2") {
|
||||
} else if (tokenizer_model == "gpt2" || tokenizer_model == "hybriddna") {
|
||||
type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
// read bpe merges and populate bpe ranks
|
||||
|
|
@ -2266,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
}
|
||||
GGML_ASSERT(id_to_token.size() == token_to_id.size());
|
||||
|
||||
// hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
|
||||
// it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
|
||||
// k-mers are the block right after <oov>, so only scan from there.
|
||||
if (tokenizer_model == "hybriddna") {
|
||||
const auto idx = token_to_id.find("<oov>");
|
||||
if (idx != token_to_id.end()) {
|
||||
auto it = id_to_token.begin() + idx->second + 1;
|
||||
for (; it != id_to_token.end(); ++it) {
|
||||
std::string & text = it->text;
|
||||
if (text.size() > dna_kmer_marker.size()
|
||||
&& text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
|
||||
text.erase(text.size() - dna_kmer_marker.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
init_tokenizer(type);
|
||||
|
||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||
|
|
@ -3144,11 +3245,19 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
|||
} break;
|
||||
case LLAMA_VOCAB_TYPE_BPE:
|
||||
{
|
||||
llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
|
||||
// it calls some other methods that do not exist in llm_tokenizer,
|
||||
// here just cast it to bpe tokenizer object
|
||||
const llm_tokenizer_bpe * tok_bpe = static_cast<const llm_tokenizer_bpe *>(tokenizer.get());
|
||||
|
||||
std::unique_ptr<llm_tokenizer_bpe_session> session;
|
||||
if (vocab.get_tokenizer_model() == "hybriddna") {
|
||||
session = std::make_unique<llm_tokenizer_hybriddna_session>(vocab, *tok_bpe);
|
||||
} else {
|
||||
session = std::make_unique<llm_tokenizer_bpe_session>(vocab, *tok_bpe);
|
||||
}
|
||||
|
||||
if (add_special) {
|
||||
session.append_bos(output);
|
||||
session->append_bos(output);
|
||||
}
|
||||
for (const auto & fragment : fragment_buffer) {
|
||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||
|
|
@ -3161,15 +3270,15 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
|
|||
#ifdef PRETOKENIZERDEBUG
|
||||
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
|
||||
#endif
|
||||
session.tokenize(text, output);
|
||||
session->tokenize(text, output);
|
||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||
session.append(fragment.token, output);
|
||||
session->append(fragment.token, output);
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special) {
|
||||
session.append_eos(output);
|
||||
session.check_double_bos_eos(output);
|
||||
session->append_eos(output);
|
||||
session->check_double_bos_eos(output);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
|
|
|
|||
|
|
@ -198,6 +198,11 @@ extern "C" {
|
|||
LLAMA_SPLIT_MODE_TENSOR = 3,
|
||||
};
|
||||
|
||||
// Kind of context; this enum is the type of the `ctx_type` parameter field
// ("set the context type (e.g. MTP)").
enum llama_context_type {
|
||||
// regular context (value 0)
LLAMA_CONTEXT_TYPE_DEFAULT = 0,
|
||||
// NOTE(review): MTP presumably refers to the NextN/MTP prediction layers — confirm
LLAMA_CONTEXT_TYPE_MTP = 1,
|
||||
};
|
||||
|
||||
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
|
|
@ -333,9 +338,11 @@ extern "C" {
|
|||
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
|
||||
uint32_t n_ubatch; // physical maximum batch size
|
||||
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
||||
uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
enum llama_context_type ctx_type; // set the context type (e.g. MTP)
|
||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||
enum llama_attention_type attention_type; // attention type to use for embeddings
|
||||
|
|
@ -530,6 +537,7 @@ extern "C" {
|
|||
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
||||
LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx);
|
||||
|
||||
DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
|
||||
DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
|
||||
|
|
@ -866,7 +874,8 @@ extern "C" {
|
|||
// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
|
||||
#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
|
||||
|
||||
// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load)
|
||||
// Keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load).
|
||||
// Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag.
|
||||
#define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
|
||||
|
||||
typedef uint32_t llama_state_seq_flags;
|
||||
|
|
|
|||
|
|
@ -277,7 +277,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
|
|
|
|||
|
|
@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -148,7 +148,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -171,7 +171,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -193,7 +193,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -146,7 +146,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -171,7 +171,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -210,7 +210,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -142,7 +142,7 @@ llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_param
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_p
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output_with_img_logits", -1);
|
||||
|
||||
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
|
||||
|
|
|
|||
|
|
@ -151,7 +151,7 @@ llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_par
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -143,7 +143,7 @@ llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_p
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -150,7 +150,7 @@ llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_para
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
|
|
|||
|
|
@ -146,7 +146,7 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
if (f_logit_scale) {
|
||||
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
||||
|
|
|
|||
|
|
@ -131,7 +131,7 @@ llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_p
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
if (f_logit_scale) {
|
||||
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
||||
|
|
|
|||
|
|
@ -145,7 +145,7 @@ llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -185,7 +185,7 @@ llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-memory-recurrent.h"
|
||||
|
||||
// utility to get one slice from the third dimension
|
||||
// input dim: [x, y, c, b]
|
||||
|
|
@ -397,7 +398,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
|
||||
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
|
||||
// K=1 (final state only): reshape to 3D (S_v*S_v*H_v, 1, n_seqs) for ggml_gated_delta_net.
|
||||
ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, S_v * S_v * H_v, 1, n_seqs);
|
||||
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d);
|
||||
if (n_tokens == 1) {
|
||||
cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
|
||||
} else {
|
||||
|
|
@ -443,3 +446,162 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
|
||||
return build_delta_net_chunking(q, k, v, g, b, s, il);
|
||||
}
|
||||
|
||||
// Read the conv state from the recurrent cache, concatenate it with the new
// qkv_mixed columns, and schedule the write-back of the updated state.
//   inp              - recurrent-state graph input; its mctx supplies the cache head and size
//   conv_states_all  - cache tensor holding the conv states of all cells
//   qkv_mixed        - new input, (qkv_dim, n_seq_tokens, n_seqs) per the declaration comment
//   conv_kernel_size - conv kernel width; the stored state keeps (conv_kernel_size - 1) columns
//   conv_channels    - number of conv channels
//   il               - layer index forwarded to the cb() naming callback
// Returns conv_input, (kernel_size + n_seq_tokens - 1, channels, n_seqs) per the
// declaration comment. The cache copies are only added to the graph `gf` here.
ggml_tensor * llm_build_delta_net_base::build_conv_state(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * conv_states_all,
|
||||
ggml_tensor * qkv_mixed,
|
||||
int64_t conv_kernel_size,
|
||||
int64_t conv_channels,
|
||||
int il) {
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
|
||||
// first cache cell used by this ubatch, and the cell count used as the per-slot stride below
const auto kv_head = mctx_cur->get_head();
|
||||
const auto mem_size = mctx_cur->get_size();
|
||||
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
|
||||
// fetch the stored conv state for the sequences of this ubatch
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
|
||||
cb(conv_states, "conv_states", il);
|
||||
|
||||
// view the state as (conv_kernel_size - 1, conv_channels, n_seqs)
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
|
||||
cb(conv_states, "conv_states_reshaped", il);
|
||||
|
||||
// transpose so the token axis becomes dim 0 and lines up with the state columns
qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
|
||||
cb(qkv_mixed, "qkv_mixed_transposed", il);
|
||||
|
||||
// old state columns followed by the new token columns (concat along dim 0)
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
|
||||
cb(conv_input, "conv_input", il);
|
||||
|
||||
// element count of one sequence's state in the cache
const int64_t row_count = (conv_kernel_size - 1) * conv_channels;
|
||||
|
||||
// the same amount expressed in bytes for the cache tensor's type
const size_t row_size = ggml_row_size(conv_states_all->type, row_count);
|
||||
|
||||
if (cparams.n_rs_seq == 0) {
|
||||
// no rollback snapshots: keep only the trailing (conv_kernel_size - 1) columns, in slot 0
const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0];
|
||||
const int64_t s_slot = 0;
|
||||
|
||||
// window of conv_input that starts s_idx elements into dim 0
ggml_tensor * conv_state_last =
|
||||
ggml_view_3d(ctx0, conv_input,
|
||||
conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
ggml_row_size(conv_input->type, s_idx));
|
||||
cb(conv_state_last, "conv_state_last", il);
|
||||
|
||||
// destination rows in the cache, starting at cell kv_head of slot s_slot
ggml_tensor * conv_state_update =
|
||||
ggml_view_2d(ctx0, conv_states_all,
|
||||
row_count, n_seqs, conv_states_all->nb[1],
|
||||
(s_slot * mem_size + kv_head) * row_size);
|
||||
cb(conv_state_update, "conv_state_update", il);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
|
||||
} else {
|
||||
// [TAG_RECURRENT_ROLLBACK_SPLITS]
|
||||
// TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are
|
||||
// inside the same ubatch. currently with `split_equal()` this is not correct
|
||||
|
||||
// number of snapshots written: n_rs_seq rollback states plus the current one
const int64_t K = (int64_t) cparams.n_rs_seq + 1;
|
||||
|
||||
// t == K takes the window at the very end of conv_input and stores it in slot 0;
// smaller t takes windows starting earlier and stores them in higher slots
for (int64_t t = 1; t <= K; ++t) {
|
||||
// clamp to 0 so the window never starts before the beginning of conv_input
const int64_t s_idx = std::max<int64_t>(0, conv_input->ne[0] - conv_states->ne[0] - K + t);
|
||||
const int64_t s_slot = K - t;
|
||||
|
||||
ggml_tensor * conv_state_last =
|
||||
ggml_view_3d(ctx0, conv_input,
|
||||
conv_kernel_size - 1, conv_channels, n_seqs,
|
||||
conv_input->nb[1], conv_input->nb[2],
|
||||
ggml_row_size(conv_input->type, s_idx));
|
||||
|
||||
// each slot is offset by mem_size rows in the cache tensor
ggml_tensor * conv_state_update =
|
||||
ggml_view_2d(ctx0,
|
||||
conv_states_all, row_count, n_seqs,
|
||||
conv_states_all->nb[1],
|
||||
(s_slot * mem_size + kv_head) * row_size);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update));
|
||||
}
|
||||
}
|
||||
|
||||
return conv_input;
|
||||
}
|
||||
|
||||
// Run the delta-net recurrence for one layer and schedule the write-back of the
// resulting recurrent state(s) into the cache tensor ssm_states_all.
//   inp            - recurrent-state graph input; its mctx supplies the cache head and size
//   ssm_states_all - cache tensor holding the recurrent states of all cells
//   q, k, v, g, b  - delta-net operands, forwarded unchanged to the delta-net builder/operator
//   s              - current state; its ne[0], ne[2], ne[3] are read as S_v, H_v, n_seqs
//   il             - layer index forwarded to the cb() naming callback
// Returns the attention output tensor. With cparams.n_rs_seq == 0 only the final
// state is stored; otherwise n_rs_seq + 1 state snapshots are stored, one per slot.
ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * ssm_states_all,
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il) {
|
||||
const auto * mctx_cur = inp->mctx;
|
||||
const auto kv_head = mctx_cur->get_head();
|
||||
const uint32_t mem_size = mctx_cur->get_size();
|
||||
|
||||
// dimensions are taken from the state tensor (s) and from q
const int64_t S_v = s->ne[0];
|
||||
const int64_t H_v = s->ne[2];
|
||||
const int64_t n_seqs = s->ne[3];
|
||||
const int64_t n_seq_tokens = q->ne[2];
|
||||
|
||||
// keep == true means rollback snapshots are requested
const bool keep = cparams.n_rs_seq > 0;
|
||||
|
||||
if (!keep) {
|
||||
// no snapshots: use the regular delta-net builder and store only the new state
auto attn_out = build_delta_net(q, k, v, g, b, s, il);
|
||||
ggml_tensor * output = attn_out.first;
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
cb(output, "attn_output", il);
|
||||
cb(new_state, "new_state", il);
|
||||
|
||||
// copy the new state into the cache rows starting at cell kv_head
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0, new_state,
|
||||
ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
|
||||
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
// D: elements of one sequence's state; K: number of snapshots (n_rs_seq + 1)
const int64_t D = S_v * S_v * H_v;
|
||||
const int64_t K = cparams.n_rs_seq + 1;
|
||||
|
||||
// TODO: remove pad + simplify
|
||||
// flatten the state to (D, 1, n_seqs), then pad dim 1 by K - 1 so it becomes (D, K, n_seqs)
ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
|
||||
ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0);
|
||||
|
||||
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad);
|
||||
// the callback name distinguishes multi-token from single-token ubatches
if (n_seq_tokens > 1) {
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
} else {
|
||||
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il);
|
||||
}
|
||||
|
||||
// gdn_out is read as: attn_score_elems output elements, then K blocks of state_size_per_snap
const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs;
|
||||
const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs;
|
||||
|
||||
// attention output: the leading region of gdn_out viewed as (S_v, H_v, n_seq_tokens, n_seqs)
ggml_tensor * output = ggml_view_4d(ctx0, gdn_out,
|
||||
S_v, H_v, n_seq_tokens, n_seqs,
|
||||
ggml_row_size(gdn_out->type, S_v),
|
||||
ggml_row_size(gdn_out->type, S_v * H_v),
|
||||
ggml_row_size(gdn_out->type, S_v * H_v * n_seq_tokens),
|
||||
0);
|
||||
cb(output, "attn_output", il);
|
||||
|
||||
// bytes of one cache row (one sequence's state)
const size_t row_size = hparams.n_embd_s() * ggml_element_size(ssm_states_all);
|
||||
for (int64_t k_i = 0; k_i < K; ++k_i) {
|
||||
// snapshot k_i goes to slot K - 1 - k_i, so the last snapshot lands in slot 0
// NOTE(review): presumably the last snapshot is the most recent state — confirm against ggml_gated_delta_net
const uint32_t cache_slot = (uint32_t) (K - 1 - k_i);
|
||||
// k_i-th state block of gdn_out viewed as (S_v, S_v, H_v, n_seqs)
ggml_tensor * src = ggml_view_4d(ctx0, gdn_out,
|
||||
S_v, S_v, H_v, n_seqs,
|
||||
ggml_row_size(gdn_out->type, S_v),
|
||||
ggml_row_size(gdn_out->type, S_v * S_v),
|
||||
ggml_row_size(gdn_out->type, S_v * S_v * H_v),
|
||||
ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap));
|
||||
|
||||
// destination rows: slot stride is mem_size rows, starting at cell kv_head
ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all,
|
||||
hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
|
||||
((size_t) cache_slot * mem_size + kv_head) * row_size);
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -128,7 +128,7 @@ llama_model_dream::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_grap
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -155,7 +155,7 @@ llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -237,7 +237,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_para
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -163,7 +163,7 @@ llama_model_exaone4::graph<iswa>::graph(const llama_model & model, const llm_gra
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -200,7 +200,7 @@ llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_p
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -152,7 +152,7 @@ llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_para
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -163,7 +163,7 @@ llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_para
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
// final logit soft-capping
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||
|
|
|
|||
|
|
@ -207,7 +207,7 @@ llama_model_gemma3::graph<iswa>::graph(const llama_model & model, const llm_grap
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
if (hparams.f_final_logit_softcapping) {
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||
|
|
|
|||
|
|
@ -296,7 +296,7 @@ llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_par
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
{
|
||||
// final logit soft-capping
|
||||
|
|
|
|||
|
|
@ -380,7 +380,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
if (hparams.f_final_logit_softcapping) {
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
|
||||
|
|
|
|||
|
|
@ -275,7 +275,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -185,7 +185,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params
|
|||
res->t_embd = cur;
|
||||
|
||||
// Output projection
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -209,7 +209,7 @@ llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_par
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -186,7 +186,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
// For Granite architectures - scale logits
|
||||
if (hparams.f_logit_scale) {
|
||||
|
|
|
|||
|
|
@ -145,7 +145,7 @@ llama_model_granite::graph::graph(
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
// For Granite architectures - scale logits
|
||||
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
|
||||
|
|
|
|||
|
|
@ -206,7 +206,7 @@ llama_model_grok::graph::graph(const llama_model & model, const llm_graph_params
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
|
||||
|
||||
|
|
|
|||
|
|
@ -184,7 +184,7 @@ llama_model_grovemoe::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ llama_model_hunyuan_moe::graph::graph(const llama_model & model, const llm_graph
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ llama_model_hunyuan_vl::graph::graph(const llama_model & model, const llm_graph_
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
|
|
|
|||
|
|
@ -129,7 +129,7 @@ llama_model_internlm2::graph::graph(const llama_model & model, const llm_graph_p
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -123,7 +123,7 @@ llama_model_jais::graph::graph(const llama_model & model, const llm_graph_params
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -152,7 +152,7 @@ llama_model_jais2::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// Output projection
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ llama_model_jamba::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -262,7 +262,7 @@ llama_model_lfm2::graph<iswa>::graph(const llama_model & model, const llm_graph_
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -153,7 +153,7 @@ llama_model_llada_moe::graph::graph(const llama_model & model, const llm_graph_p
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -147,7 +147,7 @@ llama_model_llada::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -235,7 +235,7 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
|
|||
|
||||
if constexpr (!embed) {
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -260,7 +260,7 @@ llama_model_llama4::graph<iswa>::graph(const llama_model & model, const llm_grap
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -141,7 +141,7 @@ llama_model_maincoder::graph::graph(const llama_model & model, const llm_graph_p
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -128,7 +128,7 @@ llama_model_mamba::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -231,7 +231,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -251,7 +251,7 @@ llama_model_minicpm3::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
cb(cur, "lmhead_scaling", -1);
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ llama_model_minimax_m2::graph::graph(const llama_model & model, const llm_graph_
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -222,7 +222,7 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ struct llm_build_delta_net_base : public llm_graph_context {
|
|||
ggml_tensor * s,
|
||||
int il);
|
||||
|
||||
// use the ggml_gated_delta_net fused operator
|
||||
// use the ggml_gated_delta_net fused operator (K=1; state has shape (D, 1, n_seqs))
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
|
|
@ -65,6 +65,29 @@ struct llm_build_delta_net_base : public llm_graph_context {
|
|||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
|
||||
// read conv state from cache, concat with qkv_mixed, write back (single slot or per-token)
|
||||
// qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs)
|
||||
ggml_tensor * build_conv_state(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * conv_states_all,
|
||||
ggml_tensor * qkv_mixed,
|
||||
int64_t conv_kernel_size,
|
||||
int64_t conv_channels,
|
||||
int il);
|
||||
|
||||
// run delta-net attention and write the new recurrent state(s) back to ssm_states_all
|
||||
// s: (head_v_dim, head_v_dim, num_v_heads, n_seqs); returns output: (head_v_dim, num_v_heads, n_seq_tokens, n_seqs)
|
||||
ggml_tensor * build_recurrent_attn(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_tensor * ssm_states_all,
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
};
|
||||
|
||||
struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
|
|
@ -1739,6 +1762,10 @@ struct llama_model_qwen35 : public llama_model_base {
|
|||
const llama_model & model;
|
||||
};
|
||||
|
||||
struct graph_mtp : public llm_graph_context {
|
||||
graph_mtp(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
|
|
@ -1781,6 +1808,10 @@ struct llama_model_qwen35moe : public llama_model_base {
|
|||
const llama_model & model;
|
||||
};
|
||||
|
||||
struct graph_mtp : public llm_graph_context {
|
||||
graph_mtp(const llama_model & model, const llm_graph_params & params);
|
||||
};
|
||||
|
||||
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -161,7 +161,7 @@ llama_model_mpt::graph::graph(const llama_model & model, const llm_graph_params
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -174,7 +174,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
||||
|
|
|
|||
|
|
@ -140,7 +140,7 @@ llama_model_nemotron::graph::graph(const llama_model & model, const llm_graph_pa
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -133,7 +133,7 @@ llama_model_olmo::graph::graph(const llama_model & model, const llm_graph_params
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -198,7 +198,7 @@ llama_model_olmo2::graph<iswa>::graph(const llama_model & model, const llm_graph
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -164,7 +164,7 @@ llama_model_olmoe::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -160,7 +160,7 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -162,7 +162,7 @@ llama_model_openelm::graph::graph(const llama_model & model, const llm_graph_par
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ llama_model_orion::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ llama_model_paddleocr::graph::graph(const llama_model & model, const llm_graph_p
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -148,7 +148,7 @@ llama_model_pangu_embed::graph::graph(const llama_model & model, const llm_graph
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
if (model.output_b != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, model.output_b);
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ llama_model_phi2::graph::graph(const llama_model & model, const llm_graph_params
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output_no_bias", -1);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.output_b);
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ llama_model_phi3::graph<iswa>::graph(const llama_model & model, const llm_graph_
|
|||
cb(cur, "result_norm", -1);
|
||||
res->t_embd = cur;
|
||||
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
if (model.output_b != nullptr) {
|
||||
cb(cur, "result_output_no_bias", -1);
|
||||
|
|
|
|||
|
|
@ -127,7 +127,7 @@ llama_model_plamo::graph::graph(const llama_model & model, const llm_graph_param
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@ -185,7 +185,7 @@ llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_para
|
|||
res->t_embd = cur;
|
||||
|
||||
// lm_head
|
||||
cur = build_lora_mm(model.output, cur);
|
||||
cur = build_lora_mm(model.output, cur, model.output_s);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
// Explicitly mark as output tensor to ensure proper backend assignment
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue