talk-llama : sync llama.cpp

This commit is contained in:
Georgi Gerganov 2026-06-15 09:15:48 +03:00
parent 0a3fa9ca17
commit 0ec0845110
25 changed files with 671 additions and 96 deletions

View File

@ -3,7 +3,6 @@
#include "llama-impl.h"
#include <map>
#include <set>
#include <vector>
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@ -128,6 +127,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_EAGLE3, "eagle3" },
{ LLM_ARCH_MISTRAL4, "mistral4" },
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
{ LLM_ARCH_MIMO2, "mimo2" },
@ -292,46 +292,51 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
{ LLM_KV_TARGET_LAYERS, "%s.target_layers" },
{ LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
{ LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
// sentence-transformers dense modules feature dims
{ LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
{ LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" },
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
{ LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
{ LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
{ LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
{ LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
{ LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
{ LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, "tokenizer.ggml.normalizer.strip_accents" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
{ LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" },
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@ -559,6 +564,10 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
{ LLM_TENSOR_FC, "fc" },
{ LLM_TENSOR_D2T, "d2t" },
};
// declare information about the model weight tensors:
@ -783,6 +792,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
// eagle3
{LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

View File

@ -141,6 +141,7 @@ enum llm_arch {
LLM_ARCH_KIMI_LINEAR,
LLM_ARCH_TALKIE,
LLM_ARCH_MELLUM,
LLM_ARCH_EAGLE3,
LLM_ARCH_UNKNOWN,
};
@ -314,6 +315,7 @@ enum llm_kv {
LLM_KV_TOKENIZER_RWKV,
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS,
LLM_KV_TOKENIZER_FIM_PRE_ID,
LLM_KV_TOKENIZER_FIM_SUF_ID,
LLM_KV_TOKENIZER_FIM_MID_ID,
@ -336,6 +338,10 @@ enum llm_kv {
LLM_KV_CLASSIFIER_OUTPUT_LABELS,
LLM_KV_TARGET_LAYERS,
LLM_KV_TARGET_HIDDEN_SIZE,
LLM_KV_NORM_BEFORE_RESIDUAL,
LLM_KV_SHORTCONV_L_CACHE,
LLM_KV_XIELU_ALPHA_N,
@ -566,8 +572,13 @@ enum llm_tensor {
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
LLM_TENSOR_MASKED_EMBD_CENTROIDS,
LLM_TENSOR_MASKED_EMBD_ORDERING,
LLM_TENSOR_FC,
LLM_TENSOR_D2T,
};
enum llm_tensor_layer {
LLM_TENSOR_LAYER_INPUT,
LLM_TENSOR_LAYER_REPEATING,

View File

@ -71,6 +71,9 @@ llama_context::llama_context(
cparams.no_perf = params.no_perf;
cparams.warmup = false;
cparams.embeddings_layer_inp.resize(hparams.n_layer(), false);
embd_layer_inp.resize(hparams.n_layer());
cparams.ctx_type = params.ctx_type;
cparams.pooling_type = params.pooling_type;
@ -91,12 +94,21 @@ llama_context::llama_context(
if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
if (params.ctx_other == nullptr) {
// TODO: change from runtime_error to llama_exception to avoid printing error message
throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)");
throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)");
}
cparams.ctx_other = params.ctx_other;
}
if (model.arch == LLM_ARCH_EAGLE3) {
if (model.tok_embd == nullptr || model.output == nullptr) {
if (params.ctx_other == nullptr) {
throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)");
}
cparams.ctx_other = params.ctx_other;
}
}
// Initialize backend samplers here so they are part of the sampling graph
// before the reserve passes run later in this function. This avoids a later
// re-reserve when graph nodes change.
@ -194,7 +206,7 @@ llama_context::llama_context(
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max;
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
@ -938,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) {
}
}
float * llama_context::get_embeddings_layer_inp(uint32_t lid) {
output_reorder();
GGML_ASSERT(lid < embd_layer_inp.size() && embd_layer_inp[lid].has_data());
return embd_layer_inp[lid].data;
}
llama_token llama_context::get_sampled_token_ith(int32_t idx) {
output_reorder();
@ -1125,6 +1145,17 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) {
cparams.embeddings_nextn_masked = masked;
}
void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
LLAMA_LOG_DEBUG("%s: lid = %d, enable = %d\n", __func__, lid, enable);
GGML_ASSERT(lid < model.hparams.n_layer());
cparams.embeddings_layer_inp[lid] = enable;
// note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected
sched_need_reserve = true;
}
void llama_context::set_causal_attn(bool value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
@ -1350,7 +1381,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;
const int64_t n_embd = hparams.n_embd_inp();
// eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
const int64_t n_embd = hparams.n_embd_inp();
const int64_t n_vocab = model.vocab.n_tokens();
// note: during encode, we always pass the full sequence starting from pos = 0
@ -1925,6 +1957,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
}
}
extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens);
// extract nextn embeddings before
// only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
{
@ -2029,6 +2063,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_batch = cparams.n_batch;
const auto n_vocab = vocab.n_tokens();
const auto n_embd = hparams.n_embd;
const auto n_embd_out = hparams.n_embd_out();
bool has_logits = true;
@ -2041,9 +2076,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
has_embd = true;
}
size_t backend_float_count = 0;
size_t backend_token_count = 0;
size_t embd_layer_inp_float_count = 0;
logits.size = has_logits ? n_vocab*n_outputs_max : 0;
embd.size = has_embd ? n_embd_out*n_outputs_max : 0;
@ -2055,6 +2090,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
embd_nextn.size = (size_t) n_embd_out * n_batch;
}
for (bool enabled : cparams.embeddings_layer_inp) {
if (enabled) {
embd_layer_inp_float_count += (size_t) n_embd * n_batch;
}
}
// Allocate backend sampling output buffers if there are backend samplers configured.
const bool has_sampling = !sampling.samplers.empty();
if (has_sampling) {
@ -2069,8 +2110,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
const size_t new_size =
(logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) +
( backend_token_count) * sizeof(llama_token);
(logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) +
( backend_token_count) * sizeof(llama_token);
// alloc only when more than the current capacity is required
// TODO: also consider shrinking the buffer
@ -2087,6 +2128,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
logits.data = nullptr;
embd.data = nullptr;
embd_nextn.data = nullptr;
for (auto & layer_inp : embd_layer_inp) {
layer_inp = {nullptr, 0};
}
}
auto * buft = ggml_backend_cpu_buffer_type();
@ -2118,6 +2162,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
embd_nextn = has_embd_nextn ? buffer_view<float>{(float *) (base + offset), embd_nextn.size} : buffer_view<float>{nullptr, 0};
offset += embd_nextn.size * sizeof(float);
for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) {
if (cparams.embeddings_layer_inp[il]) {
embd_layer_inp[il] = buffer_view<float>{(float *) (base + offset), (size_t) n_embd * n_batch};
offset += embd_layer_inp[il].size * sizeof(float);
} else {
embd_layer_inp[il] = buffer_view<float>{nullptr, 0};
}
}
if (has_sampling) {
sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
offset += sampling.logits.size * sizeof(float);
@ -2164,6 +2217,34 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
return n_outputs_max;
}
void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) {
for (uint32_t il = 0; il < cparams.embeddings_layer_inp.size(); ++il) {
if (!cparams.embeddings_layer_inp[il]) {
continue;
}
if (!embd_layer_inp[il].has_data()) {
GGML_ABORT("output layer input buffer not allocated");
}
ggml_tensor * t = res->get_layer_inp((int) il);
if (!t) {
GGML_ABORT("layer input tensor not found");
}
const size_t nbytes = ggml_nbytes(t);
const size_t nfloats = nbytes / sizeof(float);
GGML_ASSERT(n_tokens > 0);
GGML_ASSERT(nfloats % n_tokens == 0);
const size_t row_floats = nfloats / n_tokens;
const size_t dst_offset = token_offset * row_floats;
GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size);
ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t);
GGML_ASSERT(backend != nullptr);
ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes);
}
}
void llama_context::output_reorder() {
const uint64_t n_vocab = model.vocab.n_tokens();
const uint64_t n_embd = model.hparams.n_embd;
@ -2190,6 +2271,16 @@ void llama_context::output_reorder() {
}
}
if (embd_layer_inp.size() > 0) {
for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) {
if (embd_layer_inp[lid].size > 0) {
for (uint64_t k = 0; k < n_embd; ++k) {
std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]);
}
}
}
}
if (!sampling.samplers.empty()) {
assert(sampling.logits.size > 0);
assert(sampling.probs.size > 0);
@ -3604,6 +3695,10 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) {
ctx->set_embeddings_nextn(value, masked);
}
void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) {
ctx->set_embeddings_layer_inp(lid, value);
}
llama_memory_t llama_get_memory(const struct llama_context * ctx) {
if (!ctx) {
return nullptr;
@ -3624,6 +3719,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) {
return ctx->get_embeddings_nextn_ith(i);
}
float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) {
ctx->synchronize();
return ctx->get_embeddings_layer_inp(lid);
}
bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
return ctx->set_sampler(seq_id, smpl);
}

View File

@ -88,6 +88,8 @@ struct llama_context {
float * get_embeddings_nextn();
float * get_embeddings_nextn_ith(int32_t i);
float * get_embeddings_layer_inp(uint32_t lid);
llama_token * get_sampled_tokens() const;
llama_token get_sampled_token_ith(int32_t idx);
@ -112,6 +114,7 @@ struct llama_context {
void set_embeddings (bool value);
void set_embeddings_nextn(bool value, bool masked);
void set_embeddings_layer_inp(uint32_t lid, bool enable);
void set_causal_attn(bool value);
void set_warmup(bool value);
@ -226,6 +229,10 @@ private:
// map the output row index `i` to batch index
int64_t output_resolve_row(int32_t i) const;
// async-copy enabled layer-input tensors (per cparams.output_layer_inp)
// from backend into host-side embd_layer_inp buffers
void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens);
//
// graph
//
@ -288,6 +295,10 @@ private:
// sets llm_graph_result::t_h_nextn
buffer_view<float> embd_nextn = {nullptr, 0};
// host buffers for output layer input embeddings, per layer
// populated when cparams.output_layer_inp[il] is true
std::vector<buffer_view<float>> embd_layer_inp;
struct sampling_info {
// !samplers.empty() to check if any samplers are active
std::map<llama_seq_id, llama_sampler *> samplers;

View File

@ -3,6 +3,7 @@
#include "llama.h"
#include <cstdint>
#include <vector>
#define LLAMA_MAX_SEQ 256
@ -44,6 +45,8 @@ struct llama_cparams {
bool kv_unified;
bool pipeline_parallel;
std::vector<bool> embeddings_layer_inp; // [n_layer()] extract input embeddings for layer
enum llama_context_type ctx_type;
enum llama_pooling_type pooling_type;

View File

@ -101,4 +101,20 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
// LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);
// Set whether the context outputs the input embeddings of a specific layer
LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value);
// mirrors:
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid);
LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
//
// model/context data extraction
//
// returns pointer to the target-model layer indices
LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model);
// returns the number of extracted layers from target model
LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model);

View File

@ -567,7 +567,10 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
}
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
// the kq mask guards on its own buffer: shared cells leave idxs unbacked while the mask stays live
if (self_kq_mask && self_kq_mask->buffer) {
mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
@ -575,7 +578,9 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
}
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}
if (self_k_rot) {
mctx->get_base()->set_input_k_rot(self_k_rot);
@ -607,7 +612,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
}
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
if (self_kq_mask && self_kq_mask->buffer) {
res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
}
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
@ -615,7 +622,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
}
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
}
return res;
}
@ -895,6 +904,10 @@ void llm_graph_result::reset() {
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;
t_layer_inp.resize(LLAMA_MAX_LAYERS);
std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
t_sampled.clear();
t_sampled_probs.clear();
t_sampled_logits.clear();
@ -923,7 +936,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
}
}
void llm_graph_result::set_outputs() {
void llm_graph_result::set_outputs(const llm_graph_params & params) {
if (t_logits != nullptr) {
ggml_set_output(t_logits);
}
@ -936,6 +949,15 @@ void llm_graph_result::set_outputs() {
if (t_h_nextn != nullptr) {
ggml_set_output(t_h_nextn);
}
{
const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp;
for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) {
if (embeddings_layer_inp[il]) {
GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null");
ggml_set_output(t_layer_inp[il]);
}
}
}
for (auto & [seq_id, t] : t_sampled) {
if (t != nullptr) {
ggml_set_output(t);
@ -1864,9 +1886,9 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
res->t_inp_embd = cur;
// For Granite architecture
// NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be
// multimodal inputs that should not be scaled.
if (ubatch.token && hparams.f_embedding_scale != 0.0f) {
// NOTE: For deepstack models, only apply scale to token inputs (ie text-only input).
// Raw embeddings are assumed to be multimodal inputs that should not be scaled.
if (hparams.f_embedding_scale != 0.0f && (ubatch.token || hparams.n_deepstack_layers == 0)) {
if (!ggml_is_contiguous(cur)) {
cur = ggml_cont(ctx0, cur);
}

View File

@ -705,6 +705,8 @@ public:
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
ggml_tensor * get_h_nextn() const { return t_h_nextn; }
ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; }
ggml_cgraph * get_gf() const { return gf; }
ggml_context * get_ctx() const { return ctx_compute.get(); }
@ -713,7 +715,7 @@ public:
void reset();
void set_inputs(const llama_ubatch * ubatch);
void set_outputs();
void set_outputs(const llm_graph_params & params);
// try to update the existing graph result using the new graph parameters in order to reuse it
// this can only be done if we determine that the resulting graph using the new graph parameters
@ -734,10 +736,12 @@ public:
ggml_tensor * t_embd_pooled = nullptr;
ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm
std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
std::map<llama_seq_id, ggml_tensor*> t_candidates;
std::map<llama_seq_id, ggml_tensor*> t_sampled;
std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
std::vector<ggml_tensor *> t_layer_inp;
std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
std::map<llama_seq_id, ggml_tensor *> t_candidates;
std::map<llama_seq_id, ggml_tensor *> t_sampled;
std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
std::vector<llm_graph_input_ptr> inputs;

View File

@ -45,6 +45,7 @@ struct llama_hparams {
bool rope_finetuned;
bool use_par_res;
bool swin_norm;
bool norm_before_residual = false;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;

View File

@ -394,6 +394,7 @@ namespace GGUFMeta {
template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
template bool llama_model_loader::get_arr<std::array<int32_t, 512>>(enum llm_kv kid, std::array<int32_t, 512> & result, bool required);
template bool llama_model_loader::get_arr<std::vector<int32_t>>(enum llm_kv kid, std::vector<int32_t> & result, bool required);
template<typename T>
bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {

View File

@ -287,6 +287,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
return new llama_model_qwen35moe(params);
case LLM_ARCH_MISTRAL3:
return new llama_model_mistral3(params);
case LLM_ARCH_EAGLE3:
return new llama_model_eagle3(params);
case LLM_ARCH_MIMO2:
return new llama_model_mimo2(params);
case LLM_ARCH_KIMI_LINEAR:
@ -2238,7 +2240,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// TODO: move reranking logic here and generalize
llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
llm->res->set_outputs();
llm->res->set_outputs(params);
return llm->res->get_gf();
}
@ -2406,6 +2408,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_EAGLE3:
case LLM_ARCH_MISTRAL4:
case LLM_ARCH_LLAMA_EMBED:
case LLM_ARCH_MAINCODER:
@ -2600,8 +2603,9 @@ uint64_t llama_model_n_params(const llama_model * model) {
bool llama_model_has_encoder(const llama_model * model) {
switch (model->arch) {
case LLM_ARCH_T5: return true;
case LLM_ARCH_T5ENCODER: return true;
case LLM_ARCH_T5:
case LLM_ARCH_T5ENCODER:
case LLM_ARCH_EAGLE3: return true;
default: return false;
}
}
@ -2687,3 +2691,12 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
}
}
const int32_t * llama_model_target_layer_ids(const struct llama_model * model) {
const auto & v = model->target_layer_ids;
return v.empty() ? nullptr : v.data();
}
uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) {
return (uint32_t) model->target_layer_ids.size();
}

View File

@ -569,6 +569,13 @@ struct llama_model {
struct ggml_tensor * per_layer_model_proj = nullptr;
struct ggml_tensor * per_layer_proj_norm = nullptr;
// eagle3
struct ggml_tensor * fc = nullptr; // feature fusion layer
struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping
// unified vector to store target-model extracted layer ids in eagle3, dflash, etc.
std::vector<int32_t> target_layer_ids;
std::vector<llama_layer> layers;
//Dense linear projections for SentenceTransformers models like embeddinggemma

View File

@ -764,7 +764,7 @@ struct llm_tokenizer_wpm_session {
void tokenize(const std::string & text, std::vector<llama_token> & output) {
// normalize and split by whitespace
std::vector<std::string> words = preprocess(text, vocab.get_normalizer_lowercase());
std::vector<std::string> words = preprocess(text, vocab.get_normalizer_opts());
// bos token prepended already
// find the longest tokens that form the words
@ -809,11 +809,14 @@ struct llm_tokenizer_wpm_session {
}
// TODO: reduce string copies by using cpts_offs array
static std::vector<std::string> preprocess(const std::string & text, bool lowercase) {
const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
static std::vector<std::string> preprocess(const std::string & text, const llama_vocab::normalizer_options & normalizer_opts) {
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
if (normalizer_opts.strip_accents) {
cpts = unicode_cpts_normalize_nfd(cpts);
}
std::vector<std::string> words(1, "");
for (const uint32_t cpt : cpts_nfd) {
for (const uint32_t cpt : cpts) {
const auto flags = unicode_cpt_flags_from_cpt(cpt);
if (flags.is_whitespace) {
@ -828,7 +831,11 @@ struct llm_tokenizer_wpm_session {
continue;
}
const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt);
if (normalizer_opts.strip_accents && flags.is_accent_mark) {
continue;
}
const std::string s = unicode_cpt_to_utf8(normalizer_opts.lowercase ? unicode_tolower(cpt) : cpt);
if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
if (words.back().size()) { // finish previous word if any
words.emplace_back();
@ -1692,7 +1699,7 @@ struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session {
llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
void tokenize(const std::string & text, std::vector<llama_token> & output) override {
const bool lowercase = vocab.get_normalizer_lowercase();
const bool lowercase = vocab.get_normalizer_opts().lowercase;
std::string segment;
auto flush = [&]() {
@ -1797,7 +1804,9 @@ struct llama_vocab::impl {
bool remove_extra_whitespaces = false;
bool escape_whitespaces = true;
bool treat_whitespace_as_suffix = false;
bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json)
// BertNormalizer options
llama_vocab::normalizer_options normalizer_opts;
std::unordered_map<std::string, llama_token> token_to_id;
std::vector<token_data> id_to_token;
@ -2172,7 +2181,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
} else if (
tokenizer_pre == "whitespace") {
pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE;
normalizer_lowercase = false;
normalizer_opts.lowercase = false;
} else if (
tokenizer_pre == "refact") {
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@ -2532,8 +2541,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}
}
// Lowercase normalizer flag (consulted by WPM / whitespace BPE)
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false);
// BertNormalizer options
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_opts.lowercase, false);
normalizer_opts.strip_accents = normalizer_opts.lowercase;
ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, normalizer_opts.strip_accents, false);
// suppress tokens
{
@ -3969,8 +3980,8 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const {
return pimpl->treat_whitespace_as_suffix;
}
bool llama_vocab::get_normalizer_lowercase() const {
return pimpl->normalizer_lowercase;
const llama_vocab::normalizer_options & llama_vocab::get_normalizer_opts() const {
return pimpl->normalizer_opts;
}
const std::vector<llama_token> & llama_vocab::get_suppress_tokens() const {

View File

@ -76,6 +76,12 @@ struct llama_vocab {
llama_token_attr attr;
};
struct normalizer_options {
bool lowercase = true;
bool strip_accents = true;
// TODO: clean_text, handle_chinese_chars
};
llama_vocab();
~llama_vocab();
@ -141,7 +147,7 @@ struct llama_vocab {
bool get_remove_extra_whitespaces () const;
bool get_escape_whitespaces () const;
bool get_treat_whitespace_as_suffix() const;
bool get_normalizer_lowercase () const;
const normalizer_options & get_normalizer_opts() const;
const std::vector<llama_token> & get_suppress_tokens() const;

View File

@ -398,9 +398,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
// K=1 (final state only): reshape to 3D (S_v*S_v*H_v, 1, n_seqs) for ggml_gated_delta_net.
ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, S_v * S_v * H_v, 1, n_seqs);
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d);
// K=1: output carries the final state only. state s is 4D [S_v, S_v, H_v, n_seqs].
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s, /*K=*/1);
if (n_tokens == 1) {
cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
} else {
@ -564,11 +563,8 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
const int64_t D = S_v * S_v * H_v;
const int64_t K = cparams.n_rs_seq + 1;
// TODO: remove pad + simplify
ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs);
ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0);
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad);
// state s is 4D [S_v, S_v, H_v, n_seqs]; K snapshot slots are written into the output.
ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s, K);
if (n_seq_tokens > 1) {
cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il);
} else {
@ -587,21 +583,24 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn(
cb(output, "attn_output", il);
const size_t row_size = hparams.n_embd_s() * ggml_element_size(ssm_states_all);
for (int64_t k_i = 0; k_i < K; ++k_i) {
const uint32_t cache_slot = (uint32_t) (K - 1 - k_i);
ggml_tensor * src = ggml_view_4d(ctx0, gdn_out,
S_v, S_v, H_v, n_seqs,
ggml_row_size(gdn_out->type, S_v),
ggml_row_size(gdn_out->type, S_v * S_v),
ggml_row_size(gdn_out->type, S_v * S_v * H_v),
ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap));
ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all,
hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1],
((size_t) cache_slot * mem_size + kv_head) * row_size);
// op writes the last min(n_seq_tokens, K) snapshots; trailing slots are left unwritten
const int64_t n_written = std::min<int64_t>(n_seq_tokens, K);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
}
// write the produced snapshots into the recurrent cache (snapshot slot i -> rollback group i)
ggml_tensor * src = ggml_view_3d(ctx0, gdn_out,
D, n_seqs, n_written,
ggml_row_size(gdn_out->type, D),
ggml_row_size(gdn_out->type, state_size_per_snap),
ggml_row_size(gdn_out->type, attn_score_elems));
ggml_tensor * dst = ggml_view_3d(ctx0, ssm_states_all,
D, n_seqs, n_written,
ssm_states_all->nb[1],
(size_t) mem_size * row_size,
(size_t) kv_head * row_size);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst));
return output;
}

View File

@ -0,0 +1,323 @@
#include "models.h"
void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata");
}
if (target_layer_ids.size() != 3) {
throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'");
}
LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__,
target_layer_ids[0],
target_layer_ids[1],
target_layer_ids[2]);
uint32_t n_embd_tgt = 0;
ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
// eagle3 norm_before_residual (optional, default false)
// compatible with Readhat eagle3 speculator model
ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
if (hparams.norm_before_residual) {
LLAMA_LOG_INFO("%s: EAGLE3gnorm_before_residual = true\n", __func__);
}
type = LLM_TYPE_UNKNOWN;
}
void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
LLAMA_LOAD_LOCALS;
const int64_t n_embd_inp = hparams.n_embd_inp();
const int64_t n_embd_attn_input = 2 * n_embd;
// Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
// d2t: draft to target vocabulary mapping
int64_t n_draft_vocab = n_vocab; // Default: same as target vocab
const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t");
if (d2t_meta) {
n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size
d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0);
LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
} else {
d2t = nullptr; // no d2t, use default vocab size
LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
}
// Feature fusion layer: projects 3 target layers to draft hidden size
fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0);
// Output layer (uses draft vocab size)
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED);
// Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own)
const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
if (tok_embd_meta) {
const int64_t n_target_vocab = tok_embd_meta->ne[1];
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0);
LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab);
}
// Single decoder layer
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
// input_layernorm: applied to token embeddings
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
// eagle3 specific: hidden_norm applied to fused target features
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
// Attention takes input_embeds_normed + fused_target_normed as input
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
// rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling)
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED);
}
}
std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER:
return std::make_unique<graph<true>>(*this, params);
case LLM_GRAPH_TYPE_DEFAULT:
case LLM_GRAPH_TYPE_DECODER:
return std::make_unique<graph<false>>(*this, params);
default:
GGML_ABORT("invalid graph type");
};
}
template <>
ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
ggml_tensor * cur = nullptr;
// Input: Target model features (3 layers concatenated: low, mid, high)
// Data will be provided via ubatch->embd in encode_eagle3_features()
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
ggml_set_input(inp_target->embd);
cur = inp_target->embd;
cb(cur, "inp_embd", -1);
res->add_input(std::move(inp_target));
return cur;
}
// eagle3 Encoder: processes target model features through feature fusion layer
// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high
// Output: g_embeddings e.g. [4096, n_tokens] stored in context
template <>
llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur = nullptr;
cur = build_inp_embd_enc();
// Feature fusion layer
cur = build_lora_mm(model.fc, cur);
cb(cur, "fc_out", -1);
// Output: g_embeddings e.g. [4096, n_tokens]
// store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft)
ggml_set_output(cur);
res->t_h_nextn = cur;
ggml_build_forward_expand(gf, cur);
}
// eagle3 Decoder: processes draft tokens using g_embeddings from encoder
// Input: draft tokens + g_embeddings from encoder
// Output: draft logits
template <>
llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer
ggml_tensor * cur;
ggml_tensor * inpL;
// eagle3 Decoder receives:
// 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
// 2. g_embeddings from encoder
auto * tok_embd = model.tok_embd;
if (model.tok_embd == nullptr) {
GGML_ASSERT(cparams.ctx_other != nullptr);
const auto * model_other = llama_get_model(cparams.ctx_other);
GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
tok_embd = model_other->tok_embd;
}
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
ggml_set_input(inp->tokens);
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
ggml_set_input(inp->embd);
ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
cb(inp_embd, "inp_embd", -1);
ggml_tensor * inp_g = inp->embd;
cb(inp_g, "inp_g_embeddings", -1);
res->add_input(std::move(inp));
inpL = inp_g;
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
// Single decoder layer (il = 0)
const int il = 0;
{
// Apply input_layernorm to the token embeddings
ggml_tensor * embd_norm = build_norm(inp_embd,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(embd_norm, "embd_norm", il);
// Apply hidden_norm to inp_g
ggml_tensor * g_norm = build_norm(inp_g,
model.layers[il].attn_norm_2, NULL,
LLM_NORM_RMS, -1);
cb(g_norm, "g_norm", il);
// norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model)
// - false (default): use raw inp_g for residual
// - true: use normalized g_norm for residual
// inpL is the concatenated input (normalized inp_embd + normalized inp_g)
ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;
// Concatenate normalized inp_embd and normalized inp_g
cur = ggml_concat(ctx0, embd_norm, g_norm, il);
cb(cur, "concat_embd", il);
// Self-attention with concatenated input
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// rope freq factors, returns nullptr if not available
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
// Add residual and update it
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// Apply FFN norm to the sum
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "post_attn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
// Output norm with residual
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "eagle3_prenorm", il);
inpL = cur;
}
cur = inpL;
// Output prenorm state (for next token's g_embeddings in autoregressive generation)
ggml_set_output(cur);
res->t_h_nextn = cur;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
// lm_head - projects to draft vocabulary
// if the draft has no own output projection, inherit the target model's lm_head
auto * output = model.output;
if (output == nullptr) {
GGML_ASSERT(cparams.ctx_other != nullptr);
const auto * model_other = llama_get_model(cparams.ctx_other);
GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
output = model_other->output;
}
cur = build_lora_mm(output, cur);
if (model.d2t) {
const int64_t n_draft_vocab = cur->ne[0];
const int64_t n_outputs = cur->ne[1];
const int64_t n_vocab = (int64_t) model.vocab.n_tokens();
GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);
ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
cur = ggml_set_rows(ctx0, logits,
ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs),
ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1));
cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
}
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}

View File

@ -39,6 +39,9 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_CENTROIDS, "weight"), {}, TENSOR_NOT_REQUIRED);
create_tensor(tn(LLM_TENSOR_MASKED_EMBD_ORDERING), {}, TENSOR_NOT_REQUIRED);
const int64_t n_embd_backbone = hparams.n_embd_inp();
nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0);

View File

@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
const int n_rot_l = hparams.n_rot(il);
res->t_layer_inp[il] = inpL;
// norm
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);

View File

@ -124,6 +124,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
ggml_tensor * inpSA = inpL;
// norm

View File

@ -46,7 +46,7 @@ struct llm_build_delta_net_base : public llm_graph_context {
ggml_tensor * s,
int il);
// use the ggml_gated_delta_net fused operator (K=1; state has shape (D, 1, n_seqs))
// use the ggml_gated_delta_net fused operator (K=1; state has shape [S_v, S_v, H_v, n_seqs])
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
ggml_tensor * q,
ggml_tensor * k,
@ -1089,6 +1089,21 @@ struct llama_model_glm_dsa : public llama_model_base {
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
};
struct llama_model_eagle3 : public llama_model_base {
llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {}
void load_arch_hparams(llama_model_loader & ml) override;
void load_arch_tensors(llama_model_loader & ml) override;
template <bool is_enc>
struct graph : public llm_graph_context {
graph(const llama_model & model, const llm_graph_params & params);
ggml_tensor * build_inp_embd_enc() const;
};
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
};
struct llama_model_mistral4 : public llama_model_deepseek2 {
llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {}

View File

@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

View File

@ -11,6 +11,10 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
// Load attention parameters
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);
for (uint32_t i = 0; i < hparams.n_layer(); ++i) {
hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0;
}
@ -273,7 +277,7 @@ ggml_tensor * llama_model_plamo2::graph::build_plamo2_mamba_layer(llm_graph_inpu
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(d_inner % n_head == 0);
GGML_ASSERT(d_inner % n_heads == 0);
GGML_ASSERT(n_group == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);

View File

@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
ggml_tensor * inpSA = inpL;
// norm

View File

@ -173,7 +173,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
}
if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

View File

@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
res->t_layer_inp[il] = inpL;
ggml_tensor * inpSA = inpL;
// norm