From 0ec0845110dc934911dc48e8c5beb5ad3189b3f3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 15 Jun 2026 09:15:48 +0300 Subject: [PATCH] talk-llama : sync llama.cpp --- examples/talk-llama/llama-arch.cpp | 88 +++-- examples/talk-llama/llama-arch.h | 11 + examples/talk-llama/llama-context.cpp | 113 +++++- examples/talk-llama/llama-context.h | 11 + examples/talk-llama/llama-cparams.h | 3 + examples/talk-llama/llama-ext.h | 16 + examples/talk-llama/llama-graph.cpp | 38 ++- examples/talk-llama/llama-graph.h | 14 +- examples/talk-llama/llama-hparams.h | 1 + examples/talk-llama/llama-model-loader.cpp | 1 + examples/talk-llama/llama-model.cpp | 19 +- examples/talk-llama/llama-model.h | 7 + examples/talk-llama/llama-vocab.cpp | 35 +- examples/talk-llama/llama-vocab.h | 8 +- examples/talk-llama/models/delta-net-base.cpp | 41 ++- examples/talk-llama/models/eagle3.cpp | 323 ++++++++++++++++++ .../talk-llama/models/gemma4-assistant.cpp | 3 + examples/talk-llama/models/gemma4.cpp | 2 + examples/talk-llama/models/llama.cpp | 2 + examples/talk-llama/models/models.h | 17 +- examples/talk-llama/models/openai-moe.cpp | 2 + examples/talk-llama/models/plamo2.cpp | 6 +- examples/talk-llama/models/qwen3.cpp | 2 + examples/talk-llama/models/qwen35.cpp | 2 +- examples/talk-llama/models/qwen3moe.cpp | 2 + 25 files changed, 671 insertions(+), 96 deletions(-) create mode 100644 examples/talk-llama/models/eagle3.cpp diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index 6a5d5f8d2..9f93d5bc7 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -3,7 +3,6 @@ #include "llama-impl.h" #include -#include #include static const std::map LLM_ARCH_NAMES = { @@ -128,6 +127,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_EAGLE3, "eagle3" }, { LLM_ARCH_MISTRAL4, "mistral4" }, { LLM_ARCH_PADDLEOCR, "paddleocr" }, { LLM_ARCH_MIMO2, "mimo2" }, @@ -292,46 +292,51 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_TARGET_LAYERS, "%s.target_layers" }, + { LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, - { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" }, - { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, - { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, + { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" }, + { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, + { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, - { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, - { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, - { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, - { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, - { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, - { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, - { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, - { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, - { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, - { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, - { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, - { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, - { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, - { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, - { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" }, - { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, - { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, - { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, - { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, - { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, - { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" }, - { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" }, - { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, - { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, - { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, - { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, - { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, - { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, + { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, + { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" }, + { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" }, + { LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, "tokenizer.ggml.normalizer.strip_accents" }, + { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, + { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, + { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, + { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, + { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, + { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, + { LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -559,6 +564,10 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" }, { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" }, { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, + { LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" }, + { LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" }, + { LLM_TENSOR_FC, "fc" }, + { LLM_TENSOR_D2T, "d2t" }, }; // declare information about the model weight tensors: @@ -783,6 +792,11 @@ static const std::map LLM_TENSOR_INFOS = { // latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}}, + {LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}}, + // eagle3 + {LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index 03b1a265d..c5245fb58 100644 --- a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -141,6 +141,7 @@ enum llm_arch { LLM_ARCH_KIMI_LINEAR, LLM_ARCH_TALKIE, LLM_ARCH_MELLUM, + LLM_ARCH_EAGLE3, LLM_ARCH_UNKNOWN, }; @@ -314,6 +315,7 @@ enum llm_kv { LLM_KV_TOKENIZER_RWKV, LLM_KV_TOKENIZER_CHAT_TEMPLATE, LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, + LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, LLM_KV_TOKENIZER_FIM_PRE_ID, LLM_KV_TOKENIZER_FIM_SUF_ID, LLM_KV_TOKENIZER_FIM_MID_ID, @@ -336,6 +338,10 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_TARGET_LAYERS, + LLM_KV_TARGET_HIDDEN_SIZE, + LLM_KV_NORM_BEFORE_RESIDUAL, + LLM_KV_SHORTCONV_L_CACHE, LLM_KV_XIELU_ALPHA_N, @@ -566,8 +572,13 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_MASKED_EMBD_CENTROIDS, + LLM_TENSOR_MASKED_EMBD_ORDERING, + LLM_TENSOR_FC, + LLM_TENSOR_D2T, }; + enum llm_tensor_layer { LLM_TENSOR_LAYER_INPUT, LLM_TENSOR_LAYER_REPEATING, diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp index 9a40c4366..168dbabd7 100644 --- a/examples/talk-llama/llama-context.cpp +++ b/examples/talk-llama/llama-context.cpp @@ -71,6 +71,9 @@ llama_context::llama_context( cparams.no_perf = params.no_perf; cparams.warmup = false; + cparams.embeddings_layer_inp.resize(hparams.n_layer(), false); + embd_layer_inp.resize(hparams.n_layer()); + cparams.ctx_type = params.ctx_type; cparams.pooling_type = params.pooling_type; @@ -91,12 +94,21 @@ llama_context::llama_context( if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { if (params.ctx_other == nullptr) { // TODO: change from runtime_error to llama_exception to avoid printing error message - throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)"); + throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)"); } cparams.ctx_other = params.ctx_other; } + if (model.arch == LLM_ARCH_EAGLE3) { + if (model.tok_embd == nullptr || model.output == nullptr) { + if (params.ctx_other == nullptr) { + throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)"); + } + cparams.ctx_other = params.ctx_other; + } + } + // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. @@ -194,7 +206,7 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max; + cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max; cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; @@ -938,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) { } } +float * llama_context::get_embeddings_layer_inp(uint32_t lid) { + output_reorder(); + + GGML_ASSERT(lid < embd_layer_inp.size() && embd_layer_inp[lid].has_data()); + + return embd_layer_inp[lid].data; +} + llama_token llama_context::get_sampled_token_ith(int32_t idx) { output_reorder(); @@ -1125,6 +1145,17 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) { cparams.embeddings_nextn_masked = masked; } +void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) { + LLAMA_LOG_DEBUG("%s: lid = %d, enable = %d\n", __func__, lid, enable); + + GGML_ASSERT(lid < model.hparams.n_layer()); + + cparams.embeddings_layer_inp[lid] = enable; + + // note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected + sched_need_reserve = true; +} + void llama_context::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); @@ -1350,7 +1381,8 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); + // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim + const int64_t n_embd = hparams.n_embd_inp(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -1925,6 +1957,8 @@ int llama_context::decode(const llama_batch & batch_inp) { } } + extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens); + // extract nextn embeddings before // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. { @@ -2029,6 +2063,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); bool has_logits = true; @@ -2041,9 +2076,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { has_embd = true; } - size_t backend_float_count = 0; size_t backend_token_count = 0; + size_t embd_layer_inp_float_count = 0; logits.size = has_logits ? n_vocab*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; @@ -2055,6 +2090,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd_nextn.size = (size_t) n_embd_out * n_batch; } + for (bool enabled : cparams.embeddings_layer_inp) { + if (enabled) { + embd_layer_inp_float_count += (size_t) n_embd * n_batch; + } + } + // Allocate backend sampling output buffers if there are backend samplers configured. const bool has_sampling = !sampling.samplers.empty(); if (has_sampling) { @@ -2069,8 +2110,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = - (logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) + - ( backend_token_count) * sizeof(llama_token); + (logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) + + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -2087,6 +2128,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits.data = nullptr; embd.data = nullptr; embd_nextn.data = nullptr; + for (auto & layer_inp : embd_layer_inp) { + layer_inp = {nullptr, 0}; + } } auto * buft = ggml_backend_cpu_buffer_type(); @@ -2118,6 +2162,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd_nextn = has_embd_nextn ? buffer_view{(float *) (base + offset), embd_nextn.size} : buffer_view{nullptr, 0}; offset += embd_nextn.size * sizeof(float); + for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) { + if (cparams.embeddings_layer_inp[il]) { + embd_layer_inp[il] = buffer_view{(float *) (base + offset), (size_t) n_embd * n_batch}; + offset += embd_layer_inp[il].size * sizeof(float); + } else { + embd_layer_inp[il] = buffer_view{nullptr, 0}; + } + } + if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; offset += sampling.logits.size * sizeof(float); @@ -2164,6 +2217,34 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } +void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) { + for (uint32_t il = 0; il < cparams.embeddings_layer_inp.size(); ++il) { + if (!cparams.embeddings_layer_inp[il]) { + continue; + } + if (!embd_layer_inp[il].has_data()) { + GGML_ABORT("output layer input buffer not allocated"); + } + ggml_tensor * t = res->get_layer_inp((int) il); + if (!t) { + GGML_ABORT("layer input tensor not found"); + } + + const size_t nbytes = ggml_nbytes(t); + const size_t nfloats = nbytes / sizeof(float); + GGML_ASSERT(n_tokens > 0); + GGML_ASSERT(nfloats % n_tokens == 0); + + const size_t row_floats = nfloats / n_tokens; + const size_t dst_offset = token_offset * row_floats; + GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size); + + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t); + GGML_ASSERT(backend != nullptr); + ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes); + } +} + void llama_context::output_reorder() { const uint64_t n_vocab = model.vocab.n_tokens(); const uint64_t n_embd = model.hparams.n_embd; @@ -2190,6 +2271,16 @@ void llama_context::output_reorder() { } } + if (embd_layer_inp.size() > 0) { + for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) { + if (embd_layer_inp[lid].size > 0) { + for (uint64_t k = 0; k < n_embd; ++k) { + std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]); + } + } + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); @@ -3604,6 +3695,10 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { ctx->set_embeddings_nextn(value, masked); } +void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) { + ctx->set_embeddings_layer_inp(lid, value); +} + llama_memory_t llama_get_memory(const struct llama_context * ctx) { if (!ctx) { return nullptr; @@ -3624,6 +3719,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) { return ctx->get_embeddings_nextn_ith(i); } +float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) { + ctx->synchronize(); + + return ctx->get_embeddings_layer_inp(lid); +} + bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { return ctx->set_sampler(seq_id, smpl); } diff --git a/examples/talk-llama/llama-context.h b/examples/talk-llama/llama-context.h index 6f8f59a22..853052be2 100644 --- a/examples/talk-llama/llama-context.h +++ b/examples/talk-llama/llama-context.h @@ -88,6 +88,8 @@ struct llama_context { float * get_embeddings_nextn(); float * get_embeddings_nextn_ith(int32_t i); + float * get_embeddings_layer_inp(uint32_t lid); + llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -112,6 +114,7 @@ struct llama_context { void set_embeddings (bool value); void set_embeddings_nextn(bool value, bool masked); + void set_embeddings_layer_inp(uint32_t lid, bool enable); void set_causal_attn(bool value); void set_warmup(bool value); @@ -226,6 +229,10 @@ private: // map the output row index `i` to batch index int64_t output_resolve_row(int32_t i) const; + // async-copy enabled layer-input tensors (per cparams.output_layer_inp) + // from backend into host-side embd_layer_inp buffers + void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens); + // // graph // @@ -288,6 +295,10 @@ private: // sets llm_graph_result::t_h_nextn buffer_view embd_nextn = {nullptr, 0}; + // host buffers for output layer input embeddings, per layer + // populated when cparams.output_layer_inp[il] is true + std::vector> embd_layer_inp; + struct sampling_info { // !samplers.empty() to check if any samplers are active std::map samplers; diff --git a/examples/talk-llama/llama-cparams.h b/examples/talk-llama/llama-cparams.h index 8a35d389e..2b109f909 100644 --- a/examples/talk-llama/llama-cparams.h +++ b/examples/talk-llama/llama-cparams.h @@ -3,6 +3,7 @@ #include "llama.h" #include +#include #define LLAMA_MAX_SEQ 256 @@ -44,6 +45,8 @@ struct llama_cparams { bool kv_unified; bool pipeline_parallel; + std::vector embeddings_layer_inp; // [n_layer()] extract input embeddings for layer + enum llama_context_type ctx_type; enum llama_pooling_type pooling_type; diff --git a/examples/talk-llama/llama-ext.h b/examples/talk-llama/llama-ext.h index bd7454412..b744af528 100644 --- a/examples/talk-llama/llama-ext.h +++ b/examples/talk-llama/llama-ext.h @@ -101,4 +101,20 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); +// Set whether the context outputs the input embeddings of a specific layer +LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value); + +// mirrors: +// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); +LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid); + LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); + +// +// model/context data extraction +// + +// returns pointer to the target-model layer indices +LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model); +// returns the number of extracted layers from target model +LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model); diff --git a/examples/talk-llama/llama-graph.cpp b/examples/talk-llama/llama-graph.cpp index da7a92955..7468bd9b7 100644 --- a/examples/talk-llama/llama-graph.cpp +++ b/examples/talk-llama/llama-graph.cpp @@ -567,7 +567,10 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); } - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + // the kq mask guards on its own buffer: shared cells leave idxs unbacked while the mask stays live + if (self_kq_mask && self_kq_mask->buffer) { + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { @@ -575,7 +578,9 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); } - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + if (self_kq_mask_swa && self_kq_mask_swa->buffer) { + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + } if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); @@ -607,7 +612,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there } - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + if (self_kq_mask && self_kq_mask->buffer) { + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + } // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { @@ -615,7 +622,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there } - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + if (self_kq_mask_swa && self_kq_mask_swa->buffer) { + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + } return res; } @@ -895,6 +904,10 @@ void llm_graph_result::reset() { t_logits = nullptr; t_embd = nullptr; t_embd_pooled = nullptr; + + t_layer_inp.resize(LLAMA_MAX_LAYERS); + std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr); + t_sampled.clear(); t_sampled_probs.clear(); t_sampled_logits.clear(); @@ -923,7 +936,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) { } } -void llm_graph_result::set_outputs() { +void llm_graph_result::set_outputs(const llm_graph_params & params) { if (t_logits != nullptr) { ggml_set_output(t_logits); } @@ -936,6 +949,15 @@ void llm_graph_result::set_outputs() { if (t_h_nextn != nullptr) { ggml_set_output(t_h_nextn); } + { + const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp; + for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) { + if (embeddings_layer_inp[il]) { + GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null"); + ggml_set_output(t_layer_inp[il]); + } + } + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); @@ -1864,9 +1886,9 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { res->t_inp_embd = cur; // For Granite architecture - // NOTE: Only apply scale to token inputs. Raw embeddings are assumed to be - // multimodal inputs that should not be scaled. - if (ubatch.token && hparams.f_embedding_scale != 0.0f) { + // NOTE: For deepstack models, only apply scale to token inputs (ie text-only input). + // Raw embeddings are assumed to be multimodal inputs that should not be scaled. + if (hparams.f_embedding_scale != 0.0f && (ubatch.token || hparams.n_deepstack_layers == 0)) { if (!ggml_is_contiguous(cur)) { cur = ggml_cont(ctx0, cur); } diff --git a/examples/talk-llama/llama-graph.h b/examples/talk-llama/llama-graph.h index 6793846e3..cc5cfe51d 100644 --- a/examples/talk-llama/llama-graph.h +++ b/examples/talk-llama/llama-graph.h @@ -705,6 +705,8 @@ public: ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_h_nextn() const { return t_h_nextn; } + ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; } + ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -713,7 +715,7 @@ public: void reset(); void set_inputs(const llama_ubatch * ubatch); - void set_outputs(); + void set_outputs(const llm_graph_params & params); // try to update the existing graph result using the new graph parameters in order to reuse it // this can only be done if we determine that the resulting graph using the new graph parameters @@ -734,10 +736,12 @@ public: ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm - std::map t_sampled_logits; - std::map t_candidates; - std::map t_sampled; - std::map t_sampled_probs; + std::vector t_layer_inp; + + std::map t_sampled_logits; + std::map t_candidates; + std::map t_sampled; + std::map t_sampled_probs; std::vector inputs; diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index 032944cb4..d045059a6 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -45,6 +45,7 @@ struct llama_hparams { bool rope_finetuned; bool use_par_res; bool swin_norm; + bool norm_before_residual = false; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; diff --git a/examples/talk-llama/llama-model-loader.cpp b/examples/talk-llama/llama-model-loader.cpp index 0d1cf3cc3..474cabdfc 100644 --- a/examples/talk-llama/llama-model-loader.cpp +++ b/examples/talk-llama/llama-model-loader.cpp @@ -394,6 +394,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array & result, bool required); + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index 4f12e0949..7281ed79f 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -287,6 +287,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_qwen35moe(params); case LLM_ARCH_MISTRAL3: return new llama_model_mistral3(params); + case LLM_ARCH_EAGLE3: + return new llama_model_eagle3(params); case LLM_ARCH_MIMO2: return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: @@ -2238,7 +2240,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { // TODO: move reranking logic here and generalize llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers); - llm->res->set_outputs(); + llm->res->set_outputs(params); return llm->res->get_gf(); } @@ -2406,6 +2408,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_EAGLE3: case LLM_ARCH_MISTRAL4: case LLM_ARCH_LLAMA_EMBED: case LLM_ARCH_MAINCODER: @@ -2600,8 +2603,9 @@ uint64_t llama_model_n_params(const llama_model * model) { bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; - case LLM_ARCH_T5ENCODER: return true; + case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: + case LLM_ARCH_EAGLE3: return true; default: return false; } } @@ -2687,3 +2691,12 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid, layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED); } } + +const int32_t * llama_model_target_layer_ids(const struct llama_model * model) { + const auto & v = model->target_layer_ids; + return v.empty() ? nullptr : v.data(); +} + +uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) { + return (uint32_t) model->target_layer_ids.size(); +} diff --git a/examples/talk-llama/llama-model.h b/examples/talk-llama/llama-model.h index 992c8d9c8..f4718f6d5 100644 --- a/examples/talk-llama/llama-model.h +++ b/examples/talk-llama/llama-model.h @@ -569,6 +569,13 @@ struct llama_model { struct ggml_tensor * per_layer_model_proj = nullptr; struct ggml_tensor * per_layer_proj_norm = nullptr; + // eagle3 + struct ggml_tensor * fc = nullptr; // feature fusion layer + struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping + + // unified vector to store target-model extracted layer ids in eagle3, dflash, etc. + std::vector target_layer_ids; + std::vector layers; //Dense linear projections for SentenceTransformers models like embeddinggemma diff --git a/examples/talk-llama/llama-vocab.cpp b/examples/talk-llama/llama-vocab.cpp index 9a4bed494..8543e178d 100644 --- a/examples/talk-llama/llama-vocab.cpp +++ b/examples/talk-llama/llama-vocab.cpp @@ -764,7 +764,7 @@ struct llm_tokenizer_wpm_session { void tokenize(const std::string & text, std::vector & output) { // normalize and split by whitespace - std::vector words = preprocess(text, vocab.get_normalizer_lowercase()); + std::vector words = preprocess(text, vocab.get_normalizer_opts()); // bos token prepended already // find the longest tokens that form the words @@ -809,11 +809,14 @@ struct llm_tokenizer_wpm_session { } // TODO: reduce string copies by using cpts_offs array - static std::vector preprocess(const std::string & text, bool lowercase) { - const std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); + static std::vector preprocess(const std::string & text, const llama_vocab::normalizer_options & normalizer_opts) { + std::vector cpts = unicode_cpts_from_utf8(text); + if (normalizer_opts.strip_accents) { + cpts = unicode_cpts_normalize_nfd(cpts); + } std::vector words(1, ""); - for (const uint32_t cpt : cpts_nfd) { + for (const uint32_t cpt : cpts) { const auto flags = unicode_cpt_flags_from_cpt(cpt); if (flags.is_whitespace) { @@ -828,7 +831,11 @@ struct llm_tokenizer_wpm_session { continue; } - const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt); + if (normalizer_opts.strip_accents && flags.is_accent_mark) { + continue; + } + + const std::string s = unicode_cpt_to_utf8(normalizer_opts.lowercase ? unicode_tolower(cpt) : cpt); if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { if (words.back().size()) { // finish previous word if any words.emplace_back(); @@ -1692,7 +1699,7 @@ struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session { llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} void tokenize(const std::string & text, std::vector & output) override { - const bool lowercase = vocab.get_normalizer_lowercase(); + const bool lowercase = vocab.get_normalizer_opts().lowercase; std::string segment; auto flush = [&]() { @@ -1797,7 +1804,9 @@ struct llama_vocab::impl { bool remove_extra_whitespaces = false; bool escape_whitespaces = true; bool treat_whitespace_as_suffix = false; - bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json) + + // BertNormalizer options + llama_vocab::normalizer_options normalizer_opts; std::unordered_map token_to_id; std::vector id_to_token; @@ -2172,7 +2181,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "whitespace") { pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE; - normalizer_lowercase = false; + normalizer_opts.lowercase = false; } else if ( tokenizer_pre == "refact") { pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT; @@ -2532,8 +2541,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } } - // Lowercase normalizer flag (consulted by WPM / whitespace BPE) - ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); + // BertNormalizer options + ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_opts.lowercase, false); + normalizer_opts.strip_accents = normalizer_opts.lowercase; + ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, normalizer_opts.strip_accents, false); // suppress tokens { @@ -3969,8 +3980,8 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const { return pimpl->treat_whitespace_as_suffix; } -bool llama_vocab::get_normalizer_lowercase() const { - return pimpl->normalizer_lowercase; +const llama_vocab::normalizer_options & llama_vocab::get_normalizer_opts() const { + return pimpl->normalizer_opts; } const std::vector & llama_vocab::get_suppress_tokens() const { diff --git a/examples/talk-llama/llama-vocab.h b/examples/talk-llama/llama-vocab.h index 2626ae36e..707cd4bac 100644 --- a/examples/talk-llama/llama-vocab.h +++ b/examples/talk-llama/llama-vocab.h @@ -76,6 +76,12 @@ struct llama_vocab { llama_token_attr attr; }; + struct normalizer_options { + bool lowercase = true; + bool strip_accents = true; + // TODO: clean_text, handle_chinese_chars + }; + llama_vocab(); ~llama_vocab(); @@ -141,7 +147,7 @@ struct llama_vocab { bool get_remove_extra_whitespaces () const; bool get_escape_whitespaces () const; bool get_treat_whitespace_as_suffix() const; - bool get_normalizer_lowercase () const; + const normalizer_options & get_normalizer_opts() const; const std::vector & get_suppress_tokens() const; diff --git a/examples/talk-llama/models/delta-net-base.cpp b/examples/talk-llama/models/delta-net-base.cpp index 4f4c7cac7..ad9ce7714 100644 --- a/examples/talk-llama/models/delta-net-base.cpp +++ b/examples/talk-llama/models/delta-net-base.cpp @@ -398,9 +398,8 @@ std::pair llm_build_delta_net_base::build_delta_ne GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs); GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs); - // K=1 (final state only): reshape to 3D (S_v*S_v*H_v, 1, n_seqs) for ggml_gated_delta_net. - ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, S_v * S_v * H_v, 1, n_seqs); - ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d); + // K=1: output carries the final state only. state s is 4D [S_v, S_v, H_v, n_seqs]. + ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s, /*K=*/1); if (n_tokens == 1) { cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il); } else { @@ -564,11 +563,8 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( const int64_t D = S_v * S_v * H_v; const int64_t K = cparams.n_rs_seq + 1; - // TODO: remove pad + simplify - ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); - ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0); - - ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad); + // state s is 4D [S_v, S_v, H_v, n_seqs]; K snapshot slots are written into the output. + ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s, K); if (n_seq_tokens > 1) { cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); } else { @@ -587,21 +583,24 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( cb(output, "attn_output", il); const size_t row_size = hparams.n_embd_s() * ggml_element_size(ssm_states_all); - for (int64_t k_i = 0; k_i < K; ++k_i) { - const uint32_t cache_slot = (uint32_t) (K - 1 - k_i); - ggml_tensor * src = ggml_view_4d(ctx0, gdn_out, - S_v, S_v, H_v, n_seqs, - ggml_row_size(gdn_out->type, S_v), - ggml_row_size(gdn_out->type, S_v * S_v), - ggml_row_size(gdn_out->type, S_v * S_v * H_v), - ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap)); - ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all, - hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], - ((size_t) cache_slot * mem_size + kv_head) * row_size); + // op writes the last min(n_seq_tokens, K) snapshots; trailing slots are left unwritten + const int64_t n_written = std::min(n_seq_tokens, K); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); - } + // write the produced snapshots into the recurrent cache (snapshot slot i -> rollback group i) + ggml_tensor * src = ggml_view_3d(ctx0, gdn_out, + D, n_seqs, n_written, + ggml_row_size(gdn_out->type, D), + ggml_row_size(gdn_out->type, state_size_per_snap), + ggml_row_size(gdn_out->type, attn_score_elems)); + + ggml_tensor * dst = ggml_view_3d(ctx0, ssm_states_all, + D, n_seqs, n_written, + ssm_states_all->nb[1], + (size_t) mem_size * row_size, + (size_t) kv_head * row_size); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); return output; } diff --git a/examples/talk-llama/models/eagle3.cpp b/examples/talk-llama/models/eagle3.cpp new file mode 100644 index 000000000..3321b3905 --- /dev/null +++ b/examples/talk-llama/models/eagle3.cpp @@ -0,0 +1,323 @@ +#include "models.h" + +void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) { + throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); + } + if (target_layer_ids.size() != 3) { + throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'"); + } + LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, + target_layer_ids[0], + target_layer_ids[1], + target_layer_ids[2]); + + uint32_t n_embd_tgt = 0; + + ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt); + LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd); + + hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; + + // eagle3 norm_before_residual (optional, default false) + // compatible with Readhat eagle3 speculator model + ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false); + if (hparams.norm_before_residual) { + LLAMA_LOG_INFO("%s: EAGLE3gnorm_before_residual = true\n", __func__); + } + + type = LLM_TYPE_UNKNOWN; +} + +void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + const int64_t n_embd_inp = hparams.n_embd_inp(); + const int64_t n_embd_attn_input = 2 * n_embd; + + // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target) + // d2t: draft to target vocabulary mapping + int64_t n_draft_vocab = n_vocab; // Default: same as target vocab + const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t"); + if (d2t_meta) { + n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size + d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } else { + d2t = nullptr; // no d2t, use default vocab size + LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } + + // Feature fusion layer: projects 3 target layers to draft hidden size + fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0); + + // Output layer (uses draft vocab size) + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED); + + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) + const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); + if (tok_embd_meta) { + const int64_t n_target_vocab = tok_embd_meta->ne[1]; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab); + } + + // Single decoder layer + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // input_layernorm: applied to token embeddings + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // eagle3 specific: hidden_norm applied to fused target features + layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0); + + // Attention takes input_embeds_normed + fused_target_normed as input + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling) + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED); + } +} + +std::unique_ptr llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const { + switch (params.gtype) { + case LLM_GRAPH_TYPE_ENCODER: + return std::make_unique>(*this, params); + case LLM_GRAPH_TYPE_DEFAULT: + case LLM_GRAPH_TYPE_DECODER: + return std::make_unique>(*this, params); + default: + GGML_ABORT("invalid graph type"); + }; +} + +template <> +ggml_tensor * llama_model_eagle3::graph::build_inp_embd_enc() const { + ggml_tensor * cur = nullptr; + + // Input: Target model features (3 layers concatenated: low, mid, high) + // Data will be provided via ubatch->embd in encode_eagle3_features() + auto inp_target = std::make_unique(hparams.n_embd_inp()); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens); + ggml_set_input(inp_target->embd); + + cur = inp_target->embd; + cb(cur, "inp_embd", -1); + + res->add_input(std::move(inp_target)); + + return cur; +} + +// eagle3 Encoder: processes target model features through feature fusion layer +// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high +// Output: g_embeddings e.g. [4096, n_tokens] stored in context +template <> +llama_model_eagle3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + ggml_tensor * cur = nullptr; + + cur = build_inp_embd_enc(); + + // Feature fusion layer + cur = build_lora_mm(model.fc, cur); + cb(cur, "fc_out", -1); + + // Output: g_embeddings e.g. [4096, n_tokens] + // store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft) + ggml_set_output(cur); + res->t_h_nextn = cur; + + ggml_build_forward_expand(gf, cur); +} + +// eagle3 Decoder: processes draft tokens using g_embeddings from encoder +// Input: draft tokens + g_embeddings from encoder +// Output: draft logits +template <> +llama_model_eagle3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer + + ggml_tensor * cur; + ggml_tensor * inpL; + + // eagle3 Decoder receives: + // 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) + // 2. g_embeddings from encoder + auto * tok_embd = model.tok_embd; + if (model.tok_embd == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); + tok_embd = model_other->tok_embd; + } + + auto inp = std::make_unique(n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(inp->embd); + + ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens); + cb(inp_embd, "inp_embd", -1); + + ggml_tensor * inp_g = inp->embd; + cb(inp_g, "inp_g_embeddings", -1); + + res->add_input(std::move(inp)); + + inpL = inp_g; + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + + // Single decoder layer (il = 0) + const int il = 0; + { + // Apply input_layernorm to the token embeddings + ggml_tensor * embd_norm = build_norm(inp_embd, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(embd_norm, "embd_norm", il); + + // Apply hidden_norm to inp_g + ggml_tensor * g_norm = build_norm(inp_g, + model.layers[il].attn_norm_2, NULL, + LLM_NORM_RMS, -1); + cb(g_norm, "g_norm", il); + + // norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model) + // - false (default): use raw inp_g for residual + // - true: use normalized g_norm for residual + // inpL is the concatenated input (normalized inp_embd + normalized inp_g) + ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL; + + // Concatenate normalized inp_embd and normalized inp_g + cur = ggml_concat(ctx0, embd_norm, g_norm, il); + cb(cur, "concat_embd", il); + + // Self-attention with concatenated input + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // rope freq factors, returns nullptr if not available + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + // RoPE + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + + // Add residual and update it + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // Apply FFN norm to the sum + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + // Output norm with residual + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "eagle3_prenorm", il); + + inpL = cur; + } + + cur = inpL; + + // Output prenorm state (for next token's g_embeddings in autoregressive generation) + ggml_set_output(cur); + res->t_h_nextn = cur; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head - projects to draft vocabulary + // if the draft has no own output projection, inherit the target model's lm_head + auto * output = model.output; + if (output == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)"); + output = model_other->output; + } + cur = build_lora_mm(output, cur); + + if (model.d2t) { + const int64_t n_draft_vocab = cur->ne[0]; + const int64_t n_outputs = cur->ne[1]; + const int64_t n_vocab = (int64_t) model.vocab.n_tokens(); + + GGML_ASSERT(model.d2t->type == GGML_TYPE_I64); + GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab); + + ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY); + cur = ggml_set_rows(ctx0, logits, + ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs), + ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1)); + cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/examples/talk-llama/models/gemma4-assistant.cpp b/examples/talk-llama/models/gemma4-assistant.cpp index 5b7a25a5a..6378130e7 100644 --- a/examples/talk-llama/models/gemma4-assistant.cpp +++ b/examples/talk-llama/models/gemma4-assistant.cpp @@ -39,6 +39,9 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + create_tensor(tn(LLM_TENSOR_MASKED_EMBD_CENTROIDS, "weight"), {}, TENSOR_NOT_REQUIRED); + create_tensor(tn(LLM_TENSOR_MASKED_EMBD_ORDERING), {}, TENSOR_NOT_REQUIRED); + const int64_t n_embd_backbone = hparams.n_embd_inp(); nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0); diff --git a/examples/talk-llama/models/gemma4.cpp b/examples/talk-llama/models/gemma4.cpp index 6f7fcd645..6a96979ce 100644 --- a/examples/talk-llama/models/gemma4.cpp +++ b/examples/talk-llama/models/gemma4.cpp @@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para const float freq_scale_l = model.get_rope_freq_scale(cparams, il); const int n_rot_l = hparams.n_rot(il); + res->t_layer_inp[il] = inpL; + // norm cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); diff --git a/examples/talk-llama/models/llama.cpp b/examples/talk-llama/models/llama.cpp index c0ec7e0a9..4bfebc884 100644 --- a/examples/talk-llama/models/llama.cpp +++ b/examples/talk-llama/models/llama.cpp @@ -124,6 +124,8 @@ llama_model_llama::graph::graph(const llama_model & model, const llm_grap ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm diff --git a/examples/talk-llama/models/models.h b/examples/talk-llama/models/models.h index c137e32e8..ee3aff07b 100644 --- a/examples/talk-llama/models/models.h +++ b/examples/talk-llama/models/models.h @@ -46,7 +46,7 @@ struct llm_build_delta_net_base : public llm_graph_context { ggml_tensor * s, int il); - // use the ggml_gated_delta_net fused operator (K=1; state has shape (D, 1, n_seqs)) + // use the ggml_gated_delta_net fused operator (K=1; state has shape [S_v, S_v, H_v, n_seqs]) std::pair build_delta_net_fused( ggml_tensor * q, ggml_tensor * k, @@ -1089,6 +1089,21 @@ struct llama_model_glm_dsa : public llama_model_base { std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; +struct llama_model_eagle3 : public llama_model_base { + llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + template + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + + ggml_tensor * build_inp_embd_enc() const; + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + struct llama_model_mistral4 : public llama_model_deepseek2 { llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {} diff --git a/examples/talk-llama/models/openai-moe.cpp b/examples/talk-llama/models/openai-moe.cpp index 3ab15d61f..6d74f9c7e 100644 --- a/examples/talk-llama/models/openai-moe.cpp +++ b/examples/talk-llama/models/openai-moe.cpp @@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + const float freq_base_l = model.get_rope_freq_base (cparams, il); const float freq_scale_l = model.get_rope_freq_scale(cparams, il); diff --git a/examples/talk-llama/models/plamo2.cpp b/examples/talk-llama/models/plamo2.cpp index b93cf48bc..0b81513c3 100644 --- a/examples/talk-llama/models/plamo2.cpp +++ b/examples/talk-llama/models/plamo2.cpp @@ -11,6 +11,10 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + // Load attention parameters + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false); + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } @@ -273,7 +277,7 @@ ggml_tensor * llama_model_plamo2::graph::build_plamo2_mamba_layer(llm_graph_inpu GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - GGML_ASSERT(d_inner % n_head == 0); + GGML_ASSERT(d_inner % n_heads == 0); GGML_ASSERT(n_group == 0); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); diff --git a/examples/talk-llama/models/qwen3.cpp b/examples/talk-llama/models/qwen3.cpp index 1d0d2fab3..f4b2a2aeb 100644 --- a/examples/talk-llama/models/qwen3.cpp +++ b/examples/talk-llama/models/qwen3.cpp @@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm diff --git a/examples/talk-llama/models/qwen35.cpp b/examples/talk-llama/models/qwen35.cpp index 4b642cff4..6783d98ec 100644 --- a/examples/talk-llama/models/qwen35.cpp +++ b/examples/talk-llama/models/qwen35.cpp @@ -173,7 +173,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para } if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/examples/talk-llama/models/qwen3moe.cpp b/examples/talk-llama/models/qwen3moe.cpp index 317e668be..6f6df5390 100644 --- a/examples/talk-llama/models/qwen3moe.cpp +++ b/examples/talk-llama/models/qwen3moe.cpp @@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm