From 5ed76e9a079962f1c85cfce44edd325c27ef1f97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 19 Jun 2026 10:19:58 +0300 Subject: [PATCH] talk-llama : sync llama.cpp --- examples/talk-llama/llama-arch.cpp | 1 + examples/talk-llama/llama-arch.h | 1 + examples/talk-llama/llama-context.cpp | 2 +- examples/talk-llama/llama-ext.h | 1 + examples/talk-llama/llama-graph.cpp | 103 ++--- examples/talk-llama/llama-graph.h | 5 +- examples/talk-llama/llama-hparams.cpp | 4 + examples/talk-llama/llama-hparams.h | 7 + examples/talk-llama/llama-model-saver.cpp | 1 + examples/talk-llama/llama-model.cpp | 9 +- examples/talk-llama/llama-vocab.cpp | 3 +- examples/talk-llama/llama.cpp | 2 +- examples/talk-llama/models/cohere2.cpp | 6 +- examples/talk-llama/models/cohere2moe.cpp | 443 ++++++++++++++++++++++ examples/talk-llama/models/eagle3.cpp | 8 +- examples/talk-llama/models/models.h | 17 + 16 files changed, 555 insertions(+), 58 deletions(-) create mode 100644 examples/talk-llama/models/cohere2moe.cpp diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index 9f93d5bc7..4a52d9772 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COHERE2, "cohere2" }, + { LLM_ARCH_COHERE2MOE, "cohere2moe" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO2, "olmo2" }, diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index c5245fb58..989da06d8 100644 --- a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -71,6 +71,7 @@ enum llm_arch { LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, LLM_ARCH_COHERE2, + LLM_ARCH_COHERE2MOE, LLM_ARCH_DBRX, LLM_ARCH_OLMO, LLM_ARCH_OLMO2, diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp index 168dbabd7..529bc4a5e 100644 --- a/examples/talk-llama/llama-context.cpp +++ b/examples/talk-llama/llama-context.cpp @@ -1382,7 +1382,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim - const int64_t n_embd = hparams.n_embd_inp(); + const int64_t n_embd = hparams.n_embd_inp_enc(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 diff --git a/examples/talk-llama/llama-ext.h b/examples/talk-llama/llama-ext.h index b744af528..8b5679b69 100644 --- a/examples/talk-llama/llama-ext.h +++ b/examples/talk-llama/llama-ext.h @@ -2,6 +2,7 @@ // this is a staging header for new llama.cpp API // breaking changes and C++ are allowed. everything here should be considered WIP +// try as much as possible to not include this header in the rest of the codebase #include "llama.h" diff --git a/examples/talk-llama/llama-graph.cpp b/examples/talk-llama/llama-graph.cpp index 7468bd9b7..68c9e606c 100644 --- a/examples/talk-llama/llama-graph.cpp +++ b/examples/talk-llama/llama-graph.cpp @@ -1088,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm( ggml_tensor * w_s) const { ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + if (w_s) { + res = ggml_mul(ctx0, res, w_s); + } + for (const auto & lora : *loras) { llama_adapter_lora_weight * lw = lora.first->get_weight(w); if (lw == nullptr) { @@ -1106,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm( res = ggml_add(ctx0, res, ab_cur); } - if (w_s) { - res = ggml_mul(ctx0, res, w_s); - } - return res; } ggml_tensor * llm_graph_context::build_lora_mm_id( ggml_tensor * w, // ggml_tensor * as ggml_tensor * cur, // ggml_tensor * b - ggml_tensor * ids) const { + ggml_tensor * ids, + ggml_tensor * w_s) const { ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + + if (w_s) { + const int64_t n_expert = w_s->ne[0]; + const int64_t n_tokens = cur->ne[2]; + ggml_tensor * s = ggml_reshape_3d(ctx0, w_s, 1, n_expert, 1); + s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); + s = ggml_get_rows(ctx0, s, ids); + res = ggml_mul(ctx0, res, s); + } for (const auto & lora : *loras) { llama_adapter_lora_weight * lw = lora.first->get_weight(w); if (lw == nullptr) { @@ -1269,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn( llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, int il) const { + // NVFP4 support is currently restricted to + // 1) LORA absence (*_s would be applied after LORA residual, which is incorrect) + // 2) bias absense (*_s would be applied after bias addition, which is incorrect) + // TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently) + auto has_lora = [this](ggml_tensor * w) { + if (!w) { + return false; + } + for (const auto & lora : *loras) { + if (lora.first->get_weight(w) != nullptr) { + return true; + } + } + return false; + }; + + GGML_ASSERT(!up_s || !up_b || !up || up->type != GGML_TYPE_NVFP4); + GGML_ASSERT(!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4); + GGML_ASSERT(!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4); + GGML_ASSERT(!up_s || !up || up->type != GGML_TYPE_NVFP4 || !has_lora(up)); + GGML_ASSERT(!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora(gate)); + GGML_ASSERT(!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora(down)); + ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; cb(tmp, "ffn_up", il); @@ -1627,23 +1660,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn( if (gate_up_exps) { // merged gate_up path: one mul_mat_id, then split into gate and up views - ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens] + ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts, up_exps_s); // [n_ff*2, n_expert_used, n_tokens] cb(gate_up, "ffn_moe_gate_up", il); + if (up_exps_s) { + cb(gate_up, "ffn_moe_gate_up_scaled", il); + } + if (gate_up_exps_b) { gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts); cb(gate_up, "ffn_moe_gate_up_biased", il); } - // apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused) - if (up_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - gate_up = ggml_mul(ctx0, gate_up, s); - cb(gate_up, "ffn_moe_gate_up_scaled", il); - } - const int64_t n_ff = gate_up->ne[0] / 2; cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0); cb(cur, "ffn_moe_gate", il); @@ -1651,43 +1679,33 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(up, "ffn_moe_up", il); } else { // separate gate and up path - up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + up = build_lora_mm_id(up_exps, cur, selected_experts, up_exps_s); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); + if (up_exps_s) { + cb(up, "ffn_moe_up_scaled", il); + } + if (up_exps_b) { up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); cb(up, "ffn_moe_up_biased", il); } - // apply per-expert scale2 to up - if (up_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - up = ggml_mul(ctx0, up, s); - cb(up, "ffn_moe_up_scaled", il); - } - if (gate_exps) { - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cur = build_lora_mm_id(gate_exps, cur, selected_experts, gate_exps_s); // [n_ff, n_expert_used, n_tokens] cb(cur, "ffn_moe_gate", il); } else { cur = up; } + if (gate_exps_s) { + cb(cur, "ffn_moe_gate_scaled", il); + } + if (gate_exps_b) { cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); cb(cur, "ffn_moe_gate_biased", il); } - - // apply per-expert scale2 to gate - if (gate_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - cur = ggml_mul(ctx0, cur, s); - cb(cur, "ffn_moe_gate_scaled", il); - } } const bool has_gate = gate_exps || gate_up_exps; @@ -1759,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn( GGML_ABORT("fatal error"); } - experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts, down_exps_s); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); + if (down_exps_s) { + cb(experts, "ffn_moe_down_scaled", il); + } + if (down_exps_b) { experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts); cb(experts, "ffn_moe_down_biased", il); } - // apply per-expert scale2 to down - if (down_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - experts = ggml_mul(ctx0, experts, s); - cb(experts, "ffn_moe_down_scaled", il); - } - if (!weight_before_ffn) { experts = ggml_mul(ctx0, experts, weights); cb(experts, "ffn_moe_weighted", il); diff --git a/examples/talk-llama/llama-graph.h b/examples/talk-llama/llama-graph.h index cc5cfe51d..5e8a65835 100644 --- a/examples/talk-llama/llama-graph.h +++ b/examples/talk-llama/llama-graph.h @@ -853,11 +853,12 @@ struct llm_graph_context { ggml_tensor * cur, ggml_tensor * w_s = nullptr) const; - // do mat_mul_id, while optionally apply lora + // do mat_mul_id, while optionally apply lora and per-expert scale ggml_tensor * build_lora_mm_id( ggml_tensor * w, // ggml_tensor * as ggml_tensor * cur, // ggml_tensor * b - ggml_tensor * ids) const; + ggml_tensor * ids, + ggml_tensor * w_s = nullptr) const; ggml_tensor * build_norm( ggml_tensor * cur, diff --git a/examples/talk-llama/llama-hparams.cpp b/examples/talk-llama/llama-hparams.cpp index 2bf576873..9d0683d2f 100644 --- a/examples/talk-llama/llama-hparams.cpp +++ b/examples/talk-llama/llama-hparams.cpp @@ -104,6 +104,10 @@ uint32_t llama_hparams::n_embd_inp() const { return n_embd_inp; } +uint32_t llama_hparams::n_embd_inp_enc() const { + return n_embd_inp_enc_impl > 0 ? n_embd_inp_enc_impl : n_embd_inp(); +} + uint32_t llama_hparams::n_embd_out() const { return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd; } diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index d045059a6..2eadeb214 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -189,6 +189,10 @@ struct llama_hparams { // input embedding dimension (0 = use n_embd) uint32_t n_embd_inp_impl = 0; + // encoder input embedding dimension (0 = use n_embd_inp()) + // e.g. the eagle3 encoder fuses target_layers * target_hidden features + uint32_t n_embd_inp_enc_impl = 0; + // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; @@ -305,6 +309,9 @@ struct llama_hparams { // dimension of main + auxiliary input embeddings uint32_t n_embd_inp() const; + // dimension of the encoder input embeddings + uint32_t n_embd_inp_enc() const; + // dimension of output embeddings uint32_t n_embd_out() const; diff --git a/examples/talk-llama/llama-model-saver.cpp b/examples/talk-llama/llama-model-saver.cpp index 67d4a9df0..a3928523b 100644 --- a/examples/talk-llama/llama-model-saver.cpp +++ b/examples/talk-llama/llama-model-saver.cpp @@ -18,6 +18,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) { case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: case LLM_ARCH_COHERE2: + case LLM_ARCH_COHERE2MOE: case LLM_ARCH_OLMO2: case LLM_ARCH_BITNET: case LLM_ARCH_T5: diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index 7281ed79f..c52875533 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -157,6 +157,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_command_r(params); case LLM_ARCH_COHERE2: return new llama_model_cohere2(params); + case LLM_ARCH_COHERE2MOE: + return new llama_model_cohere2moe(params); case LLM_ARCH_DBRX: return new llama_model_dbrx(params); case LLM_ARCH_OLMO: @@ -1467,9 +1469,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } ml.done_getting_tensors(); + // Tied NVFP4 output is valid when no separate LM-head scale tensors are present. + // If sidecar scales exist, the output weight must be an actual output tensor. GGML_ASSERT(!(output && tok_embd && strcmp(output->name, tok_embd->name) == 0 && - output->type == GGML_TYPE_NVFP4)); + output->type == GGML_TYPE_NVFP4 && + (output_s || output_in_s))); // populate tensors_by_name for (auto & [_, ctx_ptr] : ml.ctx_map) { for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) { @@ -1844,6 +1849,7 @@ void llama_model::print_info() const { } if (arch == LLM_ARCH_MELLUM || + arch == LLM_ARCH_COHERE2MOE || arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || @@ -2389,6 +2395,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: case LLM_ARCH_COHERE2: + case LLM_ARCH_COHERE2MOE: case LLM_ARCH_OLMO: case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: diff --git a/examples/talk-llama/llama-vocab.cpp b/examples/talk-llama/llama-vocab.cpp index 8543e178d..6e78a3f6c 100644 --- a/examples/talk-llama/llama-vocab.cpp +++ b/examples/talk-llama/llama-vocab.cpp @@ -2280,7 +2280,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { clean_spaces = false; ignore_merges = true; } else if ( - tokenizer_pre == "tiny_aya") { + tokenizer_pre == "tiny_aya" || + tokenizer_pre == "cohere2moe") { pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA; clean_spaces = false; } else if ( diff --git a/examples/talk-llama/llama.cpp b/examples/talk-llama/llama.cpp index a67fa8039..0de6048f2 100644 --- a/examples/talk-llama/llama.cpp +++ b/examples/talk-llama/llama.cpp @@ -249,7 +249,7 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama } // if using single GPU mode, remove all except the main GPU - if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { + if (params.split_mode == LLAMA_SPLIT_MODE_NONE && !model->devices.empty()) { if (params.main_gpu < 0) { model->devices.clear(); } else { diff --git a/examples/talk-llama/models/cohere2.cpp b/examples/talk-llama/models/cohere2.cpp index 61a5945a1..e2b366256 100644 --- a/examples/talk-llama/models/cohere2.cpp +++ b/examples/talk-llama/models/cohere2.cpp @@ -122,9 +122,9 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par // feed-forward network { cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, + model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s, + model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } diff --git a/examples/talk-llama/models/cohere2moe.cpp b/examples/talk-llama/models/cohere2moe.cpp new file mode 100644 index 000000000..499c73a1c --- /dev/null +++ b/examples/talk-llama/models/cohere2moe.cpp @@ -0,0 +1,443 @@ +#include "models.h" + +void llama_model_cohere2moe::load_arch_hparams(llama_model_loader & ml) { + const bool found_norm = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false); + const bool found_norm_rms = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); + if (!found_norm && !found_norm_rms) { + throw std::runtime_error("missing Cohere2 MoE norm epsilon"); + } + if (!found_norm_rms) { + hparams.f_norm_rms_eps = 0.0f; + } + + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer"); + + if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; + } + + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + uint32_t swa_period = 4; + if (ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false)) { + hparams.set_swa_pattern(swa_period, true); + } else { + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); + } + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + + switch (hparams.n_layer()) { + case 49: type = LLM_TYPE_30B_A3B; break; + default: type = LLM_TYPE_UNKNOWN; + } +} + +void llama_model_cohere2moe::load_arch_tensors(llama_model_loader & ml) { + LLAMA_LOAD_LOCALS; + + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP + // tensors live in a separate file. Mark MTP tensors NOT_REQUIRED so the + // trunk loads cleanly. + const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight"; + const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr); + const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; + const int mtp_flags = trunk_only ? TENSOR_NOT_REQUIRED : 0; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + if (n_expert == 0) { + throw std::runtime_error("n_expert must be > 0 for Cohere2Moe"); + } + if (n_expert_used == 0) { + throw std::runtime_error("n_expert_used must be > 0 for Cohere2Moe"); + } + + auto load_block_trunk = [&](int i, int flags) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags); + + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags); + + if (static_cast(i) < hparams.n_layer_dense_lead) { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags); + } else { + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags); + + if (hparams.n_expert_shared > 0) { + const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared; + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + } + } + }; + + auto load_block_mtp = [&](int i, int flags) { + auto & layer = layers[i]; + + // MTP block looks like a full-attention Cohere2 MoE decoder block. + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags); + + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags); + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; + + // Routed experts + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags); + + if (hparams.n_expert_shared > 0) { + const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared; + + // Shared experts + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + } + + // NextN-specific tensors that define the MTP block. + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < n_layer; ++i) { + load_block_trunk(i, trunk_flags); + } + // MTP/NextN layers are loaded as extra decoder blocks. + for (int i = n_layer; i < n_layer_all; ++i) { + load_block_mtp(i, mtp_flags); + } +} + +std::unique_ptr llama_model_cohere2moe::build_arch_graph(const llm_graph_params & params) const { + if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { + return std::make_unique(*this, params); + } + return std::make_unique(*this, params); +} + +llama_model_cohere2moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); + + const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS; + const float f_logit_scale = hparams.f_logit_scale; + ggml_tensor * cur; + ggml_tensor * inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_iswa(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. + for (int il = 0; il < n_layer; ++il) { + const bool is_swa = hparams.is_swa(il); + // Dense-prefix full-attention layers use RoPE; later layers follow the SWA pattern. + const bool force_rope = static_cast(il) < hparams.n_layer_dense_lead; + + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, cohere2moe_norm_type, il); + cb(cur, "attn_norm", il); + + ggml_tensor * ffn_inp = cur; + + { + const auto & layer = model.layers[il]; + + auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, + n_embd_head, n_head, n_head_kv, il); + + if (is_swa || force_rope) { + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + layer.wo, layer.wo_b, layer.wo_s, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + ggml_tensor * attn_out = cur; + + const auto & layer = model.layers[il]; + + if (layer.ffn_gate_inp == nullptr) { + cur = build_ffn(ffn_inp, + layer.ffn_up, nullptr, layer.ffn_up_s, + layer.ffn_gate, nullptr, layer.ffn_gate_s, + layer.ffn_down, nullptr, layer.ffn_down_s, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + cur = build_moe_ffn(ffn_inp, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il, + nullptr, layer.ffn_gate_up_exps, + layer.ffn_up_exps_s, + layer.ffn_gate_exps_s, + layer.ffn_down_exps_s); + cb(cur, "ffn_moe_out", il); + + if (layer.ffn_up_shexp) { + ggml_tensor * ffn_shexp = build_ffn(ffn_inp, + layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, + layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, + layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, cur, ffn_shexp); + cur = ggml_scale(ctx0, cur, 0.5f); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + cur = build_norm(cur, model.output_norm, nullptr, cohere2moe_norm_type, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + if (!cparams.embeddings_nextn_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} + +llama_model_cohere2moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + GGML_ASSERT(hparams.n_layer_nextn > 0 && "COHERE2MOE MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "COHERE2MOE MTP currently only supports a single MTP block"); + + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); + + const int il = hparams.n_layer(); + const auto & layer = model.layers[il]; + GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); + GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); + GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); + GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp"); + + const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS; + + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); + ggml_set_input(inp->embd); + + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } + cb(tok_embd, "mtp_tok_embd", il); + + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + auto * inp_attn = build_attn_inp_kv_iswa(); + + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, cohere2moe_norm_type, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, cohere2moe_norm_type, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpL = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, cohere2moe_norm_type, il); + cb(cur, "mtp_attn_norm", il); + ggml_tensor * ffn_inp = cur; + + auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, n_embd_head, n_head, n_head_kv, il); + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "mtp_Qcur", il); + cb(Kcur, "mtp_Kcur", il); + cb(Vcur, "mtp_Vcur", il); + + cur = build_attn(inp_attn, + layer.wo, layer.wo_b, layer.wo_s, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf(float(n_embd_head)), il); + cb(cur, "mtp_attn_out", il); + + ggml_tensor * attn_out = cur; + + cur = build_moe_ffn(ffn_inp, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il, + nullptr, layer.ffn_gate_up_exps, + layer.ffn_up_exps_s, + layer.ffn_gate_exps_s, + layer.ffn_down_exps_s); + cb(cur, "mtp_ffn_moe_out", il); + + if (layer.ffn_up_shexp) { + ggml_tensor * ffn_shexp = build_ffn(ffn_inp, + layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, + layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, + layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "mtp_ffn_shexp", il); + + cur = ggml_add(ctx0, cur, ffn_shexp); + cur = ggml_scale(ctx0, cur, 0.5f); + cb(cur, "mtp_ffn_out", il); + } + + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cb(cur, "mtp_post_ffn", il); + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "COHERE2MOE MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, cohere2moe_norm_type, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + GGML_ASSERT(head_w && "COHERE2MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur, layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : nullptr); + + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/examples/talk-llama/models/eagle3.cpp b/examples/talk-llama/models/eagle3.cpp index 3321b3905..9d96fae59 100644 --- a/examples/talk-llama/models/eagle3.cpp +++ b/examples/talk-llama/models/eagle3.cpp @@ -19,7 +19,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt); LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd); - hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; + hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; // eagle3 norm_before_residual (optional, default false) // compatible with Readhat eagle3 speculator model @@ -34,7 +34,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; - const int64_t n_embd_inp = hparams.n_embd_inp(); + const int64_t n_embd_inp = hparams.n_embd_inp_enc(); const int64_t n_embd_attn_input = 2 * n_embd; // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target) @@ -109,8 +109,8 @@ ggml_tensor * llama_model_eagle3::graph::build_inp_embd_enc() const { // Input: Target model features (3 layers concatenated: low, mid, high) // Data will be provided via ubatch->embd in encode_eagle3_features() - auto inp_target = std::make_unique(hparams.n_embd_inp()); - inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens); + auto inp_target = std::make_unique(hparams.n_embd_inp_enc()); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens); ggml_set_input(inp_target->embd); cur = inp_target->embd; diff --git a/examples/talk-llama/models/models.h b/examples/talk-llama/models/models.h index ee3aff07b..2ac8415a3 100644 --- a/examples/talk-llama/models/models.h +++ b/examples/talk-llama/models/models.h @@ -937,6 +937,23 @@ struct llama_model_cohere2 : public llama_model_base { }; +struct llama_model_cohere2moe : public llama_model_base { + llama_model_cohere2moe(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_dbrx : public llama_model_base { llama_model_dbrx(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override;