talk-llama : sync llama.cpp

Georgi Gerganov 2026-03-16 07:16:46 +02:00
parent ab1252c19e
commit 2bc630f197
131 changed files with 2374 additions and 1519 deletions

View File

@ -4,6 +4,7 @@
#include <map>
#include <set>
#include <vector>
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
@ -184,6 +185,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
@ -229,11 +231,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
@ -361,6 +366,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
{ LLM_TENSOR_FFN_LATENT_DOWN, "blk.%d.ffn_latent_down" },
{ LLM_TENSOR_FFN_LATENT_UP, "blk.%d.ffn_latent_up" },
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
@ -1083,6 +1090,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_ATTN_NORM,
LLM_TENSOR_ATTN_Q,
LLM_TENSOR_ATTN_Q_NORM,
@ -1874,6 +1882,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_UP_EXPS,
LLM_TENSOR_FFN_DOWN_EXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_FFN_LATENT_DOWN,
LLM_TENSOR_FFN_LATENT_UP,
// MoE shared expert layer
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
@ -2749,6 +2759,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
// Nemotron 3 Super
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@ -2786,6 +2799,15 @@ std::string LLM_TN_IMPL::str() const {
return name;
}
std::vector<llm_arch> llm_arch_all() {
std::vector<llm_arch> ret;
ret.reserve(LLM_ARCH_NAMES.size());
for (const auto & [arch, _] : LLM_ARCH_NAMES) {
ret.push_back(arch);
}
return ret;
}
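
The new llm_arch_all() helper pairs naturally with the existing llm_arch_name(); a minimal sketch of enumerating every architecture the build knows about (illustrative internal-only usage, assuming llama-arch.h is on the include path):

#include "llama-arch.h"

#include <cstdio>

// illustrative only: print every architecture this build knows about
static void print_known_archs() {
    for (const llm_arch arch : llm_arch_all()) {
        std::printf("%s\n", llm_arch_name(arch));
    }
}
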
const char * llm_arch_name(llm_arch arch) {
auto it = LLM_ARCH_NAMES.find(arch);
if (it == LLM_ARCH_NAMES.end()) {

View File

@ -4,6 +4,7 @@
#include <string>
#include <set>
#include <vector>
//
// gguf constants (sync with gguf.py)
@ -188,6 +189,7 @@ enum llm_kv {
LLM_KV_EXPERT_GROUP_SCALE,
LLM_KV_EXPERTS_PER_GROUP,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_MOE_LATENT_SIZE,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_NUM_DEEPSTACK_LAYERS,
LLM_KV_POOLING_TYPE,
@ -233,11 +235,14 @@ enum llm_kv {
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ATTENTION_KEY_LENGTH_SWA,
LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_FREQ_BASE_SWA,
@ -381,6 +386,8 @@ enum llm_tensor {
LLM_TENSOR_FFN_GATE_CHEXPS,
LLM_TENSOR_FFN_UP_CHEXPS,
LLM_TENSOR_FFN_EXP_PROBS_B,
LLM_TENSOR_FFN_LATENT_DOWN,
LLM_TENSOR_FFN_LATENT_UP,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
@ -608,6 +615,8 @@ struct llm_tensor_info {
ggml_op op;
};
std::vector<llm_arch> llm_arch_all();
const char * llm_arch_name(llm_arch arch);
llm_arch llm_arch_from_string(const std::string & name);

View File

@ -394,11 +394,13 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
clear();
split_reset();
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
auto udata = std::make_shared<llama_ubatch::data_t>();
udata->token .resize(n_tokens);
udata->embd .clear();
udata->pos .resize(n_tokens);
udata->pos .resize(n_pos_all);
udata->n_seq_id .resize(n_tokens);
udata->seq_id .resize(n_tokens);
udata->seq_id_unq.resize(0);

View File

@ -7,6 +7,7 @@
#include "llama-memory.h"
#include "llama-mmap.h"
#include "llama-model.h"
#include "llama-ext.h"
#include <cinttypes>
#include <cmath>
@ -150,6 +151,10 @@ llama_context::llama_context(
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
cparams.fused_gdn_ar = true;
cparams.fused_gdn_ch = true;
cparams.auto_fgdn = true;
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@ -158,7 +163,7 @@ llama_context::llama_context(
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
// intialized later
// initialized later
cparams.pipeline_parallel = false;
{
@ -337,6 +342,14 @@ llama_context::llama_context(
if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
if (!graph_reuse_disable) {
// TODO: figure out a way to make graph reuse work with pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/20463
LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
graph_reuse_disable = true;
}
}
sched_reserve();
@ -422,7 +435,7 @@ void llama_context::sched_reserve() {
if (cparams.auto_fa) {
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
if (!gf) {
throw std::runtime_error("failed to split graph for Flash Attention check");
throw std::runtime_error("failed to reserve graph for Flash Attention check");
}
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
@ -432,8 +445,7 @@ void llama_context::sched_reserve() {
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
continue;
}
ggml_backend_dev_t device_fa = ggml_backend_get_device(
ggml_backend_sched_get_tensor_backend(sched.get(), n));
ggml_backend_dev_t device_fa = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
@ -448,6 +460,7 @@ void llama_context::sched_reserve() {
break;
}
}
if (fa_device_mismatch) {
cparams.flash_attn = false;
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
@ -459,6 +472,88 @@ void llama_context::sched_reserve() {
cparams.auto_fa = false;
}
if (cparams.auto_fgdn) {
LLAMA_LOG_INFO("%s: resolving fused Gated Delta Net support:\n", __func__);
if (cparams.fused_gdn_ar) {
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
if (!gf) {
throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (autoregressive)");
}
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_AR) + 1;
bool gdn_device_mismatch = false;
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * n = ggml_graph_node(gf, i);
if (n->op != GGML_OP_GATED_DELTA_NET) {
continue;
}
ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_AR "-", prefix_len) == 0);
const int il = std::stoi(n->name + prefix_len);
ggml_backend_dev_t device_kv = model.dev_layer(il);
if (device_gdn != device_kv) {
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
"is assigned to device %s (usually due to missing support)\n",
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
gdn_device_mismatch = true;
break;
}
}
if (gdn_device_mismatch) {
cparams.fused_gdn_ar = false;
LLAMA_LOG_WARN("%s: fused Gated Delta Net (autoregressive) not supported, set to disabled\n", __func__);
} else {
LLAMA_LOG_INFO("%s: fused Gated Delta Net (autoregressive) enabled\n", __func__);
}
}
if (cparams.fused_gdn_ch) {
// more than one token in the batch per sequence in order to take the chunked path
// note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
// because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
// it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
// the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
const uint32_t n_tokens_ch = 16*n_seqs;
auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
if (!gf) {
throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)");
}
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_CH) + 1;
bool gdn_device_mismatch = false;
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
ggml_tensor * n = ggml_graph_node(gf, i);
if (n->op != GGML_OP_GATED_DELTA_NET) {
continue;
}
ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_CH "-", prefix_len) == 0);
const int il = std::stoi(n->name + prefix_len);
ggml_backend_dev_t device_kv = model.dev_layer(il);
if (device_gdn != device_kv) {
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
"is assigned to device %s (usually due to missing support)\n",
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
gdn_device_mismatch = true;
break;
}
}
if (gdn_device_mismatch) {
cparams.fused_gdn_ch = false;
LLAMA_LOG_WARN("%s: fused Gated Delta Net (chunked) not supported, set to disabled\n", __func__);
} else {
LLAMA_LOG_INFO("%s: fused Gated Delta Net (chunked) enabled\n", __func__);
}
}
cparams.auto_fgdn = false;
}
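
Both the Flash Attention check and the two fused Gated Delta Net checks above rely on the same convention: the fused op tensor is named "<prefix>-<layer>", with prefixes such as LLAMA_TENSOR_NAME_FGDN_AR. A hedged sketch of recovering the layer index from such a name (hypothetical helper, same strncmp/std::stoi pattern as the loops above):

#include <cstring>
#include <string>

// hypothetical helper: returns the layer index encoded in "<prefix>-<il>", or -1 otherwise
static int layer_from_fused_name(const char * name, const char * prefix) {
    const std::string pref = std::string(prefix) + "-";
    if (std::strncmp(name, pref.c_str(), pref.size()) != 0) {
        return -1; // not a tensor produced by this fused op
    }
    return std::stoi(name + pref.size()); // e.g. "__fgdn_ar__-12" -> 12
}
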
// reserve worst-case graph
int n_splits_pp = -1;
int n_nodes_pp = -1;
@ -1039,11 +1134,15 @@ void llama_context::set_adapters_lora(llama_adapter_lora ** adapters, size_t n_a
bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) {
LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters);
if (n_adapters != loras->size()) {
return false;
}
// Adapters with a zero scale are never added to `loras`, so also ignore them for the comparison.
size_t n_non_zero = 0;
for (size_t i = 0; i < n_adapters; i ++) {
if (scales[i] == 0.0f) {
continue;
}
n_non_zero++;
auto it = loras->find(adapters[i]);
if (it == loras->end() || it->second != scales[i]) {
@ -1051,6 +1150,10 @@ bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_
}
}
if (n_non_zero != loras->size()) {
return false;
}
return true;
}
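
Because set_adapters_lora skips zero-scale adapters, the comparison above has to ignore them as well. A self-contained sketch of that invariant, with a plain std::map standing in for the internal loras map (hypothetical helper, mirroring the logic above):

#include <cstddef>
#include <map>

// adapters passed with scale 0.0f count as absent, so {A:0.5, B:0.0}
// compares equal to a map holding only {A:0.5}
static bool same_ignoring_zero(const std::map<const void *, float> & loras,
                               const void * const * adapters, size_t n, const float * scales) {
    size_t n_non_zero = 0;
    for (size_t i = 0; i < n; ++i) {
        if (scales[i] == 0.0f) {
            continue;
        }
        n_non_zero++;
        auto it = loras.find(adapters[i]);
        if (it == loras.end() || it->second != scales[i]) {
            return false;
        }
    }
    return n_non_zero == loras.size();
}
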
@ -1114,6 +1217,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
{
//const auto t_start_us = ggml_time_us();
// FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
res->set_inputs(&ubatch);
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
@ -1981,7 +2085,7 @@ ggml_cgraph * llama_context::graph_reserve(
ggml_backend_sched_reset(sched.get());
// when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
// when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
gf_res_prev->reset();
// store the n_outputs as it is, and restore it afterwards
@ -2831,19 +2935,23 @@ llama_context * llama_init_from_model(
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
const uint32_t blck_size = ggml_blck_size(params.type_k);
if (model->hparams.n_embd_head_k % blck_size != 0) {
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
return nullptr;
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
return nullptr;
}
}
}
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
const uint32_t blck_size = ggml_blck_size(params.type_v);
if (model->hparams.n_embd_head_v % blck_size != 0) {
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
return nullptr;
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
return nullptr;
}
}
}
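
With head sizes now potentially differing between full-attention and SWA layers, the divisibility check has to run per layer. A condensed sketch of the K-side check (illustrative only; e.g. Q8_0 has a block size of 32, so a layer with n_embd_head_k(il) == 80 would fail it and disable the automatic Flash Attention path):

#include "ggml.h"
#include "llama-hparams.h"

// illustrative only: mirrors the per-layer K cache check above
static bool k_cache_type_ok(const llama_hparams & hparams, ggml_type type_k) {
    const uint32_t blck_size = ggml_blck_size(type_k);
    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
        if (hparams.n_embd_head_k(il) % blck_size != 0) {
            return false; // this K type cannot tile the head dimension of layer il
        }
    }
    return true;
}
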
@ -3035,6 +3143,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
}
struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs) {
auto * memory = ctx->get_memory();
llama_memory_context_ptr mctx;
if (memory) {
mctx = memory->init_full();
}
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
}
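
A minimal usage sketch of this new entry point (assumptions: ctx was created via llama_init_from_model, and the returned graph is only valid until the next llama_graph_reserve call):

#include "llama-ext.h"

// reserve a worst-case graph up front: a full batch with a single sequence
static bool reserve_full_batch(struct llama_context * ctx, uint32_t n_ctx) {
    struct ggml_cgraph * gf = llama_graph_reserve(ctx, /*n_tokens =*/ n_ctx, /*n_seqs =*/ 1, /*n_outputs =*/ n_ctx);
    return gf != nullptr; // valid only until the next llama_graph_reserve call
}
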
// llama adapter API
int32_t llama_set_adapters_lora(

View File

@ -31,6 +31,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool auto_fa;
bool fused_gdn_ar; // use fused gated delta net (autoregressive)
bool fused_gdn_ch; // use fused gated delta net (chunked)
bool auto_fgdn;
bool no_perf;
bool warmup;
bool op_offload;

View File

@ -0,0 +1,12 @@
#pragma once
#include "llama-context.h"
#include "ggml.h"
#include "stdint.h"
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
struct llama_context * ctx,
uint32_t n_tokens,
uint32_t n_seqs,
uint32_t n_outputs);

View File

@ -601,7 +601,7 @@ const char * llama_grammar_parser::parse_sequence(
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
uint64_t min_times = std::stoull(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
uint64_t max_times = UINT64_MAX; // default: no max limit
@ -614,7 +614,7 @@ const char * llama_grammar_parser::parse_sequence(
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
max_times = std::stoull(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
@ -1160,13 +1160,13 @@ struct llama_grammar * llama_grammar_init_impl(
// if there is a grammar, parse it
// rules will be empty (default) if there are parse errors
if (!parser.parse(grammar_str) || parser.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
LLAMA_LOG_ERROR("failed to parse grammar\n");
return nullptr;
}
// Ensure that there is a "root" node.
if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
// Ensure that the grammar contains the start symbol
if (parser.symbol_ids.find(grammar_root) == parser.symbol_ids.end()) {
LLAMA_LOG_ERROR("grammar does not contain a '%s' symbol\n", grammar_root);
return nullptr;
}
@ -1195,7 +1195,7 @@ struct llama_grammar * llama_grammar_init_impl(
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
return nullptr;
}
}

View File

@ -250,7 +250,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
const bool last = (
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL)) // qwen3 reranking & embedding models use last token
);
for (int i = 0; i < n_tokens; ++i) {
@ -509,6 +509,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
float * data = (float *) cross_kq_mask->data;
for (int i = 0; i < n_tokens; ++i) {
GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
for (int j = 0; j < n_enc; ++j) {
float f = -INFINITY;
@ -848,13 +849,13 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
ubatch (params.ubatch),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
n_rot (hparams.n_rot),
n_rot (hparams.n_rot()),
n_ctx (cparams.n_ctx),
n_head (hparams.n_head()),
n_head_kv (hparams.n_head_kv()),
n_embd_head_k (hparams.n_embd_head_k),
n_embd_head_k (hparams.n_embd_head_k()),
n_embd_k_gqa (hparams.n_embd_k_gqa()),
n_embd_head_v (hparams.n_embd_head_v),
n_embd_head_v (hparams.n_embd_head_v()),
n_embd_v_gqa (hparams.n_embd_v_gqa()),
n_expert (hparams.n_expert),
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
@ -899,7 +900,8 @@ ggml_tensor * llm_graph_context::build_cvec(
ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * w,
ggml_tensor * cur) const {
ggml_tensor * cur,
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
for (const auto & lora : *loras) {
@ -920,6 +922,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
res = ggml_add(ctx0, res, ab_cur);
}
if (w_s) {
res = ggml_mul(ctx0, res, w_s);
}
return res;
}
@ -1161,12 +1167,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in,
ggml_tensor * gate_up_exps) const {
ggml_tensor * gate_up_exps,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
ggml_tensor * down_exps_s) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
@ -1178,12 +1186,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
n_expert_used,
type_op,
norm_w,
scale_w,
w_scale,
gating_op,
il,
probs_in,
gate_up_exps
gate_up_exps,
/* gate_up_exps_b */ nullptr,
up_exps_s,
gate_exps_s,
down_exps_s
);
}
@ -1202,13 +1213,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in,
ggml_tensor * gate_up_exps,
ggml_tensor * gate_up_exps_b) const {
ggml_tensor * gate_up_exps_b,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
ggml_tensor * down_exps_s) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
@ -1330,7 +1343,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
}
if (scale_w) {
if (w_scale != 0.0f && w_scale != 1.0f) {
weights = ggml_scale(ctx0, weights, w_scale);
cb(weights, "ffn_moe_weights_scaled", il);
}
@ -1360,6 +1373,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(gate_up, "ffn_moe_gate_up_biased", il);
}
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
gate_up = ggml_mul(ctx0, gate_up, s);
cb(gate_up, "ffn_moe_gate_up_scaled", il);
}
const int64_t n_ff = gate_up->ne[0] / 2;
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
@ -1375,6 +1397,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(up, "ffn_moe_up_biased", il);
}
// apply per-expert scale2 to up
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
up = ggml_mul(ctx0, up, s);
cb(up, "ffn_moe_up_scaled", il);
}
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
@ -1386,6 +1417,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
// apply per-expert scale2 to gate
if (gate_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
cur = ggml_mul(ctx0, cur, s);
cb(cur, "ffn_moe_gate_scaled", il);
}
}
const bool has_gate = gate_exps || gate_up_exps;
@ -1465,6 +1505,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(experts, "ffn_moe_down_biased", il);
}
// apply per-expert scale2 to down
if (down_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
experts = ggml_mul(ctx0, experts, s);
cb(experts, "ffn_moe_down_scaled", il);
}
if (!weight_before_ffn) {
experts = ggml_mul(ctx0, experts, weights);
cb(cur, "ffn_moe_weighted", il);
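
The per-expert scale applications above (gate_up, up, gate, down) all use the same broadcast pattern: reshape the [n_expert] scale vector, repeat it across tokens, gather the rows of the selected experts, then multiply. A hedged restatement as a standalone helper (hypothetical, same ggml calls as the snippets above):

#include "ggml.h"

// exps_s: [n_expert], cur: [n_ff, n_expert_used, n_tokens], selected_experts: [n_expert_used, n_tokens]
static ggml_tensor * apply_expert_scale(
        ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * exps_s,
        ggml_tensor * selected_experts, int64_t n_expert, int64_t n_tokens) {
    ggml_tensor * s = ggml_reshape_3d(ctx0, exps_s, 1, n_expert, 1);
    s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
    s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
    return ggml_mul(ctx0, cur, s);                // broadcasts over the first dim
}
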
@ -1607,6 +1656,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
// this need to be 1x1xN for broadcasting
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
ggml_set_input(cur);
ggml_set_name(cur, "attn_scale");
res->add_input(std::move(inp));
@ -1616,7 +1666,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
// but this would make the graph topology depend on the number of output tokens, which can interere with
// features that require constant topology such as pipline parallelism
// features that require constant topology such as pipeline parallelism
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
//if (n_outputs < n_tokens) {
// return nullptr;
@ -1779,7 +1829,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
if (v_mla) {
#if 0
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
// However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
// However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
cur = ggml_mul_mat(ctx0, v_mla, cur);
#else
@ -2553,7 +2603,7 @@ void llm_graph_context::build_pooling(
}
// softmax for qwen3 reranker
if (arch == LLM_ARCH_QWEN3) {
if (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL) {
cur = ggml_soft_max(ctx0, cur);
}
} break;

View File

@ -764,10 +764,11 @@ struct llm_graph_context {
ggml_tensor * cur,
int il) const;
// do mat_mul, while optionally apply lora
// do mat_mul, while optionally apply lora and per-tensor scale
ggml_tensor * build_lora_mm(
ggml_tensor * w,
ggml_tensor * cur) const;
ggml_tensor * cur,
ggml_tensor * w_s = nullptr) const;
// do mat_mul_id, while optionally apply lora
ggml_tensor * build_lora_mm_id(
@ -810,12 +811,14 @@ struct llm_graph_context {
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr,
ggml_tensor * gate_up_exps = nullptr) const;
ggml_tensor * gate_up_exps = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
ggml_tensor * down_exps_s = nullptr) const;
ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
@ -832,13 +835,15 @@ struct llm_graph_context {
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llama_expert_gating_func_type gating_op,
int il,
ggml_tensor * probs_in = nullptr,
ggml_tensor * gate_up_exps = nullptr,
ggml_tensor * gate_up_exps_b = nullptr) const;
ggml_tensor * gate_up_exps_b = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
ggml_tensor * down_exps_s = nullptr) const;
//
// inputs

View File

@ -62,6 +62,14 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
return n_head/n_head_kv;
}
uint32_t llama_hparams::n_rot(uint32_t il) const {
if (il < n_layer) {
return is_swa(il) ? n_rot_swa : n_rot_full;
}
GGML_ABORT("fatal error");
}
uint32_t llama_hparams::n_embd_inp() const {
uint32_t n_embd_inp = n_embd;
@ -76,16 +84,32 @@ uint32_t llama_hparams::n_embd_out() const {
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}
uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
if (il < n_layer) {
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
}
GGML_ABORT("fatal error");
}
uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
if (il < n_layer) {
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
}
GGML_ABORT("fatal error");
}
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
return n_embd_head_k * n_head_kv;
return n_embd_head_k(il) * n_head_kv;
}
uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
const uint32_t n_head_kv = this->n_head_kv(il);
return n_embd_head_v * n_head_kv;
return n_embd_head_v(il) * n_head_kv;
}
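
Since the head dimensions are now resolved per layer, code that sizes K/V storage has to pass the layer index through. A small sketch of computing the per-layer K row size (hypothetical helper, assuming ggml.h and llama-hparams.h):

#include "ggml.h"
#include "llama-hparams.h"

// hypothetical helper: bytes of one K row for layer il,
// where n_embd_k_gqa(il) == n_embd_head_k(il) * n_head_kv(il)
static size_t k_row_bytes(const llama_hparams & hparams, ggml_type type_k, uint32_t il) {
    return ggml_row_size(type_k, hparams.n_embd_k_gqa(il));
}
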
bool llama_hparams::is_n_embd_k_gqa_variable() const {
@ -197,11 +221,11 @@ bool llama_hparams::is_mla() const {
}
uint32_t llama_hparams::n_embd_head_k_mla() const {
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
}
uint32_t llama_hparams::n_embd_head_v_mla() const {
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
}
bool llama_hparams::has_kv(uint32_t il) const {

View File

@ -44,13 +44,20 @@ struct llama_hparams {
uint32_t n_embd;
uint32_t n_layer;
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
uint32_t n_rot;
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0;
// different head size for full_attention and SWA layers
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
uint32_t n_embd_head_k_swa;
uint32_t n_embd_head_v_swa;
// different RoPE dimensions for full_attention and SWA layers
uint32_t n_rot_full;
uint32_t n_rot_swa;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla_impl = 0;
uint32_t n_embd_head_v_mla_impl = 0;
@ -82,6 +89,7 @@ struct llama_hparams {
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;
float f_norm_eps;
@ -247,12 +255,18 @@ struct llama_hparams {
uint32_t n_gqa(uint32_t il = 0) const;
uint32_t n_rot(uint32_t il = 0) const;
// dimension of main + auxiliary input embeddings
uint32_t n_embd_inp() const;
// dimension of output embeddings
uint32_t n_embd_out() const;
// dimension of key/value embeddings for each head (per layer)
uint32_t n_embd_head_k(uint32_t il = 0) const;
uint32_t n_embd_head_v(uint32_t il = 0) const;
// dimension of key embeddings across all k-v heads
uint32_t n_embd_k_gqa(uint32_t il = 0) const;

View File

@ -100,9 +100,9 @@ std::string format(const char * fmt, ...) {
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
char buf[256];
snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
for (size_t i = 1; i < ne.size(); i++) {
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
}
return buf;
}

View File

@ -70,4 +70,6 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t);
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
#define LLAMA_TENSOR_NAME_FGDN_AR "__fgdn_ar__"
#define LLAMA_TENSOR_NAME_FGDN_CH "__fgdn_ch__"

View File

@ -583,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
break;
}
// remeber the position that we found
// remember the position that we found
res.push_back(sinfo_new);
// store the old state of the cells in the recovery stack
@ -1033,8 +1033,8 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
return ggml_view_4d(ctx, k,
hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
ggml_row_size(k->type, hparams.n_embd_head_k),
hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
ggml_row_size(k->type, hparams.n_embd_head_k(il)),
ggml_row_size(k->type, n_embd_k_gqa),
ggml_row_size(k->type, n_embd_k_gqa*kv_size),
ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
@ -1056,8 +1056,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
if (!v_trans) {
// note: v->nb[1] <= v->nb[2]
return ggml_view_4d(ctx, v,
hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
@ -1065,8 +1065,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
// note: v->nb[1] > v->nb[2]
return ggml_view_4d(ctx, v,
n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
ggml_row_size(v->type, kv_size), // v->nb[2]
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
@ -1293,7 +1293,7 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
}
for (uint32_t s = 0; s < n_stream; ++s) {
// bookeeping of the KQ mask cells that could change for other tokens of the same sequence
// bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
@ -1544,7 +1544,8 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale) const {
float freq_scale,
uint32_t il) const {
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@ -1552,7 +1553,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
const auto & n_rot = hparams.n_rot;
const auto & n_rot = hparams.n_rot(il);
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
@ -1606,13 +1607,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();
const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
const auto & n_rot = hparams.n_rot;
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
@ -1626,6 +1620,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
const int64_t n_head_kv = hparams.n_head_kv(il);
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
const auto n_rot = hparams.n_rot(il);
const auto n_embd_head_k = hparams.n_embd_head_k(il);
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
const float freq_base_l = model.get_rope_freq_base (cparams, il);
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
@ -1638,7 +1636,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
ggml_row_size(layer.k->type, n_embd_k_gqa),
ggml_row_size(layer.k->type, n_embd_nope));
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
ggml_build_forward_expand(gf, cur);
}
@ -1760,8 +1758,10 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t
io.write(&pos, sizeof(pos));
io.write(&n_seq_id, sizeof(n_seq_id));
// TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
if (hparams.n_pos_per_embd() > 1) {
const llama_kv_cell_ext ext = cells.ext_get(i);
io.write(&ext, sizeof(ext));
}
for (const auto & seq_id : seq_ids) {
io.write(&seq_id, sizeof(seq_id));
@ -1895,6 +1895,14 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
return false;
}
if (hparams.n_pos_per_embd() > 1) {
llama_kv_cell_ext ext;
io.read_to(&ext, sizeof(ext));
ubatch.pos[i + ubatch.n_tokens] = ext.y;
ubatch.pos[i + ubatch.n_tokens*2] = ext.x;
}
// read the sequence id, but directly discard it - we will use dest_seq_id instead
{
llama_seq_id seq_id;
@ -1945,6 +1953,12 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
cells.pos_set(i, pos);
if (hparams.n_pos_per_embd() > 1) {
llama_kv_cell_ext ext;
io.read_to(&ext, sizeof(ext));
cells.ext_set(i, ext);
}
for (uint32_t j = 0; j < n_seq_id; ++j) {
llama_seq_id seq_id;
io.read_to(&seq_id, sizeof(seq_id));

View File

@ -264,7 +264,8 @@ private:
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale) const;
float freq_scale,
uint32_t il) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,

View File

@ -1,12 +1,17 @@
#include "llama-model-loader.h"
#include "ggml-alloc.h"
#include "ggml.h"
#include "gguf.h"
#include "llama-hparams.h"
#include <algorithm>
#include <array>
#include <cinttypes>
#include <cstdint>
#include <cstring>
#include <future>
#include <regex>
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
@ -37,6 +42,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
@ -263,7 +269,7 @@ namespace GGUFMeta {
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
@ -273,7 +279,7 @@ namespace GGUFMeta {
}
struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
result = arr_info.length;
@ -290,7 +296,7 @@ namespace GGUFMeta {
template<typename T>
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
const gguf_context * ctx = meta.get();
const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
@ -331,7 +337,7 @@ namespace GGUFMeta {
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const gguf_context * ctx = meta.get();
const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
@ -393,7 +399,7 @@ namespace GGUFMeta {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;
const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
@ -427,7 +433,7 @@ namespace GGUFMeta {
// get array of n <= N_MAX elements, or a single element repeated n times
template<typename T, size_t N_MAX>
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
@ -440,9 +446,9 @@ namespace GGUFMeta {
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
}
if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
struct GGUFMeta::ArrayInfo arr_info =
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
if (n != arr_info.length) {
throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
@ -473,7 +479,7 @@ namespace GGUFMeta {
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
const int id = gguf_find_key(meta.get(), key.c_str());
const int id = gguf_find_key(metadata, key.c_str());
if (id < 0) {
if (required) {
@ -483,7 +489,7 @@ namespace GGUFMeta {
}
// throw and error if type is an array
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
@ -500,6 +506,9 @@ namespace GGUFMeta {
llama_model_loader::llama_model_loader(
struct gguf_context * meta,
llama_model_set_tensor_data_t set_tensor_data,
void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
@ -507,7 +516,8 @@ llama_model_loader::llama_model_loader(
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
: metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
@ -521,136 +531,142 @@ llama_model_loader::llama_model_loader(
tensor_buft_overrides = param_tensor_buft_overrides_p;
// Load the main GGUF
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
if (!fname.empty()) {
// Load the main GGUF
struct ggml_context * ctx = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
meta.reset(gguf_init_from_file(fname.c_str(), params));
if (!meta) {
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
}
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);
if (use_mmap && use_direct_io) {
if (files.back()->has_direct_io()) {
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
use_mmap = false;
} else {
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
use_direct_io = false;
// reopen file using std::fopen for mmap
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
}
}
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
}
n_elements += ggml_nelements(cur);
n_bytes += ggml_nbytes(cur);
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
}
uint16_t n_split = 0;
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
// Load additional GGML contexts
if (n_split > 1) {
// make sure the main file is loaded first
uint16_t idx = 0;
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
get_key(kv_split_no, idx);
if (idx != 0) {
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
metadata = metadata_ptr.get();
if (metadata == nullptr) {
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
}
// generate list of splits if needed
if (splits.empty()) {
splits = llama_get_list_splits(fname, idx, n_split);
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);
if (use_mmap && use_direct_io) {
if (files.back()->has_direct_io()) {
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
use_mmap = false;
} else {
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
use_direct_io = false;
// reopen file using std::fopen for mmap
files.pop_back();
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
}
}
// in case user give a custom list of splits, check if it matches the expected number
if (n_split != (uint16_t)splits.size()) {
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
}
n_elements += ggml_nelements(cur);
n_bytes += ggml_nbytes(cur);
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
}
uint16_t n_split = 0;
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
if (trace > 0) {
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
}
// load other splits
for (idx = 1; idx < n_split; idx++) {
const char * fname_split = splits[idx].c_str();
struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
// Load additional GGML contexts
if (n_split > 1) {
// make sure the main file is loaded first
uint16_t idx = 0;
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
get_key(kv_split_no, idx);
if (idx != 0) {
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
}
// check idx
// generate list of splits if needed
if (splits.empty()) {
splits = llama_get_list_splits(fname, idx, n_split);
}
// in case user give a custom list of splits, check if it matches the expected number
if (n_split != (uint16_t)splits.size()) {
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
}
if (trace > 0) {
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
}
// load other splits
for (idx = 1; idx < n_split; idx++) {
const char * fname_split = splits[idx].c_str();
struct gguf_init_params split_params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx,
};
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
}
// check idx
{
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
if (kid < 0) {
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
}
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
if (idx_gguf != idx) {
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
}
}
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
}
n_elements += ggml_nelements(cur);
n_bytes += ggml_nbytes(cur);
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
}
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
// sanity check
{
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
if (kid < 0) {
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
}
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
if (idx_gguf != idx) {
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
const int n_tensors_loaded = (int) weights_map.size();
if (n_tensors != n_tensors_loaded) {
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
}
}
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string tensor_name = std::string(cur->name);
// make sure there is no duplicated tensor names
if (weights_map.find(tensor_name) != weights_map.end()) {
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
}
n_elements += ggml_nelements(cur);
n_bytes += ggml_nbytes(cur);
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
}
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
// sanity check
{
const int n_tensors_loaded = (int) weights_map.size();
if (n_tensors != n_tensors_loaded) {
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
}
}
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
} else {
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
}
n_kv = gguf_get_n_kv(meta.get());
n_kv = gguf_get_n_kv(metadata);
n_tensors = weights_map.size();
fver = (enum llama_fver) gguf_get_version(meta.get());
fver = (enum llama_fver) gguf_get_version(metadata);
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
@ -709,6 +725,7 @@ llama_model_loader::llama_model_loader(
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@ -729,14 +746,14 @@ llama_model_loader::llama_model_loader(
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(meta.get(), i);
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
const char * name = gguf_get_key(metadata, i);
const enum gguf_type type = gguf_get_kv_type(metadata, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(meta.get(), i);
std::string value = gguf_kv_to_str(metadata, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
@ -838,15 +855,382 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
return cur;
}
struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
GGML_ASSERT(w != nullptr);
if (op == GGML_OP_NONE) {
return true;
}
ggml_init_params params = {
/*.mem_size =*/ ggml_tensor_overhead()*8,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context_ptr ctx_ptr { ggml_init(params) };
if (!ctx_ptr) {
throw std::runtime_error(format("failed to create ggml context"));
}
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * op_tensor = nullptr;
switch (op) {
case GGML_OP_GET_ROWS:
{
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
op_tensor = ggml_get_rows(ctx, w, b);
} break;
case GGML_OP_MUL_MAT:
{
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
op_tensor = ggml_mul_mat(ctx, w, b);
} break;
case GGML_OP_MUL_MAT_ID:
{
const int n_expert_used = hparams.n_expert_used;
GGML_ASSERT(n_expert_used > 0);
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
} break;
case GGML_OP_ADD:
{
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_add(ctx, a, w);
} break;
case GGML_OP_ADD_ID:
{
const int n_expert_used = hparams.n_expert_used;
GGML_ASSERT(n_expert_used > 0);
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
op_tensor = ggml_add_id(ctx, a, w, c);
} break;
case GGML_OP_MUL:
{
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
op_tensor = ggml_mul(ctx, a, w);
} break;
case GGML_OP_DIV:
{
ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
op_tensor = ggml_div(ctx, a, w);
} break;
case GGML_OP_ROPE:
{
const int n_embd_head = hparams.n_embd_head_v();
const int n_head = hparams.n_head();
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
op_tensor = ggml_rope_ext(
ctx, a, b, w,
0, 0, 0, 0, 0,
0, 0, 0, 0
);
} break;
case GGML_OP_SSM_CONV:
{
const int64_t n_seq_tokens = 512;
const int64_t n_seqs = 3;
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
} break;
case GGML_OP_SSM_SCAN:
{
// w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
const int64_t n_head = w->ne[1];
const int64_t head_dim = hparams.ssm_d_inner / n_head;
const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
const int64_t n_seq_tokens = 512;
const int64_t n_seqs = 3;
ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
} break;
case GGML_OP_RWKV_WKV6:
{
// FIXME
const int64_t S = 123;
const int64_t H = 123;
const int64_t n_tokens = 123;
const int64_t n_seqs = 123;
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * tf = w;
ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
} break;
case GGML_OP_IM2COL:
{
const int n_embd_inp = hparams.n_embd_inp();
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
} break;
case GGML_OP_SCALE:
{
op_tensor = ggml_scale(ctx, w, 1.0f);
} break;
default:
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
}
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
GGML_ASSERT(w->buffer == nullptr);
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
ggml_backend_buffer_free(w->buffer);
w->buffer = nullptr;
return op_supported;
}
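The helper above probes op support by building a throwaway graph node over the weight in a metadata-only context and attaching a zero-size buffer of the candidate type before asking the device. A minimal sketch of the same pattern in isolation (the helper name, shapes and weight type are illustrative, not part of the loader):
// sketch: can this device run a mat-mul with weight type `wtype` on buffer type `buft`?
static bool probe_mul_mat_support(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft,
                                  ggml_type wtype, int64_t n_in, int64_t n_out) {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only, no tensor data is allocated
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return false;
    }
    ggml_tensor * w         = ggml_new_tensor_2d(ctx, wtype, n_in, n_out);
    ggml_tensor * b         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, 512);
    ggml_tensor * op_tensor = ggml_mul_mat(ctx, w, b);
    // temporary dummy buffer so that supports_op can inspect the weight's buffer type
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    const bool ok = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;
    ggml_free(ctx);
    return ok;
}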
// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
GGML_ASSERT(!buft_list->empty());
for (const auto & cur : *buft_list) {
ggml_backend_dev_t cur_dev = cur.first;
ggml_backend_buffer_type_t cur_buft = cur.second;
if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
return cur_buft;
}
}
return nullptr;
}
struct ggml_tensor * llama_model_loader::create_tensor(
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
// one ggml context per buffer type
int max_n_tensors = n_tensors;
max_n_tensors += 1; // duplicated output tensor
max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
if (files.empty()) {
max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
}
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
throw std::runtime_error(format("failed to create ggml context"));
}
ctx_map.emplace(buft, ctx);
return ctx;
}
return it->second.get();
};
auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
if (!t_meta) {
if (flags & TENSOR_NOT_REQUIRED) {
return nullptr;
}
throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
}
// some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
// the tensor is duplicated
// to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
llm_tensor tn_tensor = tn.tensor;
if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
tn_tensor = LLM_TENSOR_OUTPUT;
}
llm_tensor_info info;
try {
info = llm_tensor_info_for(tn_tensor);
} catch (const std::out_of_range & e) {
throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
}
// skip unused tensors
if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
const size_t nbytes = ggml_nbytes(t_meta);
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
size_data -= nbytes;
n_created++;
return nullptr;
}
// tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
ggml_op op;
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
if (bias) {
if (info.op == GGML_OP_MUL_MAT_ID) {
op = GGML_OP_ADD_ID;
} else {
op = GGML_OP_ADD;
}
} else {
op = info.op;
}
// sanity checks
if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
if (tn.bid != -1) {
GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
}
} else {
if (tn.bid == -1) {
GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
}
}
// select the buffer type for this tensor
const buft_list_t * buft_list;
switch (info.layer) {
case LLM_TENSOR_LAYER_INPUT:
buft_list = buft_list_input;
break;
case LLM_TENSOR_LAYER_OUTPUT:
buft_list = buft_list_output;
break;
case LLM_TENSOR_LAYER_REPEATING:
GGML_ASSERT(buft_list_layer != nullptr);
buft_list = buft_list_layer;
break;
default:
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
}
ggml_backend_buffer_type_t buft = nullptr;
// check overrides
if (tensor_buft_overrides) {
std::string tensor_name = tn.str();
for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
std::regex pattern(overrides->pattern);
if (std::regex_search(tensor_name, pattern)) {
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
// when overriding to a CPU buffer, consider the extra buffer types
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
} else {
buft = overrides->buft;
}
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
tensor_name.c_str(),
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
ggml_backend_buft_name(buft));
break;
}
}
}
if (!buft) {
buft = select_weight_buft(hparams, t_meta, op, buft_list);
if (!buft) {
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
}
}
// avoid using a host buffer when using mmap
auto * buft_dev = ggml_backend_buft_get_device(buft);
if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
throw std::runtime_error("no CPU backend found");
}
buft = ggml_backend_dev_buffer_type(cpu_dev);
}
if (buft != buft_list->front().second) {
if (n_tensors_moved == 0) {
first_tensor_moved_name = t_meta->name;
first_tensor_moved_type_name = ggml_type_name(t_meta->type);
first_moved_from_buft = buft_list->front().second;
first_moved_to_buft = buft;
}
n_tensors_moved++;
}
return buft;
};
if (files.empty()) {
if (flags & TENSOR_SKIP_IF_VIRTUAL) {
return nullptr;
}
ggml_type type = GGML_TYPE_F32;
const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
if (tid != -1) {
type = gguf_get_tensor_type(metadata, tid);
}
// for tensors that are not required, some of the dimensions can be invalid:
if (flags & TENSOR_NOT_REQUIRED) {
for (size_t dim = 0; dim < ne.size(); dim++) {
if (ne.begin()[dim] <= 0) {
return nullptr;
}
}
}
ggml_tensor t_meta;
memset(&t_meta, 0, sizeof(ggml_tensor));
t_meta.type = type;
for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
GGML_ASSERT(t_meta.ne[dim] >= 1);
t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
GGML_ASSERT(t_meta.nb[dim] >= 1);
}
ggml_set_name(&t_meta, tn.str().c_str());
ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
GGML_ASSERT(buft != nullptr);
ggml_context * ctx = ctx_for_buft(buft);
ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
ggml_set_name(ret, tn.str().c_str());
return ret;
}
ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
if (buft == nullptr) {
return nullptr; // return type is ggml_tensor *
}
ggml_context * ctx = ctx_for_buft(buft);
// if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
if (flags & TENSOR_DUPLICATED) {
ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
if (t) {
return t;
}
}
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
if (cur == NULL) {
return NULL;
}
bool duplicated = flags & TENSOR_DUPLICATED;
const bool duplicated = flags & TENSOR_DUPLICATED;
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
@ -858,7 +1242,6 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
}
return tensor;
}
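The tensor_buft_overrides check inside create_tensor matches the full tensor name against user-supplied regex patterns and, on a match, forces the given buffer type. A hedged sketch of how a caller could set up such an override table (the pattern and the idea of routing per-expert FFN weights to the CPU buffer type are illustrative only):
// route the per-expert FFN weights to the CPU buffer type; all other tensors keep the
// default selection; the list is terminated by a null pattern
static const llama_model_tensor_buft_override tensor_overrides[] = {
    { /*.pattern =*/ "ffn_(up|down|gate)_exps", /*.buft =*/ ggml_backend_cpu_buffer_type() },
    { /*.pattern =*/ nullptr,                   /*.buft =*/ nullptr },
};
// llama_model_params mparams = llama_model_default_params();
// mparams.tensor_buft_overrides = tensor_overrides;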
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
@ -893,6 +1276,11 @@ void llama_model_loader::done_getting_tensors() const {
if (n_created != n_tensors) {
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
}
if (n_tensors_moved > 0) {
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
__func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
}
}
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
@ -974,6 +1362,12 @@ bool llama_model_loader::load_all_data(
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
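// virtual model (no GGUF files on disk, e.g. created through llama_model_init_from_user):
// there is nothing to read, so let the user-provided callback initialize every tensor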
if (files.empty()) {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
set_tensor_data(t, set_tensor_data_ud);
}
return true;
}
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
std::vector<no_init<uint8_t>> read_buf;

View File

@ -4,17 +4,22 @@
#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-mmap.h"
#include "ggml-cpp.h"
#include <cstddef>
#include <cstring>
#include <map>
#include <stdexcept>
#include <unordered_map>
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
@ -58,9 +63,10 @@ struct llama_model_loader {
}
};
static const int TENSOR_NOT_REQUIRED = 1 << 0;
static const int TENSOR_DUPLICATED = 1 << 1;
static const int TENSOR_SKIP = 1 << 2;
static const int TENSOR_NOT_REQUIRED = 1 << 0;
static const int TENSOR_DUPLICATED = 1 << 1;
static const int TENSOR_SKIP = 1 << 2;
static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
int n_kv = 0;
int n_tensors = 0;
@ -84,7 +90,10 @@ struct llama_model_loader {
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
const llama_model_tensor_buft_override * tensor_buft_overrides;
gguf_context_ptr meta;
gguf_context_ptr metadata_ptr;
struct gguf_context * metadata; // either metadata_ptr.get() or externally set
llama_model_set_tensor_data_t set_tensor_data;
void * set_tensor_data_ud;
std::vector<ggml_context_ptr> contexts;
std::string arch_name;
@ -94,7 +103,26 @@ struct llama_model_loader {
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
struct ggml_backend_buft_comparator {
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
}
};
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
// track tensors that had to be moved for debugging:
size_t n_tensors_moved = 0;
std::string first_tensor_moved_name;
std::string first_tensor_moved_type_name;
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
llama_model_loader(
struct gguf_context * metadata,
llama_model_set_tensor_data_t set_tensor_data,
void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
@ -149,7 +177,9 @@ struct llama_model_loader {
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
struct ggml_tensor * create_tensor(
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);

View File

@ -7,14 +7,19 @@
#include "llama-model.h"
#include "llama-vocab.h"
#include <cstdint>
#include <string>
llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
gguf_ctx = gguf_init_empty();
}
llama_model_saver::llama_model_saver(const struct llama_model * model) :
gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
llama_model_saver::~llama_model_saver() {
gguf_free(gguf_ctx);
if (gguf_ctx_owned) {
gguf_free(gguf_ctx);
}
}
void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
@ -46,7 +51,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
template <typename Container>
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
GGML_ASSERT(model != nullptr || !per_layer);
const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
GGML_ASSERT(n_values <= value.size());
if (n_values == 0) {
@ -83,6 +89,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c
GGML_ABORT("fatal error");
}
}
// instantiate for external usage:
template void llama_model_saver::add_kv<std::vector<uint32_t>>(const enum llm_kv, const std::vector<uint32_t> &, const bool);
void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
std::vector<const char *> tmp(value.size());
@ -104,37 +112,39 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
}
void llama_model_saver::add_kv_from_model() {
const llama_hparams & hparams = model.hparams;
const llama_vocab & vocab = model.vocab;
const llama_hparams & hparams = model->hparams;
const llama_vocab & vocab = model->vocab;
const int32_t n_vocab = vocab.n_tokens();
std::vector<std::string> tokens(n_vocab);
std::vector<float> scores(n_vocab);
std::vector<int32_t> token_types(n_vocab);
for (int32_t id = 0; id < n_vocab; ++id) {
const llama_vocab::token_data & token_data = vocab.get_token_data(id);
if (vocab.get_type() != LLAMA_VOCAB_TYPE_NONE) {
for (int32_t id = 0; id < n_vocab; ++id) {
const llama_vocab::token_data & token_data = vocab.get_token_data(id);
tokens[id] = token_data.text;
scores[id] = token_data.score;
tokens[id] = token_data.text;
scores[id] = token_data.score;
switch(token_data.attr) {
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
case LLAMA_TOKEN_ATTR_UNDEFINED:
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
switch(token_data.attr) {
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
case LLAMA_TOKEN_ATTR_UNDEFINED:
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
}
}
}
// add_kv(LLM_KV_GENERAL_TYPE, ???);
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
add_kv(LLM_KV_GENERAL_NAME, model.name);
add_kv(LLM_KV_GENERAL_NAME, model->name);
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
// add_kv(LLM_KV_GENERAL_VERSION, ???);
// add_kv(LLM_KV_GENERAL_URL, ???);
@ -176,8 +186,10 @@ void llama_model_saver::add_kv_from_model() {
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
@ -189,7 +201,8 @@ void llama_model_saver::add_kv_from_model() {
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
@ -255,25 +268,25 @@ void llama_model_saver::add_kv_from_model() {
}
void llama_model_saver::add_tensors_from_model() {
if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
}
add_tensor(model.type_embd);
add_tensor(model.pos_embd);
add_tensor(model.tok_norm);
add_tensor(model.tok_norm_b);
add_tensor(model.output_norm);
add_tensor(model.output_norm_b);
add_tensor(model.output);
add_tensor(model.output_b);
add_tensor(model.output_norm_enc);
add_tensor(model.cls);
add_tensor(model.cls_b);
add_tensor(model.cls_out);
add_tensor(model.cls_out_b);
add_tensor(model.cls_norm);
add_tensor(model->type_embd);
add_tensor(model->pos_embd);
add_tensor(model->tok_norm);
add_tensor(model->tok_norm_b);
add_tensor(model->output_norm);
add_tensor(model->output_norm_b);
add_tensor(model->output);
add_tensor(model->output_b);
add_tensor(model->output_norm_enc);
add_tensor(model->cls);
add_tensor(model->cls_b);
add_tensor(model->cls_out);
add_tensor(model->cls_out_b);
add_tensor(model->cls_norm);
for (const struct llama_layer & layer : model.layers) {
for (const struct llama_layer & layer : model->layers) {
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
}

View File

@ -1,5 +1,6 @@
#pragma once
#include "gguf.h"
#include "llama.h"
#include "llama-arch.h"
@ -7,10 +8,12 @@
struct llama_model_saver {
struct gguf_context * gguf_ctx = nullptr;
const struct llama_model & model;
const bool gguf_ctx_owned;
const struct llama_model * model;
const struct LLM_KV llm_kv;
llama_model_saver(const struct llama_model & model);
llama_model_saver(const struct llama_model * model);
llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx);
~llama_model_saver();
void add_kv(enum llm_kv key, uint32_t value);

File diff suppressed because it is too large

View File

@ -54,6 +54,7 @@ enum llm_type {
LLM_TYPE_0_3B,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
LLM_TYPE_0_8B,
LLM_TYPE_1B,
LLM_TYPE_1_2B,
LLM_TYPE_1_3B,
@ -125,12 +126,15 @@ enum llm_type {
LLM_TYPE_100B_A6B,
LLM_TYPE_102B_A12B, // Solar-Open
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_120B_A12B, // Nemotron 3 Super
LLM_TYPE_122B_A10B, // Qwen3.5
LLM_TYPE_196B_A11B, // Step3.5-Flash
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_310B_A15B, // MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_397B_A17B, // Qwen3.5
LLM_TYPE_744B_A40B, // GLM-5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
@ -291,6 +295,15 @@ struct llama_layer {
struct ggml_tensor * ffn_up_exps_b = nullptr;
struct ggml_tensor * ffn_gate_up_exps_b = nullptr;
// ff MoE per-expert scales (NVFP4 per-tensor scale2)
struct ggml_tensor * ffn_gate_exps_s = nullptr;
struct ggml_tensor * ffn_down_exps_s = nullptr;
struct ggml_tensor * ffn_up_exps_s = nullptr;
// ff MoE latent proj
struct ggml_tensor * ffn_latent_down = nullptr;
struct ggml_tensor * ffn_latent_up = nullptr;
// ff shared expert (shexp)
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
struct ggml_tensor * ffn_gate_shexp = nullptr;
@ -384,13 +397,21 @@ struct llama_layer {
struct ggml_tensor * rope_freqs = nullptr;
// bitnet scale
struct ggml_tensor * wq_scale = nullptr;
struct ggml_tensor * wk_scale = nullptr;
struct ggml_tensor * wv_scale = nullptr;
struct ggml_tensor * wo_scale = nullptr;
struct ggml_tensor * ffn_gate_scale = nullptr;
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;
struct ggml_tensor * wq_s = nullptr;
struct ggml_tensor * wk_s = nullptr;
struct ggml_tensor * wv_s = nullptr;
struct ggml_tensor * wo_s = nullptr;
struct ggml_tensor * wqkv_s = nullptr;
struct ggml_tensor * wqkv_gate_s = nullptr;
struct ggml_tensor * ffn_gate_s = nullptr;
struct ggml_tensor * ffn_up_s = nullptr;
struct ggml_tensor * ffn_down_s = nullptr;
struct ggml_tensor * ffn_gate_shexp_s = nullptr;
struct ggml_tensor * ffn_up_shexp_s = nullptr;
struct ggml_tensor * ffn_down_shexp_s = nullptr;
struct ggml_tensor * ssm_out_s = nullptr;
struct ggml_tensor * ssm_alpha_s = nullptr;
struct ggml_tensor * ssm_beta_s = nullptr;
// altup & laurel
struct ggml_tensor * per_layer_inp_gate = nullptr;

File diff suppressed because it is too large

View File

@ -1719,7 +1719,7 @@ private:
};
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
struct gguf_context * ctx = ml.meta.get();
struct gguf_context * ctx = ml.metadata;
// determine vocab type
{
@ -1833,7 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// correct endiannes of data in precompiled_charsmap binary blob
// correct endianness of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);

View File

@ -1,5 +1,6 @@
#include "llama.h"
#include "ggml-cpp.h"
#include "llama-impl.h"
#include "llama-chat.h"
@ -12,6 +13,7 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "gguf.h"
#include <algorithm>
#include <cassert>
@ -825,7 +827,8 @@ int64_t llama_time_us(void) {
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = 0;
@ -834,7 +837,8 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;
try {
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();
@ -880,9 +884,13 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
}
static struct llama_model * llama_model_load_from_file_impl(
struct gguf_context * metadata,
llama_model_set_tensor_data_t set_tensor_data,
void * set_tensor_data_ud,
const std::string & path_model,
std::vector<std::string> & splits,
struct llama_model_params params) {
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
ggml_time_init();
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
@ -1003,7 +1011,7 @@ static struct llama_model * llama_model_load_from_file_impl(
props.memory_free/1024/1024);
}
const int status = llama_model_load(path_model, splits, *model, params);
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
@ -1019,6 +1027,18 @@ static struct llama_model * llama_model_load_from_file_impl(
return model;
}
struct llama_model * llama_model_init_from_user(
struct gguf_context * metadata,
llama_model_set_tensor_data_t set_tensor_data,
void * set_tensor_data_ud,
struct llama_model_params params) {
GGML_ASSERT(metadata != nullptr);
std::string path_model;
std::vector<std::string> splits = {};
params.use_mmap = false;
params.use_extra_bufts = false;
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
}
// deprecated
struct llama_model * llama_load_model_from_file(
const char * path_model,
@ -1030,7 +1050,7 @@ struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params) {
std::vector<std::string> splits = {};
return llama_model_load_from_file_impl(path_model, splits, params);
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
}
struct llama_model * llama_model_load_from_splits(
@ -1046,11 +1066,11 @@ struct llama_model * llama_model_load_from_splits(
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
}
return llama_model_load_from_file_impl(splits.front(), splits, params);
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
}
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
llama_model_saver ms(*model);
llama_model_saver ms(model);
ms.add_kv_from_model();
ms.add_tensors_from_model();
ms.save(path_model);

View File

@ -5,6 +5,7 @@
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include "ggml-opt.h"
#include "gguf.h"
#include <stddef.h>
#include <stdint.h>
@ -152,6 +153,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@ -440,19 +442,30 @@ extern "C" {
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
// Create a new model from GGUF metadata as well as a function to set the tensor data
// - tensors are created as GGML_TYPE_F32 by default,
// override by adding a tensor with the same name but a different type to the context
LLAMA_API struct llama_model * llama_model_init_from_user(
struct gguf_context * metadata,
llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
void * set_tensor_data_ud, // userdata for function
struct llama_model_params params);
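A minimal usage sketch for this entry point, assuming the caller has already filled the GGUF context with the architecture, hparams and tensor metadata the target model requires (the callback name, the zero-fill strategy and the single metadata key shown here are illustrative only):
// #include "llama.h"
// #include <cstdint>
// #include <vector>
//
// hypothetical callback: fill every tensor the loader creates with zeros;
// a real integration would copy externally computed weights here instead
static void my_set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
    (void) userdata;
    std::vector<uint8_t> zeros(ggml_nbytes(tensor), 0);
    ggml_backend_tensor_set(tensor, zeros.data(), 0, zeros.size());
}

int main() {
    struct gguf_context * meta = gguf_init_empty();
    gguf_set_val_str(meta, "general.architecture", "llama"); // plus all other keys and tensors the arch requires
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_init_from_user(meta, my_set_tensor_data, /*userdata=*/nullptr, mparams);
    if (model) {
        // ... use the model ...
        llama_model_free(model);
    }
    return 0;
}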
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params),
"use llama_model_load_from_file instead");
// Load the model from a file
// Load a model from a file
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
// If the split file name does not follow this pattern, use llama_model_load_from_splits
LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);
// Load the model from multiple splits (support custom naming scheme)
// Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
const char ** paths,
@ -973,7 +986,7 @@ extern "C" {
// Logits for the ith token. For positive indices, Equivalent to:
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
// Negative indicies can be used to access logits in reverse order, -1 is the last logit.
// Negative indices can be used to access logits in reverse order, -1 is the last logit.
// returns NULL for invalid ids.
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
@ -988,7 +1001,7 @@ extern "C" {
// Get the embeddings for the ith token. For positive indices, Equivalent to:
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
// Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
// Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
// shape: [n_embd] (1-dimensional)
// returns NULL for invalid ids.
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@ -1008,9 +1021,9 @@ extern "C" {
// Returns LLAMA_TOKEN_NULL if no token was sampled.
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
// Get the backend sampled probabilites for the ith token
// Get the backend sampled probabilities for the ith token
// The index matches llama_get_sampled_token_ith().
// Returns NULL if no probabilites were generated.
// Returns NULL if no probabilities were generated.
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
@ -1337,7 +1350,7 @@ extern "C" {
float tau,
float eta);
/// @details Intializes a GBNF grammar, see grammars/README.md for details.
/// @details Initializes a GBNF grammar, see grammars/README.md for details.
/// @param vocab The vocabulary that this grammar will be used with.
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
/// @param grammar_root The name of the start symbol for the grammar.

View File

@ -1,8 +1,8 @@
#include "models.h"
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
@ -127,7 +127,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
n_expert, n_expert_used,
LLM_FFN_SILU,
hparams.expert_weights_norm, // norm_w (route_norm=True)
hparams.expert_weights_scale, // scale_w
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);

View File

@ -3,10 +3,10 @@
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -2,10 +2,10 @@
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -1,11 +1,10 @@
#include "models.h"
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -104,7 +103,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);

View File

@ -2,10 +2,10 @@
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -56,6 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
);
break;
case LLM_TYPE_13B:
case LLM_TYPE_UNKNOWN:
break;
default:
GGML_ABORT("fatal error");

View File

@ -1,6 +1,5 @@
#include "models.h"
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur;
ggml_tensor * inpL;
@ -97,7 +96,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
false, hparams.expert_weights_scale,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);

View File

@ -1,13 +1,11 @@
#include "models.h"
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
@ -90,7 +88,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);

View File

@ -1,12 +1,10 @@
#include "models.h"
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
@ -129,9 +127,17 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
// feed-forward network
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
// MoE branch
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
cur = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr,
model.layers[il].ffn_down_exps,
nullptr,
hparams.n_expert, hparams.n_expert_used,
LLM_FFN_GELU, false,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
model.arch == LLM_ARCH_JINA_BERT_V3) {

View File

@ -2,9 +2,9 @@
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
@ -29,10 +29,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
// self-attention
{
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
if (model.layers[il].wq_scale) {
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
}
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@ -40,10 +37,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
}
// B1.K
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
if (model.layers[il].wk_scale) {
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
@ -51,10 +45,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
}
// B1.V
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
if (model.layers[il].wv_scale) {
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@ -90,10 +81,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
LLM_NORM_RMS, il);
cb(cur, "attn_sub_norm", il);
cur = build_lora_mm(model.layers[il].wo, cur);
if (model.layers[il].wo_scale) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
}
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
if (model.layers[il].bo) {
cur = ggml_add(ctx0, cur, model.layers[il].bo);
}
@ -115,8 +103,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
NULL, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
@ -127,10 +115,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
LLM_NORM_RMS, il);
cb(cur, "ffn_sub_norm", il);
cur = build_lora_mm(model.layers[il].ffn_down, cur);
if (model.layers[il].ffn_down_scale) {
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
}
cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
cb(cur, "ffn_down", il);
cur = ggml_add(ctx0, cur, ffn_inp);

View File

@ -1,10 +1,10 @@
#include "models.h"
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -3,10 +3,10 @@
#include <float.h>
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -2,10 +2,10 @@
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -1,11 +1,11 @@
#include "models.h"
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -2,11 +2,11 @@
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * inpL;
ggml_tensor * cur;

View File

@ -1,9 +1,9 @@
#include "models.h"
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;

View File

@ -4,9 +4,9 @@
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
const float f_logit_scale = hparams.f_logit_scale;

View File

@ -1,12 +1,11 @@
#include "models.h"
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -89,7 +88,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);

View File

@ -3,10 +3,10 @@
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -1,13 +1,11 @@
#include "models.h"
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -100,7 +98,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, hparams.expert_weights_scale,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);

View File

@ -8,7 +8,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_rope = hparams.n_rot();
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
@ -146,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
cb(Qcur, "Qcur_attn_temp_scaled", il);
}
// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn_k,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
@ -216,7 +216,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
hparams.expert_weights_scale, hparams.expert_weights_scale,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il,
nullptr,

View File

@ -1,6 +1,6 @@
#include "models.h"
#define CHUNK_SIZE 64
#include "llama-impl.h"
// utility to get one slice from the third dimension
// input dim: [x, y, c, b]
@ -57,7 +57,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
g = ggml_permute(ctx0, g, 0, 2, 1, 3); // [g_0, n_tokens, H_v, n_seqs]
b = ggml_permute(ctx0, b, 0, 2, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
const int CS = CHUNK_SIZE;
const int CS = kda ? 16 : 64; // chunk size
const int pad = (CS - n_tokens % CS) % CS;
const int n_chunks = (n_tokens + pad) / CS;
@ -225,9 +225,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
cb(kg_t, "key_gdiff_t", il);
ggml_tensor * s_t = ggml_transpose(ctx0, s);
s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
cb(s_t, "dnet_add_ch_state", il);
s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
cb(s, "dnet_add_ch_state", il);
// [CS, S_v, n_chunks, H_v * n_seqs]
ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
@ -240,7 +239,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
// [CS, S_v, 1, H_v * n_seqs]
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
cb(v_t_p, "v_prime", il);
// [CS, S_v, 1, H_v * n_seqs]
@ -252,7 +251,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
cb(v_attn, "v_attn", il);
// [S_v, CS, 1, H_v * n_seqs]
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
cb(attn_inter, "attn_inter", il);
// [S_v, CS, 1, H_v * n_seqs]
@ -268,13 +267,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
s_t = ggml_add(ctx0, s_t, kgv);
cb(s_t, "dnet_add_ch_state", il);
s = ggml_mul(ctx0, s, ch_g_last_exp_t);
s = ggml_add(ctx0, s, kgv);
cb(s, "dnet_add_ch_state", il);
}
s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
// truncate padded tokens
ggml_tensor * o = ggml_view_4d(ctx0, v,
S_v, n_tokens, H_v, n_seqs,
@ -282,7 +279,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
ggml_row_size(v->type, S_v * CS * n_chunks),
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
s = ggml_transpose(ctx0, s_t);
s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
cb(s, "output_state", il);
return {o, s};
@ -341,11 +338,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
g = ggml_exp(ctx0, g);
s = ggml_mul(ctx0, s, g);
ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
// [1, S_v, H_v, n_seqs]
ggml_tensor * sk;
sk = ggml_mul (ctx0, s_t, k);
sk = ggml_mul (ctx0, s, k);
sk = ggml_sum_rows(ctx0, sk);
// [S_v, 1, H_v, n_seqs]
@ -362,15 +357,89 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
k = ggml_repeat(ctx0, k, s);
kd = ggml_mul (ctx0, k, d_t);
s_t = ggml_add(ctx0, s_t, kd);
s = ggml_add(ctx0, s, kd);
cb(s_t, "dnet_add_ar_state", il);
cb(s, "dnet_add_ar_state", il);
ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
ggml_tensor * s_q = ggml_mul (ctx0, s, q);
ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
return {o, s};
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_fused(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * b,
ggml_tensor * s,
int il) {
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
const int64_t n_tokens = q->ne[2];
const int64_t n_seqs = q->ne[3];
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
GGML_ASSERT(S_k == S_v);
GGML_ASSERT(H_v % H_k == 0);
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
if (n_tokens == 1) {
cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
} else {
cb(result, LLAMA_TENSOR_NAME_FGDN_CH, il);
}
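// ggml_gated_delta_net returns one contiguous tensor: the per-token outputs
// [S_v, H_v, n_tokens, n_seqs] followed by the updated recurrent state
// [S_v, S_v, H_v, n_seqs]; the two views below split it back apart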
ggml_tensor * output = ggml_view_4d(ctx0, result,
S_v, H_v, n_tokens, n_seqs,
ggml_row_size(result->type, S_v),
ggml_row_size(result->type, S_v * H_v),
ggml_row_size(result->type, S_v * H_v * n_tokens), 0);
ggml_tensor * new_state = ggml_view_4d(ctx0, result,
S_v, S_v, H_v, n_seqs,
ggml_row_size(result->type, S_v),
ggml_row_size(result->type, S_v * S_v),
ggml_row_size(result->type, S_v * S_v * H_v),
ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));
return {output, new_state};
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * b,
ggml_tensor * s,
int il) {
const int64_t n_seq_tokens = q->ne[2];
if (n_seq_tokens == 1) {
if (cparams.fused_gdn_ar) {
return build_delta_net_fused(q, k, v, g, b, s, il);
}
return build_delta_net_autoregressive(q, k, v, g, b, s, il);
}
if (cparams.fused_gdn_ch) {
return build_delta_net_fused(q, k, v, g, b, s, il);
}
return build_delta_net_chunking(q, k, v, g, b, s, il);
}

View File

@ -1,13 +1,11 @@
#include "models.h"
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -91,7 +89,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);

View File

@ -5,10 +5,10 @@
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
//copied from qwen2
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -1,13 +1,11 @@
#include "models.h"
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -103,7 +101,7 @@ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);

View File

@ -2,10 +2,10 @@
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -1,9 +1,9 @@
#include "models.h"
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;

View File

@ -1,12 +1,11 @@
#include "models.h"
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
const int64_t n_embd_head = hparams.n_embd_head_k();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -100,7 +99,7 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);


@ -4,10 +4,10 @@
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -4,10 +4,10 @@
template <bool iswa>
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
const int64_t n_embd_head = hparams.n_embd_head_k();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -2,7 +2,7 @@
llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;


@ -2,11 +2,11 @@
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -2,7 +2,7 @@
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;


@ -2,7 +2,7 @@
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,7 +1,7 @@
#include "models.h"
llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;


@ -2,7 +2,7 @@
template <bool iswa>
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
const int64_t n_embd_head = hparams.n_embd_head_k();
ggml_tensor * cur;
ggml_tensor * inpL;


@ -3,7 +3,7 @@
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
n_embd_head(model.hparams.n_embd_head_k),
n_embd_head(model.hparams.n_embd_head_k()),
n_embd_altup(model.hparams.n_embd_altup),
n_altup(model.hparams.n_altup),
i_altup_act(model.hparams.i_altup_act) {


@ -1,9 +1,9 @@
#include "models.h"
llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@ -128,7 +128,7 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(routed_out, "ffn_moe_out", il);


@ -3,10 +3,10 @@
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);


@ -1,10 +1,10 @@
#include "models.h"
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;


@ -2,10 +2,10 @@
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,9 @@
#include "models.h"
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
@ -160,7 +159,7 @@ ggml_tensor * llm_build_granite_hybrid::build_layer_ffn(ggml_tensor * cur,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);


@ -1,15 +1,14 @@
#include "models.h"
llm_build_granite::llm_build_granite(
const llama_model & model,
const llm_graph_params & params)
: llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -175,7 +174,7 @@ ggml_tensor * llm_build_granite::build_layer_ffn(
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);


@ -1,10 +1,10 @@
#include "models.h"
llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -99,7 +99,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
nullptr,
n_expert, n_expert_used,
LLM_FFN_GELU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);


@ -1,14 +1,12 @@
#include "models.h"
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -90,7 +88,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);
@ -106,7 +104,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
nullptr,
n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il,
probs);


@ -1,10 +1,10 @@
#include "models.h"
llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,10 @@
#include "models.h"
llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -119,8 +119,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
n_expert, n_expert_used,
LLM_FFN_SILU,
true, // norm_topk_prob
false,
0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur_moe, "ffn_moe_out", il);


@ -1,10 +1,10 @@
#include "models.h"
llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,10 @@
#include "models.h"
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;


@ -3,10 +3,10 @@
// JAIS-2 model graph builder
// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,7 +1,7 @@
#include "models.h"
llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
ggml_tensor * cur;
ggml_tensor * inpL;
@ -76,7 +76,7 @@ llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_para
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);


@ -1,5 +1,4 @@
#include "models.h"
#include "ggml.h"
#include "llama-memory-recurrent.h"
@ -103,7 +102,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
const int64_t kv_lora_rank = hparams.n_lora_kv;
// qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
// Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
const int64_t n_embd_head_qk_rope = hparams.n_rot(); // config.qk_rope_head_dim
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
// Attention scale for MLA
const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
@ -118,12 +117,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
ggml_build_forward_expand(gf, cur);
// Check layer type by checking which tensors exist
// KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
bool is_kda = (layer.ssm_a != nullptr);
bool is_mla = (layer.wkv_a_mqa != nullptr);
if (is_kda) {
if (hparams.is_recurrent(il)) {
// === KDA Layer (Kimi Delta Attention) with Recurrent State ===
// Reference: vLLM kda.py
const auto * mctx_cur = inp_rs->mctx;
@ -175,9 +169,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
Kcur = ggml_l2_norm(ctx0, Kcur, eps_norm);
// Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
build_delta_net_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
build_delta_net_chunking(Qcur, Kcur, Vcur, g1, beta, state, il);
auto attn_out = build_delta_net(Qcur, Kcur, Vcur, g1, beta, state, il);
ggml_tensor * output = ggml_cont(ctx0, attn_out.first);
ggml_tensor * new_state = attn_out.second;
@ -211,7 +203,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
cur = ggml_mul_mat(ctx0, layer.wo, gated);
cb(cur, "kda_out", il);
} else if (is_mla) {
} else {
// === MLA Layer (Multi-head Latent Attention) without KV Cache ===
// Reference: vLLM mla.py
// Step 1: Q projection and reshape
@ -310,9 +302,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
cb(cur, "mla_out", il);
}
} else {
// Unknown layer type - this should not happen
GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
}
// On last layer, select only the output tokens
@ -349,7 +338,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_SILU, true,
true, hparams.expert_weights_scale,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);


@ -23,17 +23,23 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
};
auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
return build_moe_ffn(cur,
model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
hparams.expert_weights_scale,
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
il);
};
auto build_attn_block = [&model, this](ggml_tensor * cur,
ggml_tensor * inp_pos,
inp_attn_type * inp_attn,
int il) -> ggml_tensor * {
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
const auto n_embd_head = hparams.n_embd_head_v;
const auto n_embd_head = hparams.n_embd_head_v();
const auto n_head_kv = hparams.n_head_kv(il);
auto * q = build_lora_mm(model.layers[il].wq, cur);


@ -1,10 +1,10 @@
#include "models.h"
llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -90,7 +90,7 @@ llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_gr
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);


@ -2,10 +2,10 @@
llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,10 @@
#include "models.h"
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -134,7 +134,7 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);


@ -2,10 +2,10 @@
template <bool embed>
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -43,19 +43,19 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@ -91,6 +91,9 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
}
cb(cur, "attn_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
@ -109,9 +112,9 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
@ -130,9 +133,13 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
il,
nullptr, nullptr,
model.layers[il].ffn_up_exps_s,
model.layers[il].ffn_gate_exps_s,
model.layers[il].ffn_down_exps_s);
cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
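
The new optional *_s tensors above appear to act as per-channel scales multiplied into the corresponding projection output; the explicit wo_s ggml_mul is the visible instance of that pattern, and treating the extra build_lora_mm argument the same way is an assumption here, not something this hunk confirms. A minimal sketch of that kind of post-matmul channel scaling:

#include <cstddef>
#include <vector>

// Apply an optional per-output-channel scale to a projection result.
// An empty 'scale' models a missing *_s tensor (no-op), mirroring the
// null checks in the graph code above. Names are illustrative.
static void apply_channel_scale(std::vector<float> & y, const std::vector<float> & scale) {
    if (scale.empty()) {
        return; // tensor not present: leave the projection output unchanged
    }
    for (size_t i = 0; i < y.size(); ++i) {
        y[i] *= scale[i % scale.size()]; // broadcast the per-channel scale across tokens
    }
}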


@ -1,10 +1,10 @@
#include "models.h"
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -30,6 +30,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(d_inner % n_head == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
@ -167,6 +168,9 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
GGML_ASSERT(d_inner % n_head == 0);
GGML_ASSERT(d_inner % d_state == 0);
GGML_ASSERT(d_inner % n_group == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);


@ -1,4 +1,3 @@
#include "models.h"
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
@ -88,10 +87,17 @@ llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
cur = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
cb(cur, "ffn_moe_out", il);
}


@ -5,10 +5,10 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
const int64_t n_embd_base = 256;
const float scale_embd = 12.0f;
const float scale_depth = 1.4f;
const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k()));
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t n_embd_head_qk_rope = hparams.n_rot();
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
const uint32_t kv_lora_rank = hparams.n_lora_kv;
@ -51,21 +51,21 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
LLM_NORM_RMS, il);
cb(q, "q", il);
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
// {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, hparams.n_embd_head_k()),
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
0);
cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, hparams.n_embd_head_k()),
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il);
@ -97,15 +97,15 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
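
The two strided views above split each head's query vector of length n_embd_head_k() into a no-RoPE prefix of n_embd_head_qk_nope elements and a RoPE suffix of n_embd_head_qk_rope elements, while the row strides preserve the per-head and per-token layout. A minimal pointer-arithmetic sketch of the same split over a plain float buffer (layout assumed from the view strides above; the helper name is illustrative):

#include <cstdint>

// q packs, per token, n_head query vectors of length
// n_embd_head_k = n_embd_head_qk_nope + n_embd_head_qk_rope (head dimension innermost).
inline const float * q_head(const float * q, int64_t n_embd_head_k, int64_t n_head,
                            int64_t head, int64_t token) {
    return q + (token * n_head + head) * n_embd_head_k;
}
// For one head and token:
//   const float * q_nope = q_head(q, n_embd_head_k, n_head, h, t); // no-RoPE part, offset 0
//   const float * q_pe   = q_nope + n_embd_head_qk_nope;           // RoPE part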


@ -1,11 +1,10 @@
#include "models.h"
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
// GGML_ASSERT(n_embd_head == n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
ggml_tensor * cur;
ggml_tensor * inpL;
@ -91,7 +90,7 @@ llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(cur, "ffn_moe_out", il);


@ -1,10 +1,10 @@
#include "models.h"
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -127,7 +127,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);


@ -3,7 +3,7 @@
#include "llama-model.h"
#include "llama-graph.h"
// note: almost all graphs require atleast sqrtf, so include cmath globally
// note: almost all graphs require at least sqrtf, so include cmath globally
#include <cmath>
//
@ -44,6 +44,26 @@ struct llm_build_delta_net_base : public llm_graph_context {
ggml_tensor * b,
ggml_tensor * s,
int il);
// use the ggml_gated_delta_net fused operator
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * b,
ggml_tensor * s,
int il);
// choose one of two implementations above based on the number of tokens
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net(
ggml_tensor * q,
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * b,
ggml_tensor * s,
int il);
};
struct llm_build_rwkv6_base : public llm_graph_context {


@ -1,10 +1,10 @@
#include "models.h"
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;


@ -3,10 +3,10 @@
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * pos;


@ -2,8 +2,8 @@
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
llm_build_mamba_base(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;
@ -114,9 +114,18 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
ggml_tensor * ffn_inp = cur;
ggml_tensor * inp_emb = cur;
ggml_tensor * inp_latent = cur;
if (model.layers[il].ffn_latent_down) {
inp_latent = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_down, cur);
}
ggml_tensor * router_logits = build_lora_mm(model.layers[il].ffn_gate_inp, cur);
cb(router_logits, "ffn_moe_logits", il);
ggml_tensor * moe_out =
build_moe_ffn(ffn_inp,
build_moe_ffn(inp_latent,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
nullptr, // no gate
@ -124,12 +133,17 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
il,
router_logits);
cb(moe_out, "ffn_moe_out", il);
ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
if (model.layers[il].ffn_latent_up) {
moe_out = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_up, moe_out);
}
ggml_tensor * ffn_shexp = build_ffn(inp_emb,
model.layers[il].ffn_up_shexp, NULL, NULL,
NULL /* no gate */ , NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
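
The hunk above introduces a latent-MoE pattern: router logits are computed on the full hidden state, the expert input is optionally projected into a smaller latent space via ffn_latent_down, the routed experts run there, and the result is projected back up via ffn_latent_up before being combined with the shared-expert branch, which still reads the full hidden state. A minimal dense sketch of that data flow, assuming the usual routed-plus-shared combination; the helper types and names are illustrative, not the llama.cpp API:

#include <cstddef>
#include <functional>
#include <vector>

using vec    = std::vector<float>;
using mat    = std::vector<vec>;                             // row-major: mat[i] is output row i
using moe_fn = std::function<vec(const vec &, const vec &)>; // stand-in for build_moe_ffn
using ffn_fn = std::function<vec(const vec &)>;              // stand-in for the shared-expert FFN

static vec matmul(const mat & W, const vec & x) {
    vec y(W.size(), 0.0f);
    for (size_t i = 0; i < W.size(); ++i)
        for (size_t j = 0; j < x.size(); ++j)
            y[i] += W[i][j] * x[j];
    return y;
}

// Sketch of the layer above: router on the full width, routed experts in the latent space.
static vec latent_moe_layer(const mat & W_router,
                            const mat & W_latent_down,       // empty => no down-projection
                            const mat & W_latent_up,         // empty => no up-projection
                            const moe_fn & routed_experts,
                            const ffn_fn & shared_expert,
                            const vec & hidden) {
    const vec router_logits = matmul(W_router, hidden);      // computed before the down-projection

    vec x = W_latent_down.empty() ? hidden : matmul(W_latent_down, hidden);
    vec moe_out = routed_experts(x, router_logits);
    if (!W_latent_up.empty()) {
        moe_out = matmul(W_latent_up, moe_out);
    }

    vec out = shared_expert(hidden);                          // shared expert sees the full hidden state
    for (size_t i = 0; i < out.size(); ++i) {
        out[i] += moe_out[i];                                 // assumed combination of the two branches
    }
    return out;
}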


@ -1,10 +1,10 @@
#include "models.h"
llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
//GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
//GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,10 @@
#include "models.h"
llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,10 @@
#include "models.h"
llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -2,10 +2,10 @@
template <bool iswa>
llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,10 @@
#include "models.h"
llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;
@ -92,7 +92,7 @@ llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_para
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(cur, "ffn_moe_out", il);


@ -95,7 +95,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false,
false, 0.0,
hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
il);
cb(cur, "ffn_moe_out", il);


@ -1,9 +1,9 @@
#include "models.h"
llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
ggml_tensor * cur;
ggml_tensor * inpL;


@ -1,10 +1,10 @@
#include "models.h"
llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;


@ -5,10 +5,10 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
// NOTE: same with qwen2vl.cpp, but bias tensors are optional
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
ggml_tensor * cur;
ggml_tensor * inpL;

Some files were not shown because too many files have changed in this diff.