talk-llama : sync llama.cpp
This commit is contained in:
parent
ab1252c19e
commit
2bc630f197
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
|
||||
|
|
@ -184,6 +185,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" },
|
||||
{ LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" },
|
||||
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
|
||||
{ LLM_KV_MOE_LATENT_SIZE, "%s.moe_latent_size" },
|
||||
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
|
||||
{ LLM_KV_NUM_DEEPSTACK_LAYERS, "%s.n_deepstack_layers" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
|
|
@ -229,11 +231,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
|
||||
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
|
||||
|
|
@ -361,6 +366,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
|
|||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
{ LLM_TENSOR_FFN_LATENT_DOWN, "blk.%d.ffn_latent_down" },
|
||||
{ LLM_TENSOR_FFN_LATENT_UP, "blk.%d.ffn_latent_up" },
|
||||
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||
|
|
@ -1083,6 +1090,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_TOKEN_EMBD,
|
||||
LLM_TENSOR_OUTPUT_NORM,
|
||||
LLM_TENSOR_OUTPUT,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_ATTN_NORM,
|
||||
LLM_TENSOR_ATTN_Q,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
|
|
@ -1874,6 +1882,8 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
|
|||
LLM_TENSOR_FFN_UP_EXPS,
|
||||
LLM_TENSOR_FFN_DOWN_EXPS,
|
||||
LLM_TENSOR_FFN_EXP_PROBS_B,
|
||||
LLM_TENSOR_FFN_LATENT_DOWN,
|
||||
LLM_TENSOR_FFN_LATENT_UP,
|
||||
// MoE shared expert layer
|
||||
LLM_TENSOR_FFN_DOWN_SHEXP,
|
||||
LLM_TENSOR_FFN_UP_SHEXP,
|
||||
|
|
@ -2749,6 +2759,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|||
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
|
||||
// Nemotron 3 Super
|
||||
{LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
};
|
||||
|
||||
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
|
||||
|
|
@ -2786,6 +2799,15 @@ std::string LLM_TN_IMPL::str() const {
|
|||
return name;
|
||||
}
|
||||
|
||||
std::vector<llm_arch> llm_arch_all() {
|
||||
std::vector<llm_arch> ret;
|
||||
ret.reserve(LLM_ARCH_NAMES.size());
|
||||
for (const auto & [arch, _] : LLM_ARCH_NAMES) {
|
||||
ret.push_back(arch);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
const char * llm_arch_name(llm_arch arch) {
|
||||
auto it = LLM_ARCH_NAMES.find(arch);
|
||||
if (it == LLM_ARCH_NAMES.end()) {
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// gguf constants (sync with gguf.py)
|
||||
|
|
@ -188,6 +189,7 @@ enum llm_kv {
|
|||
LLM_KV_EXPERT_GROUP_SCALE,
|
||||
LLM_KV_EXPERTS_PER_GROUP,
|
||||
LLM_KV_MOE_EVERY_N_LAYERS,
|
||||
LLM_KV_MOE_LATENT_SIZE,
|
||||
LLM_KV_NEXTN_PREDICT_LAYERS,
|
||||
LLM_KV_NUM_DEEPSTACK_LAYERS,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
|
|
@ -233,11 +235,14 @@ enum llm_kv {
|
|||
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_SWA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
|
||||
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
||||
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
||||
LLM_KV_ROPE_DIMENSION_SECTIONS,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
LLM_KV_ROPE_FREQ_BASE_SWA,
|
||||
|
|
@ -381,6 +386,8 @@ enum llm_tensor {
|
|||
LLM_TENSOR_FFN_GATE_CHEXPS,
|
||||
LLM_TENSOR_FFN_UP_CHEXPS,
|
||||
LLM_TENSOR_FFN_EXP_PROBS_B,
|
||||
LLM_TENSOR_FFN_LATENT_DOWN,
|
||||
LLM_TENSOR_FFN_LATENT_UP,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
LLM_TENSOR_LAYER_OUT_NORM,
|
||||
|
|
@ -608,6 +615,8 @@ struct llm_tensor_info {
|
|||
ggml_op op;
|
||||
};
|
||||
|
||||
std::vector<llm_arch> llm_arch_all();
|
||||
|
||||
const char * llm_arch_name(llm_arch arch);
|
||||
|
||||
llm_arch llm_arch_from_string(const std::string & name);
|
||||
|
|
|
|||
|
|
@ -394,11 +394,13 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
|
|||
clear();
|
||||
split_reset();
|
||||
|
||||
const int64_t n_pos_all = (int64_t) n_tokens*n_pos_per_embd;
|
||||
|
||||
auto udata = std::make_shared<llama_ubatch::data_t>();
|
||||
|
||||
udata->token .resize(n_tokens);
|
||||
udata->embd .clear();
|
||||
udata->pos .resize(n_tokens);
|
||||
udata->pos .resize(n_pos_all);
|
||||
udata->n_seq_id .resize(n_tokens);
|
||||
udata->seq_id .resize(n_tokens);
|
||||
udata->seq_id_unq.resize(0);
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include "llama-memory.h"
|
||||
#include "llama-mmap.h"
|
||||
#include "llama-model.h"
|
||||
#include "llama-ext.h"
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
|
|
@ -150,6 +151,10 @@ llama_context::llama_context(
|
|||
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||
cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||
|
||||
cparams.fused_gdn_ar = true;
|
||||
cparams.fused_gdn_ch = true;
|
||||
cparams.auto_fgdn = true;
|
||||
|
||||
// with causal attention, the batch size is limited by the context size
|
||||
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
||||
|
||||
|
|
@ -158,7 +163,7 @@ llama_context::llama_context(
|
|||
cparams.op_offload = params.op_offload;
|
||||
cparams.kv_unified = params.kv_unified;
|
||||
|
||||
// intialized later
|
||||
// initialized later
|
||||
cparams.pipeline_parallel = false;
|
||||
|
||||
{
|
||||
|
|
@ -337,6 +342,14 @@ llama_context::llama_context(
|
|||
|
||||
if (cparams.pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled\n", __func__);
|
||||
|
||||
if (!graph_reuse_disable) {
|
||||
// TODO: figure out a way to make graph reuse work with pipeline parallelism
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/20463
|
||||
LLAMA_LOG_WARN("%s: graph reuse is currently not compatible with pipeline parallelism - disabling\n", __func__);
|
||||
|
||||
graph_reuse_disable = true;
|
||||
}
|
||||
}
|
||||
|
||||
sched_reserve();
|
||||
|
|
@ -422,7 +435,7 @@ void llama_context::sched_reserve() {
|
|||
if (cparams.auto_fa) {
|
||||
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to split graph for Flash Attention check");
|
||||
throw std::runtime_error("failed to reserve graph for Flash Attention check");
|
||||
}
|
||||
|
||||
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
|
||||
|
|
@ -432,8 +445,7 @@ void llama_context::sched_reserve() {
|
|||
if (n->op != GGML_OP_FLASH_ATTN_EXT) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t device_fa = ggml_backend_get_device(
|
||||
ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
ggml_backend_dev_t device_fa = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
|
||||
// TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
|
||||
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
|
||||
|
|
@ -448,6 +460,7 @@ void llama_context::sched_reserve() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (fa_device_mismatch) {
|
||||
cparams.flash_attn = false;
|
||||
LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
|
||||
|
|
@ -459,6 +472,88 @@ void llama_context::sched_reserve() {
|
|||
cparams.auto_fa = false;
|
||||
}
|
||||
|
||||
if (cparams.auto_fgdn) {
|
||||
LLAMA_LOG_INFO("%s: resolving fused Gated Delta Net support:\n", __func__);
|
||||
|
||||
if (cparams.fused_gdn_ar) {
|
||||
auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (autoregressive)");
|
||||
}
|
||||
|
||||
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_AR) + 1;
|
||||
bool gdn_device_mismatch = false;
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
||||
ggml_tensor * n = ggml_graph_node(gf, i);
|
||||
if (n->op != GGML_OP_GATED_DELTA_NET) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
|
||||
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_AR "-", prefix_len) == 0);
|
||||
const int il = std::stoi(n->name + prefix_len);
|
||||
ggml_backend_dev_t device_kv = model.dev_layer(il);
|
||||
if (device_gdn != device_kv) {
|
||||
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
|
||||
"is assigned to device %s (usually due to missing support)\n",
|
||||
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
|
||||
gdn_device_mismatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (gdn_device_mismatch) {
|
||||
cparams.fused_gdn_ar = false;
|
||||
LLAMA_LOG_WARN("%s: fused Gated Delta Net (autoregressive) not supported, set to disabled\n", __func__);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: fused Gated Delta Net (autoregressive) enabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
if (cparams.fused_gdn_ch) {
|
||||
// more than one token in the batch per sequence in order to take the chunked path
|
||||
// note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
|
||||
// because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
|
||||
// it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
|
||||
// the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
|
||||
const uint32_t n_tokens_ch = 16*n_seqs;
|
||||
auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
|
||||
if (!gf) {
|
||||
throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)");
|
||||
}
|
||||
|
||||
const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FGDN_CH) + 1;
|
||||
bool gdn_device_mismatch = false;
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
||||
ggml_tensor * n = ggml_graph_node(gf, i);
|
||||
if (n->op != GGML_OP_GATED_DELTA_NET) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t device_gdn = ggml_backend_get_device(ggml_backend_sched_get_tensor_backend(sched.get(), n));
|
||||
|
||||
GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FGDN_CH "-", prefix_len) == 0);
|
||||
const int il = std::stoi(n->name + prefix_len);
|
||||
ggml_backend_dev_t device_kv = model.dev_layer(il);
|
||||
if (device_gdn != device_kv) {
|
||||
LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the fused Gated Delta Net tensor "
|
||||
"is assigned to device %s (usually due to missing support)\n",
|
||||
__func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_gdn));
|
||||
gdn_device_mismatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (gdn_device_mismatch) {
|
||||
cparams.fused_gdn_ch = false;
|
||||
LLAMA_LOG_WARN("%s: fused Gated Delta Net (chunked) not supported, set to disabled\n", __func__);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: fused Gated Delta Net (chunked) enabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
cparams.auto_fgdn = false;
|
||||
}
|
||||
|
||||
// reserve worst-case graph
|
||||
int n_splits_pp = -1;
|
||||
int n_nodes_pp = -1;
|
||||
|
|
@ -1039,11 +1134,15 @@ void llama_context::set_adapters_lora(llama_adapter_lora ** adapters, size_t n_a
|
|||
bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_t n_adapters, float * scales) {
|
||||
LLAMA_LOG_DEBUG("%s: adapters = %p\n", __func__, (void *) adapters);
|
||||
|
||||
if (n_adapters != loras->size()) {
|
||||
return false;
|
||||
}
|
||||
// Adapters with a zero scale are never added to `loras`, so also ignore them for the comparison.
|
||||
size_t n_non_zero = 0;
|
||||
|
||||
for (size_t i = 0; i < n_adapters; i ++) {
|
||||
if (scales[i] == 0.0f) {
|
||||
continue;
|
||||
}
|
||||
n_non_zero++;
|
||||
|
||||
auto it = loras->find(adapters[i]);
|
||||
|
||||
if (it == loras->end() || it->second != scales[i]) {
|
||||
|
|
@ -1051,6 +1150,10 @@ bool llama_context::adapters_lora_are_same(llama_adapter_lora ** adapters, size_
|
|||
}
|
||||
}
|
||||
|
||||
if (n_non_zero != loras->size()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1114,6 +1217,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
|
|||
{
|
||||
//const auto t_start_us = ggml_time_us();
|
||||
|
||||
// FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
|
||||
res->set_inputs(&ubatch);
|
||||
|
||||
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
|
||||
|
|
@ -1981,7 +2085,7 @@ ggml_cgraph * llama_context::graph_reserve(
|
|||
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
// when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
|
||||
// when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
|
||||
gf_res_prev->reset();
|
||||
|
||||
// store the n_outputs as it is, and restore it afterwards
|
||||
|
|
@ -2831,19 +2935,23 @@ llama_context * llama_init_from_model(
|
|||
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
||||
if (model->hparams.n_embd_head_k % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
|
||||
return nullptr;
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
||||
if (model->hparams.n_embd_head_v % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
|
||||
return nullptr;
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
|
||||
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3035,6 +3143,19 @@ uint32_t llama_get_sampled_probs_count_ith(llama_context * ctx, int32_t i) {
|
|||
return static_cast<uint32_t>(ctx->get_sampled_probs_count(i));
|
||||
}
|
||||
|
||||
struct ggml_cgraph * llama_graph_reserve(
|
||||
struct llama_context * ctx,
|
||||
uint32_t n_tokens,
|
||||
uint32_t n_seqs,
|
||||
uint32_t n_outputs) {
|
||||
auto * memory = ctx->get_memory();
|
||||
llama_memory_context_ptr mctx;
|
||||
if (memory) {
|
||||
mctx = memory->init_full();
|
||||
}
|
||||
return ctx->graph_reserve(n_tokens, n_seqs, n_outputs, mctx.get());
|
||||
}
|
||||
|
||||
// llama adapter API
|
||||
|
||||
int32_t llama_set_adapters_lora(
|
||||
|
|
|
|||
|
|
@ -31,6 +31,9 @@ struct llama_cparams {
|
|||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool auto_fa;
|
||||
bool fused_gdn_ar; // use fused gated delta net (autoregressive)
|
||||
bool fused_gdn_ch; // use fused gated delta net (chunked)
|
||||
bool auto_fgdn;
|
||||
bool no_perf;
|
||||
bool warmup;
|
||||
bool op_offload;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,12 @@
|
|||
#pragma once
|
||||
|
||||
#include "llama-context.h"
|
||||
#include "ggml.h"
|
||||
#include "stdint.h"
|
||||
|
||||
// Reserve a new compute graph. It is valid until the next call to llama_graph_reserve.
|
||||
LLAMA_API struct ggml_cgraph * llama_graph_reserve(
|
||||
struct llama_context * ctx,
|
||||
uint32_t n_tokens,
|
||||
uint32_t n_seqs,
|
||||
uint32_t n_outputs);
|
||||
|
|
@ -601,7 +601,7 @@ const char * llama_grammar_parser::parse_sequence(
|
|||
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
||||
}
|
||||
const char * int_end = parse_int(pos);
|
||||
uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
|
||||
uint64_t min_times = std::stoull(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
|
||||
uint64_t max_times = UINT64_MAX; // default: no max limit
|
||||
|
|
@ -614,7 +614,7 @@ const char * llama_grammar_parser::parse_sequence(
|
|||
|
||||
if (is_digit_char(*pos)) {
|
||||
const char * int_end = parse_int(pos);
|
||||
max_times = std::stoul(std::string(pos, int_end - pos));
|
||||
max_times = std::stoull(std::string(pos, int_end - pos));
|
||||
pos = parse_space(int_end, is_nested);
|
||||
}
|
||||
|
||||
|
|
@ -1160,13 +1160,13 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
// if there is a grammar, parse it
|
||||
// rules will be empty (default) if there are parse errors
|
||||
if (!parser.parse(grammar_str) || parser.rules.empty()) {
|
||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||
LLAMA_LOG_ERROR("failed to parse grammar\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Ensure that there is a "root" node.
|
||||
if (parser.symbol_ids.find("root") == parser.symbol_ids.end()) {
|
||||
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
||||
// Ensure that the grammar contains the start symbol
|
||||
if (parser.symbol_ids.find(grammar_root) == parser.symbol_ids.end()) {
|
||||
LLAMA_LOG_ERROR("grammar does not contain a '%s' symbol\n", grammar_root);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
@ -1195,7 +1195,7 @@ struct llama_grammar * llama_grammar_init_impl(
|
|||
continue;
|
||||
}
|
||||
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
||||
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu", i);
|
||||
LLAMA_LOG_ERROR("unsupported grammar, left recursion detected for nonterminal at index %zu\n", i);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -250,7 +250,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
|||
|
||||
const bool last = (
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
|
||||
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
|
||||
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL)) // qwen3 reranking & embedding models use last token
|
||||
);
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
|
|
@ -509,6 +509,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
|||
float * data = (float *) cross_kq_mask->data;
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
|
||||
for (int j = 0; j < n_enc; ++j) {
|
||||
float f = -INFINITY;
|
||||
|
||||
|
|
@ -848,13 +849,13 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
|||
ubatch (params.ubatch),
|
||||
n_embd (hparams.n_embd),
|
||||
n_layer (hparams.n_layer),
|
||||
n_rot (hparams.n_rot),
|
||||
n_rot (hparams.n_rot()),
|
||||
n_ctx (cparams.n_ctx),
|
||||
n_head (hparams.n_head()),
|
||||
n_head_kv (hparams.n_head_kv()),
|
||||
n_embd_head_k (hparams.n_embd_head_k),
|
||||
n_embd_head_k (hparams.n_embd_head_k()),
|
||||
n_embd_k_gqa (hparams.n_embd_k_gqa()),
|
||||
n_embd_head_v (hparams.n_embd_head_v),
|
||||
n_embd_head_v (hparams.n_embd_head_v()),
|
||||
n_embd_v_gqa (hparams.n_embd_v_gqa()),
|
||||
n_expert (hparams.n_expert),
|
||||
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
|
||||
|
|
@ -899,7 +900,8 @@ ggml_tensor * llm_graph_context::build_cvec(
|
|||
|
||||
ggml_tensor * llm_graph_context::build_lora_mm(
|
||||
ggml_tensor * w,
|
||||
ggml_tensor * cur) const {
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * w_s) const {
|
||||
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
|
||||
|
||||
for (const auto & lora : *loras) {
|
||||
|
|
@ -920,6 +922,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
|
|||
res = ggml_add(ctx0, res, ab_cur);
|
||||
}
|
||||
|
||||
if (w_s) {
|
||||
res = ggml_mul(ctx0, res, w_s);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
@ -1161,12 +1167,14 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
int64_t n_expert_used,
|
||||
llm_ffn_op_type type_op,
|
||||
bool norm_w,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il,
|
||||
ggml_tensor * probs_in,
|
||||
ggml_tensor * gate_up_exps) const {
|
||||
ggml_tensor * gate_up_exps,
|
||||
ggml_tensor * up_exps_s,
|
||||
ggml_tensor * gate_exps_s,
|
||||
ggml_tensor * down_exps_s) const {
|
||||
return build_moe_ffn(
|
||||
cur,
|
||||
gate_inp, /* gate_inp_b */ nullptr,
|
||||
|
|
@ -1178,12 +1186,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
n_expert_used,
|
||||
type_op,
|
||||
norm_w,
|
||||
scale_w,
|
||||
w_scale,
|
||||
gating_op,
|
||||
il,
|
||||
probs_in,
|
||||
gate_up_exps
|
||||
gate_up_exps,
|
||||
/* gate_up_exps_b */ nullptr,
|
||||
up_exps_s,
|
||||
gate_exps_s,
|
||||
down_exps_s
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1202,13 +1213,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
int64_t n_expert_used,
|
||||
llm_ffn_op_type type_op,
|
||||
bool norm_w,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il,
|
||||
ggml_tensor * probs_in,
|
||||
ggml_tensor * gate_up_exps,
|
||||
ggml_tensor * gate_up_exps_b) const {
|
||||
ggml_tensor * gate_up_exps_b,
|
||||
ggml_tensor * up_exps_s,
|
||||
ggml_tensor * gate_exps_s,
|
||||
ggml_tensor * down_exps_s) const {
|
||||
const int64_t n_embd = cur->ne[0];
|
||||
const int64_t n_tokens = cur->ne[1];
|
||||
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
|
||||
|
|
@ -1330,7 +1343,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
|
||||
weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
|
||||
}
|
||||
if (scale_w) {
|
||||
if (w_scale != 0.0f && w_scale != 1.0f) {
|
||||
weights = ggml_scale(ctx0, weights, w_scale);
|
||||
cb(weights, "ffn_moe_weights_scaled", il);
|
||||
}
|
||||
|
|
@ -1360,6 +1373,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(gate_up, "ffn_moe_gate_up_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
|
||||
if (up_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
gate_up = ggml_mul(ctx0, gate_up, s);
|
||||
cb(gate_up, "ffn_moe_gate_up_scaled", il);
|
||||
}
|
||||
|
||||
const int64_t n_ff = gate_up->ne[0] / 2;
|
||||
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
|
|
@ -1375,6 +1397,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(up, "ffn_moe_up_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to up
|
||||
if (up_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
up = ggml_mul(ctx0, up, s);
|
||||
cb(up, "ffn_moe_up_scaled", il);
|
||||
}
|
||||
|
||||
if (gate_exps) {
|
||||
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
|
|
@ -1386,6 +1417,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
|
||||
cb(cur, "ffn_moe_gate_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to gate
|
||||
if (gate_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
cur = ggml_mul(ctx0, cur, s);
|
||||
cb(cur, "ffn_moe_gate_scaled", il);
|
||||
}
|
||||
}
|
||||
|
||||
const bool has_gate = gate_exps || gate_up_exps;
|
||||
|
|
@ -1465,6 +1505,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(experts, "ffn_moe_down_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to down
|
||||
if (down_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
experts = ggml_mul(ctx0, experts, s);
|
||||
cb(experts, "ffn_moe_down_scaled", il);
|
||||
}
|
||||
|
||||
if (!weight_before_ffn) {
|
||||
experts = ggml_mul(ctx0, experts, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
|
|
@ -1607,6 +1656,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
|||
// this need to be 1x1xN for broadcasting
|
||||
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
|
||||
ggml_set_input(cur);
|
||||
ggml_set_name(cur, "attn_scale");
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
||||
|
|
@ -1616,7 +1666,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
|
|||
ggml_tensor * llm_graph_context::build_inp_out_ids() const {
|
||||
// note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
|
||||
// but this would make the graph topology depend on the number of output tokens, which can interere with
|
||||
// features that require constant topology such as pipline parallelism
|
||||
// features that require constant topology such as pipeline parallelism
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
|
||||
//if (n_outputs < n_tokens) {
|
||||
// return nullptr;
|
||||
|
|
@ -1779,7 +1829,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|||
if (v_mla) {
|
||||
#if 0
|
||||
// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
|
||||
// However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
|
||||
// However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
|
||||
cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
|
||||
cur = ggml_mul_mat(ctx0, v_mla, cur);
|
||||
#else
|
||||
|
|
@ -2553,7 +2603,7 @@ void llm_graph_context::build_pooling(
|
|||
}
|
||||
|
||||
// softmax for qwen3 reranker
|
||||
if (arch == LLM_ARCH_QWEN3) {
|
||||
if (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL) {
|
||||
cur = ggml_soft_max(ctx0, cur);
|
||||
}
|
||||
} break;
|
||||
|
|
|
|||
|
|
@ -764,10 +764,11 @@ struct llm_graph_context {
|
|||
ggml_tensor * cur,
|
||||
int il) const;
|
||||
|
||||
// do mat_mul, while optionally apply lora
|
||||
// do mat_mul, while optionally apply lora and per-tensor scale
|
||||
ggml_tensor * build_lora_mm(
|
||||
ggml_tensor * w,
|
||||
ggml_tensor * cur) const;
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * w_s = nullptr) const;
|
||||
|
||||
// do mat_mul_id, while optionally apply lora
|
||||
ggml_tensor * build_lora_mm_id(
|
||||
|
|
@ -810,12 +811,14 @@ struct llm_graph_context {
|
|||
int64_t n_expert_used,
|
||||
llm_ffn_op_type type_op,
|
||||
bool norm_w,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il,
|
||||
ggml_tensor * probs_in = nullptr,
|
||||
ggml_tensor * gate_up_exps = nullptr) const;
|
||||
ggml_tensor * gate_up_exps = nullptr,
|
||||
ggml_tensor * up_exps_s = nullptr,
|
||||
ggml_tensor * gate_exps_s = nullptr,
|
||||
ggml_tensor * down_exps_s = nullptr) const;
|
||||
|
||||
ggml_tensor * build_moe_ffn(
|
||||
ggml_tensor * cur,
|
||||
|
|
@ -832,13 +835,15 @@ struct llm_graph_context {
|
|||
int64_t n_expert_used,
|
||||
llm_ffn_op_type type_op,
|
||||
bool norm_w,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
int il,
|
||||
ggml_tensor * probs_in = nullptr,
|
||||
ggml_tensor * gate_up_exps = nullptr,
|
||||
ggml_tensor * gate_up_exps_b = nullptr) const;
|
||||
ggml_tensor * gate_up_exps_b = nullptr,
|
||||
ggml_tensor * up_exps_s = nullptr,
|
||||
ggml_tensor * gate_exps_s = nullptr,
|
||||
ggml_tensor * down_exps_s = nullptr) const;
|
||||
|
||||
//
|
||||
// inputs
|
||||
|
|
|
|||
|
|
@ -62,6 +62,14 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
|
|||
return n_head/n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_rot(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return is_swa(il) ? n_rot_swa : n_rot_full;
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_inp() const {
|
||||
uint32_t n_embd_inp = n_embd;
|
||||
|
||||
|
|
@ -76,16 +84,32 @@ uint32_t llama_hparams::n_embd_out() const {
|
|||
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
return n_embd_head_k * n_head_kv;
|
||||
return n_embd_head_k(il) * n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
return n_embd_head_v * n_head_kv;
|
||||
return n_embd_head_v(il) * n_head_kv;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_n_embd_k_gqa_variable() const {
|
||||
|
|
@ -197,11 +221,11 @@ bool llama_hparams::is_mla() const {
|
|||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_k_mla() const {
|
||||
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
|
||||
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_v_mla() const {
|
||||
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
|
||||
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
|
||||
}
|
||||
|
||||
bool llama_hparams::has_kv(uint32_t il) const {
|
||||
|
|
|
|||
|
|
@ -44,13 +44,20 @@ struct llama_hparams {
|
|||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
||||
uint32_t n_rot;
|
||||
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
||||
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_rel_attn_bkts = 0;
|
||||
|
||||
// different head size for full_attention and SWA layers
|
||||
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
||||
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_embd_head_k_swa;
|
||||
uint32_t n_embd_head_v_swa;
|
||||
|
||||
// different RoPE dimensions for full_attention and SWA layers
|
||||
uint32_t n_rot_full;
|
||||
uint32_t n_rot_swa;
|
||||
|
||||
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
|
||||
uint32_t n_embd_head_k_mla_impl = 0;
|
||||
uint32_t n_embd_head_v_mla_impl = 0;
|
||||
|
|
@ -82,6 +89,7 @@ struct llama_hparams {
|
|||
bool expert_weights_norm = false;
|
||||
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
||||
uint32_t moe_every_n_layers = 0;
|
||||
uint32_t moe_latent_size = 0;
|
||||
uint32_t nextn_predict_layers = 0;
|
||||
|
||||
float f_norm_eps;
|
||||
|
|
@ -247,12 +255,18 @@ struct llama_hparams {
|
|||
|
||||
uint32_t n_gqa(uint32_t il = 0) const;
|
||||
|
||||
uint32_t n_rot(uint32_t il = 0) const;
|
||||
|
||||
// dimension of main + auxiliary input embeddings
|
||||
uint32_t n_embd_inp() const;
|
||||
|
||||
// dimension of output embeddings
|
||||
uint32_t n_embd_out() const;
|
||||
|
||||
// dimension of key/value embeddings for each head (per layer)
|
||||
uint32_t n_embd_head_k(uint32_t il = 0) const;
|
||||
uint32_t n_embd_head_v(uint32_t il = 0) const;
|
||||
|
||||
// dimension of key embeddings across all k-v heads
|
||||
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
|
||||
|
||||
|
|
|
|||
|
|
@ -100,9 +100,9 @@ std::string format(const char * fmt, ...) {
|
|||
|
||||
std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
|
||||
char buf[256];
|
||||
snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
|
||||
snprintf(buf, sizeof(buf), "%6" PRId64, ne.at(0));
|
||||
for (size_t i = 1; i < ne.size(); i++) {
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, ne.at(i));
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,4 +70,6 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t);
|
|||
|
||||
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
|
||||
|
||||
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
|
||||
#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
|
||||
#define LLAMA_TENSOR_NAME_FGDN_AR "__fgdn_ar__"
|
||||
#define LLAMA_TENSOR_NAME_FGDN_CH "__fgdn_ch__"
|
||||
|
|
|
|||
|
|
@ -583,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
|
|||
break;
|
||||
}
|
||||
|
||||
// remeber the position that we found
|
||||
// remember the position that we found
|
||||
res.push_back(sinfo_new);
|
||||
|
||||
// store the old state of the cells in the recovery stack
|
||||
|
|
@ -1033,8 +1033,8 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
|
|||
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
||||
|
||||
return ggml_view_4d(ctx, k,
|
||||
hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(k->type, hparams.n_embd_head_k),
|
||||
hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(k->type, hparams.n_embd_head_k(il)),
|
||||
ggml_row_size(k->type, n_embd_k_gqa),
|
||||
ggml_row_size(k->type, n_embd_k_gqa*kv_size),
|
||||
ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
|
||||
|
|
@ -1056,8 +1056,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
|
|||
if (!v_trans) {
|
||||
// note: v->nb[1] <= v->nb[2]
|
||||
return ggml_view_4d(ctx, v,
|
||||
hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
|
||||
hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
|
||||
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
|
||||
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
|
||||
ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
|
||||
|
|
@ -1065,8 +1065,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
|
|||
|
||||
// note: v->nb[1] > v->nb[2]
|
||||
return ggml_view_4d(ctx, v,
|
||||
n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
|
||||
ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
|
||||
n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
|
||||
ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
|
||||
ggml_row_size(v->type, kv_size), // v->nb[2]
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
|
||||
|
|
@ -1293,7 +1293,7 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
|
|||
}
|
||||
|
||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
||||
// bookeeping of the KQ mask cells that could change for other tokens of the same sequence
|
||||
// bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
|
||||
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
|
||||
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
|
||||
|
||||
|
|
@ -1544,7 +1544,8 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
|||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const {
|
||||
float freq_scale,
|
||||
uint32_t il) const {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
|
|
@ -1552,7 +1553,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
|||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & n_rot = hparams.n_rot(il);
|
||||
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
|
||||
// @ngxson : this is a workaround
|
||||
// for M-RoPE, we want to rotate the whole vector when doing KV shift
|
||||
|
|
@ -1606,13 +1607,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|||
auto * ctx = res->get_ctx();
|
||||
auto * gf = res->get_gf();
|
||||
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
|
||||
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
|
||||
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
|
||||
|
|
@ -1626,6 +1620,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
const auto n_rot = hparams.n_rot(il);
|
||||
const auto n_embd_head_k = hparams.n_embd_head_k(il);
|
||||
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
|
||||
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
|
||||
|
|
@ -1638,7 +1636,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|||
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
||||
ggml_row_size(layer.k->type, n_embd_nope));
|
||||
|
||||
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
|
||||
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
|
@ -1760,8 +1758,10 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t
|
|||
io.write(&pos, sizeof(pos));
|
||||
io.write(&n_seq_id, sizeof(n_seq_id));
|
||||
|
||||
// TODO: we also need to save llama_kv_cell_ext when apply_ubatch() support loading it
|
||||
// see: https://github.com/ggml-org/llama.cpp/pull/16825#issuecomment-3460868350
|
||||
if (hparams.n_pos_per_embd() > 1) {
|
||||
const llama_kv_cell_ext ext = cells.ext_get(i);
|
||||
io.write(&ext, sizeof(ext));
|
||||
}
|
||||
|
||||
for (const auto & seq_id : seq_ids) {
|
||||
io.write(&seq_id, sizeof(seq_id));
|
||||
|
|
@ -1895,6 +1895,14 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
return false;
|
||||
}
|
||||
|
||||
if (hparams.n_pos_per_embd() > 1) {
|
||||
llama_kv_cell_ext ext;
|
||||
io.read_to(&ext, sizeof(ext));
|
||||
|
||||
ubatch.pos[i + ubatch.n_tokens] = ext.y;
|
||||
ubatch.pos[i + ubatch.n_tokens*2] = ext.x;
|
||||
}
|
||||
|
||||
// read the sequence id, but directly discard it - we will use dest_seq_id instead
|
||||
{
|
||||
llama_seq_id seq_id;
|
||||
|
|
@ -1945,6 +1953,12 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
|
|||
|
||||
cells.pos_set(i, pos);
|
||||
|
||||
if (hparams.n_pos_per_embd() > 1) {
|
||||
llama_kv_cell_ext ext;
|
||||
io.read_to(&ext, sizeof(ext));
|
||||
cells.ext_set(i, ext);
|
||||
}
|
||||
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
llama_seq_id seq_id;
|
||||
io.read_to(&seq_id, sizeof(seq_id));
|
||||
|
|
|
|||
|
|
@ -264,7 +264,8 @@ private:
|
|||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
float freq_scale,
|
||||
uint32_t il) const;
|
||||
|
||||
ggml_cgraph * build_graph_shift(
|
||||
llm_graph_result * res,
|
||||
|
|
|
|||
|
|
@ -1,12 +1,17 @@
|
|||
#include "llama-model-loader.h"
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
#include "llama-hparams.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cinttypes>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <future>
|
||||
#include <regex>
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
static const size_t MiB = 1024*kiB;
|
||||
|
|
@ -37,6 +42,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
|
||||
case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return "MXFP4 MoE";
|
||||
case LLAMA_FTYPE_MOSTLY_NVFP4: return "NVFP4";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
|
||||
|
|
@ -263,7 +269,7 @@ namespace GGUFMeta {
|
|||
template<typename T>
|
||||
typename std::enable_if<std::is_integral<T>::value, bool>::type
|
||||
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
|
||||
const int kid = gguf_find_key(meta.get(), key.c_str());
|
||||
const int kid = gguf_find_key(metadata, key.c_str());
|
||||
|
||||
if (kid < 0) {
|
||||
if (required) {
|
||||
|
|
@ -273,7 +279,7 @@ namespace GGUFMeta {
|
|||
}
|
||||
|
||||
struct GGUFMeta::ArrayInfo arr_info =
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
|
||||
|
||||
|
||||
result = arr_info.length;
|
||||
|
|
@ -290,7 +296,7 @@ namespace GGUFMeta {
|
|||
|
||||
template<typename T>
|
||||
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
|
||||
const gguf_context * ctx = meta.get();
|
||||
const gguf_context * ctx = metadata;
|
||||
const int kid = gguf_find_key(ctx, key.c_str());
|
||||
|
||||
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
|
||||
|
|
@ -331,7 +337,7 @@ namespace GGUFMeta {
|
|||
|
||||
template<typename T, size_t N_MAX>
|
||||
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
|
||||
const gguf_context * ctx = meta.get();
|
||||
const gguf_context * ctx = metadata;
|
||||
const int kid = gguf_find_key(ctx, key.c_str());
|
||||
|
||||
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
|
||||
|
|
@ -393,7 +399,7 @@ namespace GGUFMeta {
|
|||
const struct llama_model_kv_override * override =
|
||||
it != kv_overrides.end() ? &it->second : nullptr;
|
||||
|
||||
const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
|
||||
const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
|
||||
|
||||
if (required && !found) {
|
||||
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
||||
|
|
@ -427,7 +433,7 @@ namespace GGUFMeta {
|
|||
// get array of n <= N_MAX elements, or a single element repeated n times
|
||||
template<typename T, size_t N_MAX>
|
||||
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
|
||||
const int kid = gguf_find_key(meta.get(), key.c_str());
|
||||
const int kid = gguf_find_key(metadata, key.c_str());
|
||||
|
||||
if (kid < 0) {
|
||||
if (required) {
|
||||
|
|
@ -440,9 +446,9 @@ namespace GGUFMeta {
|
|||
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
|
||||
}
|
||||
|
||||
if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
|
||||
if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
|
||||
struct GGUFMeta::ArrayInfo arr_info =
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
|
||||
|
||||
if (n != arr_info.length) {
|
||||
throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
|
||||
|
|
@ -473,7 +479,7 @@ namespace GGUFMeta {
|
|||
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
|
||||
const std::string key = llm_kv(kid);
|
||||
|
||||
const int id = gguf_find_key(meta.get(), key.c_str());
|
||||
const int id = gguf_find_key(metadata, key.c_str());
|
||||
|
||||
if (id < 0) {
|
||||
if (required) {
|
||||
|
|
@ -483,7 +489,7 @@ namespace GGUFMeta {
|
|||
}
|
||||
|
||||
// throw and error if type is an array
|
||||
if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
|
||||
if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
|
||||
if (required) {
|
||||
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
|
||||
}
|
||||
|
|
@ -500,6 +506,9 @@ namespace GGUFMeta {
|
|||
|
||||
|
||||
llama_model_loader::llama_model_loader(
|
||||
struct gguf_context * meta,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
bool use_mmap,
|
||||
|
|
@ -507,7 +516,8 @@ llama_model_loader::llama_model_loader(
|
|||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
|
||||
const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
|
||||
: metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
|
||||
int trace = 0;
|
||||
if (getenv("LLAMA_TRACE")) {
|
||||
trace = atoi(getenv("LLAMA_TRACE"));
|
||||
|
|
@ -521,136 +531,142 @@ llama_model_loader::llama_model_loader(
|
|||
|
||||
tensor_buft_overrides = param_tensor_buft_overrides_p;
|
||||
|
||||
// Load the main GGUF
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
if (!fname.empty()) {
|
||||
// Load the main GGUF
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
|
||||
meta.reset(gguf_init_from_file(fname.c_str(), params));
|
||||
if (!meta) {
|
||||
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
if (use_mmap && use_direct_io) {
|
||||
if (files.back()->has_direct_io()) {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
use_mmap = false;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||
use_direct_io = false;
|
||||
|
||||
// reopen file using std::fopen for mmap
|
||||
files.pop_back();
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||
}
|
||||
}
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
|
||||
}
|
||||
uint16_t n_split = 0;
|
||||
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
||||
|
||||
// Load additional GGML contexts
|
||||
if (n_split > 1) {
|
||||
// make sure the main file is loaded first
|
||||
uint16_t idx = 0;
|
||||
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
|
||||
get_key(kv_split_no, idx);
|
||||
if (idx != 0) {
|
||||
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
|
||||
metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
|
||||
metadata = metadata_ptr.get();
|
||||
if (metadata == nullptr) {
|
||||
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
|
||||
}
|
||||
|
||||
// generate list of splits if needed
|
||||
if (splits.empty()) {
|
||||
splits = llama_get_list_splits(fname, idx, n_split);
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
if (use_mmap && use_direct_io) {
|
||||
if (files.back()->has_direct_io()) {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
use_mmap = false;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||
use_direct_io = false;
|
||||
|
||||
// reopen file using std::fopen for mmap
|
||||
files.pop_back();
|
||||
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||
}
|
||||
}
|
||||
|
||||
// in case user give a custom list of splits, check if it matches the expected number
|
||||
if (n_split != (uint16_t)splits.size()) {
|
||||
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
|
||||
}
|
||||
uint16_t n_split = 0;
|
||||
get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
|
||||
|
||||
if (trace > 0) {
|
||||
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
||||
}
|
||||
|
||||
// load other splits
|
||||
for (idx = 1; idx < n_split; idx++) {
|
||||
const char * fname_split = splits[idx].c_str();
|
||||
|
||||
struct gguf_init_params split_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
|
||||
if (!ctx_gguf) {
|
||||
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
|
||||
// Load additional GGML contexts
|
||||
if (n_split > 1) {
|
||||
// make sure the main file is loaded first
|
||||
uint16_t idx = 0;
|
||||
const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
|
||||
get_key(kv_split_no, idx);
|
||||
if (idx != 0) {
|
||||
throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
|
||||
}
|
||||
|
||||
// check idx
|
||||
// generate list of splits if needed
|
||||
if (splits.empty()) {
|
||||
splits = llama_get_list_splits(fname, idx, n_split);
|
||||
}
|
||||
|
||||
// in case user give a custom list of splits, check if it matches the expected number
|
||||
if (n_split != (uint16_t)splits.size()) {
|
||||
throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
|
||||
}
|
||||
|
||||
if (trace > 0) {
|
||||
LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
|
||||
}
|
||||
|
||||
// load other splits
|
||||
for (idx = 1; idx < n_split; idx++) {
|
||||
const char * fname_split = splits[idx].c_str();
|
||||
|
||||
struct gguf_init_params split_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx,
|
||||
};
|
||||
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
|
||||
if (!ctx_gguf) {
|
||||
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
|
||||
}
|
||||
|
||||
// check idx
|
||||
{
|
||||
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
|
||||
if (kid < 0) {
|
||||
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
|
||||
}
|
||||
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
|
||||
if (idx_gguf != idx) {
|
||||
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
|
||||
}
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
|
||||
}
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
|
||||
|
||||
// sanity check
|
||||
{
|
||||
const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
|
||||
if (kid < 0) {
|
||||
throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
|
||||
}
|
||||
int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
|
||||
if (idx_gguf != idx) {
|
||||
throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
|
||||
const int n_tensors_loaded = (int) weights_map.size();
|
||||
if (n_tensors != n_tensors_loaded) {
|
||||
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
|
||||
}
|
||||
}
|
||||
|
||||
files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// Save tensors data offset info of the shard.
|
||||
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
std::string tensor_name = std::string(cur->name);
|
||||
// make sure there is no duplicated tensor names
|
||||
if (weights_map.find(tensor_name) != weights_map.end()) {
|
||||
throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
|
||||
}
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||
}
|
||||
|
||||
get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
|
||||
|
||||
// sanity check
|
||||
{
|
||||
const int n_tensors_loaded = (int) weights_map.size();
|
||||
if (n_tensors != n_tensors_loaded) {
|
||||
throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
|
||||
} else {
|
||||
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
|
||||
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
|
||||
}
|
||||
|
||||
n_kv = gguf_get_n_kv(meta.get());
|
||||
n_kv = gguf_get_n_kv(metadata);
|
||||
n_tensors = weights_map.size();
|
||||
|
||||
fver = (enum llama_fver) gguf_get_version(meta.get());
|
||||
fver = (enum llama_fver) gguf_get_version(metadata);
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
||||
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
|
||||
|
|
@ -709,6 +725,7 @@ llama_model_loader::llama_model_loader(
|
|||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
|
|
@ -729,14 +746,14 @@ llama_model_loader::llama_model_loader(
|
|||
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(meta.get(), i);
|
||||
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
|
||||
const char * name = gguf_get_key(metadata, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(metadata, i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
|
||||
? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
std::string value = gguf_kv_to_str(meta.get(), i);
|
||||
std::string value = gguf_kv_to_str(metadata, i);
|
||||
const size_t MAX_VALUE_LEN = 40;
|
||||
if (value.size() > MAX_VALUE_LEN) {
|
||||
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
||||
|
|
@ -838,15 +855,382 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
|
|||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
|
||||
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
|
||||
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
|
||||
GGML_ASSERT(w != nullptr);
|
||||
|
||||
if (op == GGML_OP_NONE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ggml_tensor_overhead()*8,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
||||
if (!ctx_ptr) {
|
||||
throw std::runtime_error(format("failed to create ggml context"));
|
||||
}
|
||||
ggml_context * ctx = ctx_ptr.get();
|
||||
|
||||
ggml_tensor * op_tensor = nullptr;
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||
op_tensor = ggml_get_rows(ctx, w, b);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_mul_mat(ctx, w, b);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
const int n_expert_used = hparams.n_expert_used;
|
||||
GGML_ASSERT(n_expert_used > 0);
|
||||
ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
||||
ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
||||
op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_add(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_ADD_ID:
|
||||
{
|
||||
const int n_expert_used = hparams.n_expert_used;
|
||||
GGML_ASSERT(n_expert_used > 0);
|
||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
|
||||
ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
|
||||
op_tensor = ggml_add_id(ctx, a, w, c);
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
|
||||
op_tensor = ggml_mul(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
|
||||
op_tensor = ggml_div(ctx, a, w);
|
||||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
const int n_embd_head = hparams.n_embd_head_v();
|
||||
const int n_head = hparams.n_head();
|
||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||
op_tensor = ggml_rope_ext(
|
||||
ctx, a, b, w,
|
||||
0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0
|
||||
);
|
||||
|
||||
} break;
|
||||
case GGML_OP_SSM_CONV:
|
||||
{
|
||||
const int64_t n_seq_tokens = 512;
|
||||
const int64_t n_seqs = 3;
|
||||
ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
|
||||
op_tensor = ggml_ssm_conv(ctx, conv_x, w);
|
||||
} break;
|
||||
case GGML_OP_SSM_SCAN:
|
||||
{
|
||||
// w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
|
||||
const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
|
||||
const int64_t n_head = w->ne[1];
|
||||
const int64_t head_dim = hparams.ssm_d_inner / n_head;
|
||||
const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
|
||||
const int64_t n_seq_tokens = 512;
|
||||
const int64_t n_seqs = 3;
|
||||
ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
|
||||
ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
|
||||
ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
|
||||
op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
|
||||
} break;
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
{
|
||||
// FIXME
|
||||
const int64_t S = 123;
|
||||
const int64_t H = 123;
|
||||
const int64_t n_tokens = 123;
|
||||
const int64_t n_seqs = 123;
|
||||
ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * tf = w;
|
||||
ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
||||
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
|
||||
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
|
||||
} break;
|
||||
case GGML_OP_IM2COL:
|
||||
{
|
||||
const int n_embd_inp = hparams.n_embd_inp();
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
|
||||
op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
|
||||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
op_tensor = ggml_scale(ctx, w, 1.0f);
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
|
||||
}
|
||||
|
||||
// create a temporary dummy buffer for the weight so that supports_op can check the buffer type
|
||||
GGML_ASSERT(w->buffer == nullptr);
|
||||
w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
|
||||
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
|
||||
ggml_backend_buffer_free(w->buffer);
|
||||
w->buffer = nullptr;
|
||||
|
||||
return op_supported;
|
||||
}
|
||||
|
||||
// find the first buffer type in the list that can use the tensor
|
||||
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
|
||||
GGML_ASSERT(!buft_list->empty());
|
||||
for (const auto & cur : *buft_list) {
|
||||
ggml_backend_dev_t cur_dev = cur.first;
|
||||
ggml_backend_buffer_type_t cur_buft = cur.second;
|
||||
if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
|
||||
return cur_buft;
|
||||
}
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_model_loader::create_tensor(
|
||||
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
|
||||
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
|
||||
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
||||
auto it = ctx_map.find(buft);
|
||||
if (it == ctx_map.end()) {
|
||||
// one ggml context per buffer type
|
||||
int max_n_tensors = n_tensors;
|
||||
max_n_tensors += 1; // duplicated output tensor
|
||||
max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
|
||||
if (files.empty()) {
|
||||
max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
|
||||
}
|
||||
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
throw std::runtime_error(format("failed to create ggml context"));
|
||||
}
|
||||
|
||||
ctx_map.emplace(buft, ctx);
|
||||
|
||||
return ctx;
|
||||
}
|
||||
return it->second.get();
|
||||
};
|
||||
|
||||
auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
|
||||
if (!t_meta) {
|
||||
if (flags & TENSOR_NOT_REQUIRED) {
|
||||
return nullptr;
|
||||
}
|
||||
throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
|
||||
}
|
||||
|
||||
// some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
|
||||
// the tensor is duplicated
|
||||
// to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
|
||||
llm_tensor tn_tensor = tn.tensor;
|
||||
if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
|
||||
tn_tensor = LLM_TENSOR_OUTPUT;
|
||||
}
|
||||
|
||||
llm_tensor_info info;
|
||||
try {
|
||||
info = llm_tensor_info_for(tn_tensor);
|
||||
} catch (const std::out_of_range & e) {
|
||||
throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
|
||||
}
|
||||
|
||||
// skip unused tensors
|
||||
if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
|
||||
const size_t nbytes = ggml_nbytes(t_meta);
|
||||
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
|
||||
|
||||
size_data -= nbytes;
|
||||
n_created++;
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
|
||||
ggml_op op;
|
||||
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
|
||||
if (bias) {
|
||||
if (info.op == GGML_OP_MUL_MAT_ID) {
|
||||
op = GGML_OP_ADD_ID;
|
||||
} else {
|
||||
op = GGML_OP_ADD;
|
||||
}
|
||||
} else {
|
||||
op = info.op;
|
||||
}
|
||||
|
||||
// sanity checks
|
||||
if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
|
||||
if (tn.bid != -1) {
|
||||
GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
|
||||
}
|
||||
} else {
|
||||
if (tn.bid == -1) {
|
||||
GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// select the buffer type for this tensor
|
||||
const buft_list_t * buft_list;
|
||||
switch (info.layer) {
|
||||
case LLM_TENSOR_LAYER_INPUT:
|
||||
buft_list = buft_list_input;
|
||||
break;
|
||||
case LLM_TENSOR_LAYER_OUTPUT:
|
||||
buft_list = buft_list_output;
|
||||
break;
|
||||
case LLM_TENSOR_LAYER_REPEATING:
|
||||
GGML_ASSERT(buft_list_layer != nullptr);
|
||||
buft_list = buft_list_layer;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
// check overrides
|
||||
if (tensor_buft_overrides) {
|
||||
std::string tensor_name = tn.str();
|
||||
for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
||||
std::regex pattern(overrides->pattern);
|
||||
if (std::regex_search(tensor_name, pattern)) {
|
||||
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
||||
// when overriding to a CPU buffer, consider the extra buffer types
|
||||
buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
|
||||
} else {
|
||||
buft = overrides->buft;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
||||
tensor_name.c_str(),
|
||||
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
||||
ggml_backend_buft_name(buft));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!buft) {
|
||||
buft = select_weight_buft(hparams, t_meta, op, buft_list);
|
||||
if (!buft) {
|
||||
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
// avoid using a host buffer when using mmap
|
||||
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
||||
if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
throw std::runtime_error("no CPU backend found");
|
||||
}
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
}
|
||||
|
||||
if (buft != buft_list->front().second) {
|
||||
if (n_tensors_moved == 0) {
|
||||
first_tensor_moved_name = t_meta->name;
|
||||
first_tensor_moved_type_name = ggml_type_name(t_meta->type);
|
||||
first_moved_from_buft = buft_list->front().second;
|
||||
first_moved_to_buft = buft;
|
||||
}
|
||||
n_tensors_moved++;
|
||||
}
|
||||
|
||||
return buft;
|
||||
};
|
||||
|
||||
if (files.empty()) {
|
||||
if (flags & TENSOR_SKIP_IF_VIRTUAL) {
|
||||
return nullptr;
|
||||
}
|
||||
ggml_type type = GGML_TYPE_F32;
|
||||
const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
|
||||
if (tid != -1) {
|
||||
type = gguf_get_tensor_type(metadata, tid);
|
||||
}
|
||||
|
||||
// for tensors that are not required some of the dimensions can be invalid:
|
||||
if (flags & TENSOR_NOT_REQUIRED) {
|
||||
for (size_t dim = 0; dim < ne.size(); dim++) {
|
||||
if (ne.begin()[dim] <= 0) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor t_meta;
|
||||
memset(&t_meta, 0, sizeof(ggml_tensor));
|
||||
t_meta.type = type;
|
||||
for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
||||
t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
|
||||
GGML_ASSERT(t_meta.ne[dim] >= 1);
|
||||
t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
|
||||
GGML_ASSERT(t_meta.nb[dim] >= 1);
|
||||
}
|
||||
ggml_set_name(&t_meta, tn.str().c_str());
|
||||
|
||||
ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
|
||||
GGML_ASSERT(buft != nullptr);
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
|
||||
ggml_set_name(ret, tn.str().c_str());
|
||||
return ret;
|
||||
}
|
||||
|
||||
ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
|
||||
ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
|
||||
if (buft == nullptr) {
|
||||
return nullptr; // return type is ggml_tensor *
|
||||
}
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
|
||||
// if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
|
||||
if (flags & TENSOR_DUPLICATED) {
|
||||
ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
|
||||
if (t) {
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
|
||||
const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
|
||||
|
||||
if (cur == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool duplicated = flags & TENSOR_DUPLICATED;
|
||||
const bool duplicated = flags & TENSOR_DUPLICATED;
|
||||
|
||||
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
|
||||
ggml_set_name(tensor, ggml_get_name(cur));
|
||||
|
|
@ -858,7 +1242,6 @@ struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx
|
|||
}
|
||||
|
||||
return tensor;
|
||||
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
|
||||
|
|
@ -893,6 +1276,11 @@ void llama_model_loader::done_getting_tensors() const {
|
|||
if (n_created != n_tensors) {
|
||||
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
||||
}
|
||||
if (n_tensors_moved > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
|
||||
__func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
|
||||
ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
|
||||
|
|
@ -974,6 +1362,12 @@ bool llama_model_loader::load_all_data(
|
|||
llama_mlocks * lmlocks,
|
||||
llama_progress_callback progress_callback,
|
||||
void * progress_callback_user_data) {
|
||||
if (files.empty()) {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
set_tensor_data(t, set_tensor_data_ud);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
|
||||
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
|
|
|
|||
|
|
@ -4,17 +4,22 @@
|
|||
|
||||
#include "llama-impl.h"
|
||||
#include "llama-arch.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-mmap.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <unordered_map>
|
||||
|
||||
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
||||
|
||||
// lists of buffer types used for each layer
|
||||
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
|
||||
|
||||
enum llama_fver {
|
||||
GGUF_FILE_VERSION_V1 = 1,
|
||||
GGUF_FILE_VERSION_V2 = 2,
|
||||
|
|
@ -58,9 +63,10 @@ struct llama_model_loader {
|
|||
}
|
||||
};
|
||||
|
||||
static const int TENSOR_NOT_REQUIRED = 1 << 0;
|
||||
static const int TENSOR_DUPLICATED = 1 << 1;
|
||||
static const int TENSOR_SKIP = 1 << 2;
|
||||
static const int TENSOR_NOT_REQUIRED = 1 << 0;
|
||||
static const int TENSOR_DUPLICATED = 1 << 1;
|
||||
static const int TENSOR_SKIP = 1 << 2;
|
||||
static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
|
||||
|
||||
int n_kv = 0;
|
||||
int n_tensors = 0;
|
||||
|
|
@ -84,7 +90,10 @@ struct llama_model_loader {
|
|||
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
|
||||
const llama_model_tensor_buft_override * tensor_buft_overrides;
|
||||
|
||||
gguf_context_ptr meta;
|
||||
gguf_context_ptr metadata_ptr;
|
||||
struct gguf_context * metadata; // either metadata_ptr.get() or externally set
|
||||
llama_model_set_tensor_data_t set_tensor_data;
|
||||
void * set_tensor_data_ud;
|
||||
std::vector<ggml_context_ptr> contexts;
|
||||
|
||||
std::string arch_name;
|
||||
|
|
@ -94,7 +103,26 @@ struct llama_model_loader {
|
|||
size_t size_data = 0;
|
||||
std::vector<std::pair<size_t, size_t>> mmaps_used;
|
||||
|
||||
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
||||
struct ggml_backend_buft_comparator {
|
||||
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
|
||||
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
|
||||
}
|
||||
};
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
|
||||
|
||||
// track tensors that had to be moved for debugging:
|
||||
size_t n_tensors_moved = 0;
|
||||
std::string first_tensor_moved_name;
|
||||
std::string first_tensor_moved_type_name;
|
||||
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
||||
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
||||
|
||||
llama_model_loader(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
bool use_mmap,
|
||||
|
|
@ -149,7 +177,9 @@ struct llama_model_loader {
|
|||
|
||||
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
|
||||
|
||||
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
|
||||
struct ggml_tensor * create_tensor(
|
||||
const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
|
||||
const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
|
||||
|
||||
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
|
||||
|
||||
|
|
|
|||
|
|
@ -7,14 +7,19 @@
|
|||
#include "llama-model.h"
|
||||
#include "llama-vocab.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
|
||||
gguf_ctx = gguf_init_empty();
|
||||
}
|
||||
llama_model_saver::llama_model_saver(const struct llama_model * model) :
|
||||
gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
|
||||
|
||||
llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
|
||||
gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
|
||||
|
||||
llama_model_saver::~llama_model_saver() {
|
||||
gguf_free(gguf_ctx);
|
||||
if (gguf_ctx_owned) {
|
||||
gguf_free(gguf_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
|
||||
|
|
@ -46,7 +51,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
|
|||
|
||||
template <typename Container>
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
|
||||
const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
|
||||
GGML_ASSERT(model != nullptr || !per_layer);
|
||||
const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
|
||||
GGML_ASSERT(n_values <= value.size());
|
||||
|
||||
if (n_values == 0) {
|
||||
|
|
@ -83,6 +89,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
// instantiate for external usage:
|
||||
template void llama_model_saver::add_kv<std::vector<uint32_t>>(const enum llm_kv, const std::vector<uint32_t> &, const bool);
|
||||
|
||||
void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
|
||||
std::vector<const char *> tmp(value.size());
|
||||
|
|
@ -104,37 +112,39 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
|
|||
}
|
||||
|
||||
void llama_model_saver::add_kv_from_model() {
|
||||
const llama_hparams & hparams = model.hparams;
|
||||
const llama_vocab & vocab = model.vocab;
|
||||
const llama_hparams & hparams = model->hparams;
|
||||
const llama_vocab & vocab = model->vocab;
|
||||
|
||||
const int32_t n_vocab = vocab.n_tokens();
|
||||
std::vector<std::string> tokens(n_vocab);
|
||||
std::vector<float> scores(n_vocab);
|
||||
std::vector<int32_t> token_types(n_vocab);
|
||||
|
||||
for (int32_t id = 0; id < n_vocab; ++id) {
|
||||
const llama_vocab::token_data & token_data = vocab.get_token_data(id);
|
||||
if (vocab.get_type() != LLAMA_VOCAB_TYPE_NONE) {
|
||||
for (int32_t id = 0; id < n_vocab; ++id) {
|
||||
const llama_vocab::token_data & token_data = vocab.get_token_data(id);
|
||||
|
||||
tokens[id] = token_data.text;
|
||||
scores[id] = token_data.score;
|
||||
tokens[id] = token_data.text;
|
||||
scores[id] = token_data.score;
|
||||
|
||||
switch(token_data.attr) {
|
||||
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
||||
case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
|
||||
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
||||
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
||||
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
||||
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
||||
switch(token_data.attr) {
|
||||
case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
|
||||
case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
|
||||
case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
|
||||
case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
|
||||
case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
|
||||
case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
|
||||
case LLAMA_TOKEN_ATTR_UNDEFINED:
|
||||
default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add_kv(LLM_KV_GENERAL_TYPE, ???);
|
||||
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
|
||||
add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
|
||||
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
|
||||
add_kv(LLM_KV_GENERAL_NAME, model.name);
|
||||
add_kv(LLM_KV_GENERAL_NAME, model->name);
|
||||
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
|
||||
// add_kv(LLM_KV_GENERAL_VERSION, ???);
|
||||
// add_kv(LLM_KV_GENERAL_URL, ???);
|
||||
|
|
@ -176,8 +186,10 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
||||
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||||
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
|
|
@ -189,7 +201,8 @@ void llama_model_saver::add_kv_from_model() {
|
|||
|
||||
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
||||
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
|
||||
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
|
||||
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
|
||||
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
|
||||
|
|
@ -255,25 +268,25 @@ void llama_model_saver::add_kv_from_model() {
|
|||
}
|
||||
|
||||
void llama_model_saver::add_tensors_from_model() {
|
||||
if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
|
||||
add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
|
||||
if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
|
||||
add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
|
||||
}
|
||||
add_tensor(model.type_embd);
|
||||
add_tensor(model.pos_embd);
|
||||
add_tensor(model.tok_norm);
|
||||
add_tensor(model.tok_norm_b);
|
||||
add_tensor(model.output_norm);
|
||||
add_tensor(model.output_norm_b);
|
||||
add_tensor(model.output);
|
||||
add_tensor(model.output_b);
|
||||
add_tensor(model.output_norm_enc);
|
||||
add_tensor(model.cls);
|
||||
add_tensor(model.cls_b);
|
||||
add_tensor(model.cls_out);
|
||||
add_tensor(model.cls_out_b);
|
||||
add_tensor(model.cls_norm);
|
||||
add_tensor(model->type_embd);
|
||||
add_tensor(model->pos_embd);
|
||||
add_tensor(model->tok_norm);
|
||||
add_tensor(model->tok_norm_b);
|
||||
add_tensor(model->output_norm);
|
||||
add_tensor(model->output_norm_b);
|
||||
add_tensor(model->output);
|
||||
add_tensor(model->output_b);
|
||||
add_tensor(model->output_norm_enc);
|
||||
add_tensor(model->cls);
|
||||
add_tensor(model->cls_b);
|
||||
add_tensor(model->cls_out);
|
||||
add_tensor(model->cls_out_b);
|
||||
add_tensor(model->cls_norm);
|
||||
|
||||
for (const struct llama_layer & layer : model.layers) {
|
||||
for (const struct llama_layer & layer : model->layers) {
|
||||
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "gguf.h"
|
||||
#include "llama.h"
|
||||
#include "llama-arch.h"
|
||||
|
||||
|
|
@ -7,10 +8,12 @@
|
|||
|
||||
struct llama_model_saver {
|
||||
struct gguf_context * gguf_ctx = nullptr;
|
||||
const struct llama_model & model;
|
||||
const bool gguf_ctx_owned;
|
||||
const struct llama_model * model;
|
||||
const struct LLM_KV llm_kv;
|
||||
|
||||
llama_model_saver(const struct llama_model & model);
|
||||
llama_model_saver(const struct llama_model * model);
|
||||
llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx);
|
||||
~llama_model_saver();
|
||||
|
||||
void add_kv(enum llm_kv key, uint32_t value);
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -54,6 +54,7 @@ enum llm_type {
|
|||
LLM_TYPE_0_3B,
|
||||
LLM_TYPE_0_5B,
|
||||
LLM_TYPE_0_6B,
|
||||
LLM_TYPE_0_8B,
|
||||
LLM_TYPE_1B,
|
||||
LLM_TYPE_1_2B,
|
||||
LLM_TYPE_1_3B,
|
||||
|
|
@ -125,12 +126,15 @@ enum llm_type {
|
|||
LLM_TYPE_100B_A6B,
|
||||
LLM_TYPE_102B_A12B, // Solar-Open
|
||||
LLM_TYPE_106B_A12B, // GLM-4.5-Air
|
||||
LLM_TYPE_120B_A12B, // Nemotron 3 Super
|
||||
LLM_TYPE_122B_A10B, // Qwen3.5
|
||||
LLM_TYPE_196B_A11B, // Step3.5-Flash
|
||||
LLM_TYPE_230B_A10B, // Minimax M2
|
||||
LLM_TYPE_235B_A22B,
|
||||
LLM_TYPE_300B_A47B, // Ernie MoE big
|
||||
LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
|
||||
LLM_TYPE_355B_A32B, // GLM-4.5
|
||||
LLM_TYPE_397B_A17B, // Qwen3.5
|
||||
LLM_TYPE_744B_A40B, // GLM-5
|
||||
LLM_TYPE_E2B,
|
||||
LLM_TYPE_E4B,
|
||||
|
|
@ -291,6 +295,15 @@ struct llama_layer {
|
|||
struct ggml_tensor * ffn_up_exps_b = nullptr;
|
||||
struct ggml_tensor * ffn_gate_up_exps_b = nullptr;
|
||||
|
||||
// ff MoE per-expert scales (NVFP4 per-tensor scale2)
|
||||
struct ggml_tensor * ffn_gate_exps_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps_s = nullptr;
|
||||
|
||||
// ff MoE latent proj
|
||||
struct ggml_tensor * ffn_latent_down = nullptr;
|
||||
struct ggml_tensor * ffn_latent_up = nullptr;
|
||||
|
||||
// ff shared expert (shexp)
|
||||
struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
|
||||
struct ggml_tensor * ffn_gate_shexp = nullptr;
|
||||
|
|
@ -384,13 +397,21 @@ struct llama_layer {
|
|||
struct ggml_tensor * rope_freqs = nullptr;
|
||||
|
||||
// bitnet scale
|
||||
struct ggml_tensor * wq_scale = nullptr;
|
||||
struct ggml_tensor * wk_scale = nullptr;
|
||||
struct ggml_tensor * wv_scale = nullptr;
|
||||
struct ggml_tensor * wo_scale = nullptr;
|
||||
struct ggml_tensor * ffn_gate_scale = nullptr;
|
||||
struct ggml_tensor * ffn_up_scale = nullptr;
|
||||
struct ggml_tensor * ffn_down_scale = nullptr;
|
||||
struct ggml_tensor * wq_s = nullptr;
|
||||
struct ggml_tensor * wk_s = nullptr;
|
||||
struct ggml_tensor * wv_s = nullptr;
|
||||
struct ggml_tensor * wo_s = nullptr;
|
||||
struct ggml_tensor * wqkv_s = nullptr;
|
||||
struct ggml_tensor * wqkv_gate_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_shexp_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_shexp_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_shexp_s = nullptr;
|
||||
struct ggml_tensor * ssm_out_s = nullptr;
|
||||
struct ggml_tensor * ssm_alpha_s = nullptr;
|
||||
struct ggml_tensor * ssm_beta_s = nullptr;
|
||||
|
||||
// altup & laurel
|
||||
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1719,7 +1719,7 @@ private:
|
|||
};
|
||||
|
||||
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
struct gguf_context * ctx = ml.meta.get();
|
||||
struct gguf_context * ctx = ml.metadata;
|
||||
|
||||
// determine vocab type
|
||||
{
|
||||
|
|
@ -1833,7 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
// correct endiannes of data in precompiled_charsmap binary blob
|
||||
// correct endianness of data in precompiled_charsmap binary blob
|
||||
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
|
||||
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
|
||||
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "llama.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
#include "llama-impl.h"
|
||||
|
||||
#include "llama-chat.h"
|
||||
|
|
@ -12,6 +13,7 @@
|
|||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
|
@ -825,7 +827,8 @@ int64_t llama_time_us(void) {
|
|||
}
|
||||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
|
||||
const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
|
||||
// loading time will be recalculated after the first eval, so
|
||||
// we take page faults deferred by mmap() into consideration
|
||||
model.t_load_us = 0;
|
||||
|
|
@ -834,7 +837,8 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
model.t_start_us = tm.t_start_us;
|
||||
|
||||
try {
|
||||
llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
||||
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||
|
||||
ml.print_info();
|
||||
|
||||
|
|
@ -880,9 +884,13 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
|
|||
}
|
||||
|
||||
static struct llama_model * llama_model_load_from_file_impl(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
const std::string & path_model,
|
||||
std::vector<std::string> & splits,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
|
||||
ggml_time_init();
|
||||
|
||||
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
|
||||
|
|
@ -1003,7 +1011,7 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
props.memory_free/1024/1024);
|
||||
}
|
||||
|
||||
const int status = llama_model_load(path_model, splits, *model, params);
|
||||
const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
|
||||
GGML_ASSERT(status <= 0);
|
||||
if (status < 0) {
|
||||
if (status == -1) {
|
||||
|
|
@ -1019,6 +1027,18 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||
return model;
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_init_from_user(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data,
|
||||
void * set_tensor_data_ud,
|
||||
struct llama_model_params params) {
|
||||
GGML_ASSERT(metadata != nullptr);
|
||||
std::string path_model;
|
||||
std::vector<std::string> splits = {};
|
||||
params.use_mmap = false;
|
||||
params.use_extra_bufts = false;
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
||||
}
|
||||
// deprecated
|
||||
struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
|
|
@ -1030,7 +1050,7 @@ struct llama_model * llama_model_load_from_file(
|
|||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
std::vector<std::string> splits = {};
|
||||
return llama_model_load_from_file_impl(path_model, splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_splits(
|
||||
|
|
@ -1046,11 +1066,11 @@ struct llama_model * llama_model_load_from_splits(
|
|||
for (size_t i = 0; i < n_paths; ++i) {
|
||||
splits.push_back(paths[i]);
|
||||
}
|
||||
return llama_model_load_from_file_impl(splits.front(), splits, params);
|
||||
return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
|
||||
}
|
||||
|
||||
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
|
||||
llama_model_saver ms(*model);
|
||||
llama_model_saver ms(model);
|
||||
ms.add_kv_from_model();
|
||||
ms.add_tensors_from_model();
|
||||
ms.save(path_model);
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include "ggml-cpu.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-opt.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
|
@ -152,6 +153,7 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
|
@ -440,19 +442,30 @@ extern "C" {
|
|||
|
||||
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
||||
|
||||
typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
|
||||
|
||||
// Create a new model from GGUF metadata as well as a function to set the tensor data
|
||||
// - tensors are created as GGML_TYPE_F32 by default,
|
||||
// override by adding a tensor with the same name but a different name to the context
|
||||
LLAMA_API struct llama_model * llama_model_init_from_user(
|
||||
struct gguf_context * metadata,
|
||||
llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
|
||||
void * set_tensor_data_ud, // userdata for function
|
||||
struct llama_model_params params);
|
||||
|
||||
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
struct llama_model_params params),
|
||||
"use llama_model_load_from_file instead");
|
||||
|
||||
// Load the model from a file
|
||||
// Load a model from a file
|
||||
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
|
||||
// If the split file name does not follow this pattern, use llama_model_load_from_splits
|
||||
LLAMA_API struct llama_model * llama_model_load_from_file(
|
||||
const char * path_model,
|
||||
struct llama_model_params params);
|
||||
|
||||
// Load the model from multiple splits (support custom naming scheme)
|
||||
// Load a model from multiple splits (support custom naming scheme)
|
||||
// The paths must be in the correct order
|
||||
LLAMA_API struct llama_model * llama_model_load_from_splits(
|
||||
const char ** paths,
|
||||
|
|
@ -973,7 +986,7 @@ extern "C" {
|
|||
|
||||
// Logits for the ith token. For positive indices, Equivalent to:
|
||||
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
|
||||
// Negative indicies can be used to access logits in reverse order, -1 is the last logit.
|
||||
// Negative indices can be used to access logits in reverse order, -1 is the last logit.
|
||||
// returns NULL for invalid ids.
|
||||
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
|
|
@ -988,7 +1001,7 @@ extern "C" {
|
|||
|
||||
// Get the embeddings for the ith token. For positive indices, Equivalent to:
|
||||
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
|
||||
// Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
|
||||
// Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
|
||||
// shape: [n_embd] (1-dimensional)
|
||||
// returns NULL for invalid ids.
|
||||
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
||||
|
|
@ -1008,9 +1021,9 @@ extern "C" {
|
|||
// Returns LLAMA_TOKEN_NULL if no token was sampled.
|
||||
LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
// Get the backend sampled probabilites for the ith token
|
||||
// Get the backend sampled probabilities for the ith token
|
||||
// The index matches llama_get_sampled_token_ith().
|
||||
// Returns NULL if no probabilites were generated.
|
||||
// Returns NULL if no probabilities were generated.
|
||||
LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
|
||||
LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
|
||||
|
||||
|
|
@ -1337,7 +1350,7 @@ extern "C" {
|
|||
float tau,
|
||||
float eta);
|
||||
|
||||
/// @details Intializes a GBNF grammar, see grammars/README.md for details.
|
||||
/// @details Initializes a GBNF grammar, see grammars/README.md for details.
|
||||
/// @param vocab The vocabulary that this grammar will be used with.
|
||||
/// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
|
||||
/// @param grammar_root The name of the start symbol for the grammar.
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -127,7 +127,6 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
|
|||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU,
|
||||
hparams.expert_weights_norm, // norm_w (route_norm=True)
|
||||
hparams.expert_weights_scale, // scale_w
|
||||
hparams.expert_weights_scale, // w_scale (route_scale=2.826)
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,11 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -104,7 +103,7 @@ llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_pa
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -56,6 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
|
|||
);
|
||||
break;
|
||||
case LLM_TYPE_13B:
|
||||
case LLM_TYPE_UNKNOWN:
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -97,7 +96,7 @@ llm_build_bailingmoe::llm_build_bailingmoe(const llama_model & model, const llm_
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
false, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -90,7 +88,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,12 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -129,9 +127,17 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
|||
// feed-forward network
|
||||
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
|
||||
// MoE branch
|
||||
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, nullptr,
|
||||
model.layers[il].ffn_down_exps, nullptr, hparams.n_expert, hparams.n_expert_used,
|
||||
LLM_FFN_GELU, false, false, 0.0f, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
||||
cur = build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
nullptr,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
hparams.n_expert, hparams.n_expert_used,
|
||||
LLM_FFN_GELU, false,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE ||
|
||||
model.arch == LLM_ARCH_JINA_BERT_V3) {
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@
|
|||
|
||||
|
||||
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -29,10 +29,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
if (model.layers[il].wq_scale) {
|
||||
Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
|
||||
}
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
|
|
@ -40,10 +37,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
}
|
||||
|
||||
// B1.K
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
if (model.layers[il].wk_scale) {
|
||||
Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
|
||||
}
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
|
|
@ -51,10 +45,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
}
|
||||
|
||||
// B1.V
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
if (model.layers[il].wv_scale) {
|
||||
Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
|
||||
}
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
|
|
@ -90,10 +81,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
LLM_NORM_RMS, il);
|
||||
cb(cur, "attn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].wo, cur);
|
||||
if (model.layers[il].wo_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
|
||||
}
|
||||
cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
|
||||
if (model.layers[il].bo) {
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
||||
}
|
||||
|
|
@ -115,8 +103,8 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
|
||||
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s,
|
||||
NULL, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
|
|
@ -127,10 +115,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
|||
LLM_NORM_RMS, il);
|
||||
cb(cur, "ffn_sub_norm", il);
|
||||
|
||||
cur = build_lora_mm(model.layers[il].ffn_down, cur);
|
||||
if (model.layers[il].ffn_down_scale) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
|
||||
}
|
||||
cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
|
||||
cb(cur, "ffn_down", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
#include <float.h>
|
||||
|
||||
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * inpL;
|
||||
ggml_tensor * cur;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@
|
|||
|
||||
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -89,7 +88,7 @@ llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -100,7 +98,7 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_grap
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
|
||||
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
|
||||
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
|
@ -146,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
cb(Qcur, "Qcur_attn_temp_scaled", il);
|
||||
}
|
||||
|
||||
// note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
|
||||
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
||||
cur = build_attn(inp_attn_k,
|
||||
model.layers[il].wo, NULL,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
|
||||
|
|
@ -216,7 +216,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il,
|
||||
nullptr,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
#include "models.h"
|
||||
|
||||
#define CHUNK_SIZE 64
|
||||
#include "llama-impl.h"
|
||||
|
||||
// utility to get one slice from the third dimension
|
||||
// input dim: [x, y, c, b]
|
||||
|
|
@ -57,7 +57,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
g = ggml_permute(ctx0, g, 0, 2, 1, 3); // [g_0, n_tokens, H_v, n_seqs]
|
||||
b = ggml_permute(ctx0, b, 0, 2, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
|
||||
|
||||
const int CS = CHUNK_SIZE;
|
||||
const int CS = kda ? 16 : 64; // chunk size
|
||||
|
||||
const int pad = (CS - n_tokens % CS) % CS;
|
||||
const int n_chunks = (n_tokens + pad) / CS;
|
||||
|
|
@ -225,9 +225,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
|
||||
cb(kg_t, "key_gdiff_t", il);
|
||||
|
||||
ggml_tensor * s_t = ggml_transpose(ctx0, s);
|
||||
s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
|
||||
cb(s_t, "dnet_add_ch_state", il);
|
||||
s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
|
||||
cb(s, "dnet_add_ch_state", il);
|
||||
|
||||
// [CS, S_v, n_chunks, H_v * n_seqs]
|
||||
ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
|
||||
|
|
@ -240,7 +239,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
|
||||
|
||||
// [CS, S_v, 1, H_v * n_seqs]
|
||||
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
|
||||
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
|
||||
cb(v_t_p, "v_prime", il);
|
||||
|
||||
// [CS, S_v, 1, H_v * n_seqs]
|
||||
|
|
@ -252,7 +251,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
cb(v_attn, "v_attn", il);
|
||||
|
||||
// [S_v, CS, 1, H_v * n_seqs]
|
||||
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
|
||||
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
|
||||
cb(attn_inter, "attn_inter", il);
|
||||
|
||||
// [S_v, CS, 1, H_v * n_seqs]
|
||||
|
|
@ -268,13 +267,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
|
||||
ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);
|
||||
|
||||
s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
|
||||
s_t = ggml_add(ctx0, s_t, kgv);
|
||||
cb(s_t, "dnet_add_ch_state", il);
|
||||
s = ggml_mul(ctx0, s, ch_g_last_exp_t);
|
||||
s = ggml_add(ctx0, s, kgv);
|
||||
cb(s, "dnet_add_ch_state", il);
|
||||
}
|
||||
|
||||
s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
|
||||
|
||||
// truncate padded tokens
|
||||
ggml_tensor * o = ggml_view_4d(ctx0, v,
|
||||
S_v, n_tokens, H_v, n_seqs,
|
||||
|
|
@ -282,7 +279,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
ggml_row_size(v->type, S_v * CS * n_chunks),
|
||||
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
|
||||
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
|
||||
s = ggml_transpose(ctx0, s_t);
|
||||
s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
|
||||
cb(s, "output_state", il);
|
||||
|
||||
return {o, s};
|
||||
|
|
@ -341,11 +338,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
g = ggml_exp(ctx0, g);
|
||||
s = ggml_mul(ctx0, s, g);
|
||||
|
||||
ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
|
||||
|
||||
// [1, S_v, H_v, n_seqs]
|
||||
ggml_tensor * sk;
|
||||
sk = ggml_mul (ctx0, s_t, k);
|
||||
sk = ggml_mul (ctx0, s, k);
|
||||
sk = ggml_sum_rows(ctx0, sk);
|
||||
|
||||
// [S_v, 1, H_v, n_seqs]
|
||||
|
|
@ -362,15 +357,89 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
|
|||
k = ggml_repeat(ctx0, k, s);
|
||||
kd = ggml_mul (ctx0, k, d_t);
|
||||
|
||||
s_t = ggml_add(ctx0, s_t, kd);
|
||||
s = ggml_add(ctx0, s, kd);
|
||||
|
||||
cb(s_t, "dnet_add_ar_state", il);
|
||||
cb(s, "dnet_add_ar_state", il);
|
||||
|
||||
ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
|
||||
ggml_tensor * s_q = ggml_mul (ctx0, s, q);
|
||||
ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
|
||||
|
||||
o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
|
||||
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
|
||||
|
||||
return {o, s};
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net_fused(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il) {
|
||||
const int64_t S_k = q->ne[0];
|
||||
const int64_t H_k = q->ne[1];
|
||||
const int64_t n_tokens = q->ne[2];
|
||||
const int64_t n_seqs = q->ne[3];
|
||||
|
||||
const int64_t S_v = v->ne[0];
|
||||
const int64_t H_v = v->ne[1];
|
||||
|
||||
GGML_ASSERT(S_k == S_v);
|
||||
GGML_ASSERT(H_v % H_k == 0);
|
||||
|
||||
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
|
||||
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
|
||||
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
|
||||
|
||||
GGML_ASSERT(g->ne[0] == 1 || g->ne[0] == S_v);
|
||||
GGML_ASSERT( g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
|
||||
GGML_ASSERT(b->ne[0] == 1 && b->ne[1] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
|
||||
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
|
||||
|
||||
ggml_tensor * result = ggml_gated_delta_net(ctx0, q, k, v, g, b, s);
|
||||
if (n_tokens == 1) {
|
||||
cb(result, LLAMA_TENSOR_NAME_FGDN_AR, il);
|
||||
} else {
|
||||
cb(result, LLAMA_TENSOR_NAME_FGDN_CH, il);
|
||||
}
|
||||
|
||||
ggml_tensor * output = ggml_view_4d(ctx0, result,
|
||||
S_v, H_v, n_tokens, n_seqs,
|
||||
ggml_row_size(result->type, S_v),
|
||||
ggml_row_size(result->type, S_v * H_v),
|
||||
ggml_row_size(result->type, S_v * H_v * n_tokens), 0);
|
||||
|
||||
ggml_tensor * new_state = ggml_view_4d(ctx0, result,
|
||||
S_v, S_v, H_v, n_seqs,
|
||||
ggml_row_size(result->type, S_v),
|
||||
ggml_row_size(result->type, S_v * S_v),
|
||||
ggml_row_size(result->type, S_v * S_v * H_v),
|
||||
ggml_row_size(result->type, S_v * H_v * n_tokens * n_seqs));
|
||||
|
||||
return {output, new_state};
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_net(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il) {
|
||||
const int64_t n_seq_tokens = q->ne[2];
|
||||
|
||||
if (n_seq_tokens == 1) {
|
||||
if (cparams.fused_gdn_ar) {
|
||||
return build_delta_net_fused(q, k, v, g, b, s, il);
|
||||
}
|
||||
return build_delta_net_autoregressive(q, k, v, g, b, s, il);
|
||||
}
|
||||
|
||||
if (cparams.fused_gdn_ch) {
|
||||
return build_delta_net_fused(q, k, v, g, b, s, il);
|
||||
}
|
||||
|
||||
return build_delta_net_chunking(q, k, v, g, b, s, il);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -91,7 +89,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@
|
|||
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
//copied from qwen2
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,13 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -103,7 +101,7 @@ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -100,7 +99,7 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@
|
|||
|
||||
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@
|
|||
template <bool iswa>
|
||||
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
|
||||
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
|
||||
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
template <bool iswa>
|
||||
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params),
|
||||
model(model),
|
||||
n_embd_head(model.hparams.n_embd_head_k),
|
||||
n_embd_head(model.hparams.n_embd_head_k()),
|
||||
n_embd_altup(model.hparams.n_embd_altup),
|
||||
n_altup(model.hparams.n_altup),
|
||||
i_altup_act(model.hparams.i_altup_act) {
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
|
@ -128,7 +128,7 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(routed_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * pos;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -160,7 +159,7 @@ ggml_tensor * llm_build_granite_hybrid::build_layer_ffn(ggml_tensor * cur,
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,15 +1,14 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
llm_build_granite::llm_build_granite(
|
||||
const llama_model & model,
|
||||
const llm_graph_params & params)
|
||||
: llm_graph_context(params) {
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -175,7 +174,7 @@ ggml_tensor * llm_build_granite::build_layer_ffn(
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -99,7 +99,7 @@ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_GELU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,14 +1,12 @@
|
|||
#include "models.h"
|
||||
|
||||
|
||||
|
||||
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -90,7 +88,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il,
|
||||
probs);
|
||||
|
|
@ -106,7 +104,7 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_grap
|
|||
nullptr,
|
||||
n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il,
|
||||
probs);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -119,8 +119,7 @@ llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const ll
|
|||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU,
|
||||
true, // norm_topk_prob
|
||||
false,
|
||||
0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur_moe, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
// JAIS-2 model graph builder
|
||||
// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
|
||||
llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -76,7 +76,7 @@ llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_para
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
#include "models.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include "llama-memory-recurrent.h"
|
||||
|
||||
|
|
@ -103,7 +102,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
||||
// qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
|
||||
// Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot(); // config.qk_rope_head_dim
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
|
||||
// Attention scale for MLA
|
||||
const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
|
||||
|
|
@ -118,12 +117,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
// Check layer type by checking which tensors exist
|
||||
// KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
|
||||
bool is_kda = (layer.ssm_a != nullptr);
|
||||
bool is_mla = (layer.wkv_a_mqa != nullptr);
|
||||
|
||||
if (is_kda) {
|
||||
if (hparams.is_recurrent(il)) {
|
||||
// === KDA Layer (Kimi Delta Attention) with Recurrent State ===
|
||||
// Reference: vLLM kda.py
|
||||
const auto * mctx_cur = inp_rs->mctx;
|
||||
|
|
@ -175,9 +169,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
Kcur = ggml_l2_norm(ctx0, Kcur, eps_norm);
|
||||
|
||||
// Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens
|
||||
std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ?
|
||||
build_delta_net_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) :
|
||||
build_delta_net_chunking(Qcur, Kcur, Vcur, g1, beta, state, il);
|
||||
auto attn_out = build_delta_net(Qcur, Kcur, Vcur, g1, beta, state, il);
|
||||
|
||||
ggml_tensor * output = ggml_cont(ctx0, attn_out.first);
|
||||
ggml_tensor * new_state = attn_out.second;
|
||||
|
|
@ -211,7 +203,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
cur = ggml_mul_mat(ctx0, layer.wo, gated);
|
||||
cb(cur, "kda_out", il);
|
||||
|
||||
} else if (is_mla) {
|
||||
} else {
|
||||
// === MLA Layer (Multi-head Latent Attention) without KV Cache ===
|
||||
// Reference: vLLM mla.py
|
||||
// Step 1: Q projection and reshape
|
||||
|
|
@ -310,9 +302,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
|
||||
cb(cur, "mla_out", il);
|
||||
}
|
||||
} else {
|
||||
// Unknown layer type - this should not happen
|
||||
GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
|
||||
}
|
||||
|
||||
// On last layer, select only the output tokens
|
||||
|
|
@ -349,7 +338,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
hparams.n_expert,
|
||||
hparams.n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -23,17 +23,23 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
|
|||
};
|
||||
auto build_moe_feed_forward = [&model, this](ggml_tensor * cur, int il) -> ggml_tensor * {
|
||||
return build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, 0.0,
|
||||
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func),
|
||||
il);
|
||||
};
|
||||
auto build_attn_block = [&model, this](ggml_tensor * cur,
|
||||
ggml_tensor * inp_pos,
|
||||
inp_attn_type * inp_attn,
|
||||
int il) -> ggml_tensor * {
|
||||
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
|
||||
const auto n_embd_head = hparams.n_embd_head_v;
|
||||
const auto n_embd_head = hparams.n_embd_head_v();
|
||||
const auto n_head_kv = hparams.n_head_kv(il);
|
||||
|
||||
auto * q = build_lora_mm(model.layers[il].wq, cur);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -90,7 +90,7 @@ llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_gr
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -134,7 +134,7 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
il);
|
||||
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
template <bool embed>
|
||||
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -43,19 +43,19 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
||||
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
||||
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
||||
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
|
|
@ -91,6 +91,9 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
cur = build_attn(inp_attn,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
|
||||
if (model.layers[il].wo_s) {
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
|
||||
}
|
||||
cb(cur, "attn_out", il);
|
||||
}
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
|
|
@ -109,9 +112,9 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = build_ffn(cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up_s,
|
||||
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate_s,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down_s,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
|
@ -130,9 +133,13 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
il,
|
||||
nullptr, nullptr,
|
||||
model.layers[il].ffn_up_exps_s,
|
||||
model.layers[il].ffn_gate_exps_s,
|
||||
model.layers[il].ffn_down_exps_s);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
|
|||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(d_inner % n_head == 0);
|
||||
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
|
@ -167,6 +168,9 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
|
|||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(d_inner % n_head == 0);
|
||||
GGML_ASSERT(d_inner % d_state == 0);
|
||||
GGML_ASSERT(d_inner % n_group == 0);
|
||||
|
||||
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
|
||||
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
#include "models.h"
|
||||
|
||||
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
|
|
@ -88,10 +87,17 @@ llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_
|
|||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
// MoE branch
|
||||
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
|
||||
0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
|
||||
cur = build_moe_ffn(cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
|
|||
const int64_t n_embd_base = 256;
|
||||
const float scale_embd = 12.0f;
|
||||
const float scale_depth = 1.4f;
|
||||
const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
|
||||
const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k()));
|
||||
|
||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
|
||||
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
||||
|
|
@ -51,21 +51,21 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
|
|||
LLM_NORM_RMS, il);
|
||||
cb(q, "q", il);
|
||||
|
||||
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
||||
// {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens}
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
||||
cb(q, "q", il);
|
||||
|
||||
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k()),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
|
||||
0);
|
||||
cb(q_nope, "q_nope", il);
|
||||
|
||||
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
||||
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k()),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
|
||||
ggml_row_size(q->type, n_embd_head_qk_nope));
|
||||
cb(q_pe, "q_pe", il);
|
||||
|
||||
|
|
@ -97,15 +97,15 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
|
|||
|
||||
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
||||
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
|
||||
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
|
||||
0);
|
||||
cb(k_nope, "k_nope", il);
|
||||
|
||||
// and {n_head * n_embd_head_v, n_tokens}
|
||||
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
||||
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
||||
cb(v_states, "v_states", il);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,10 @@
|
|||
|
||||
#include "models.h"
|
||||
|
||||
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
// GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
// GGML_ASSERT(n_embd_head == n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -91,7 +90,7 @@ llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -127,7 +127,7 @@ llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_grap
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#include "llama-model.h"
|
||||
#include "llama-graph.h"
|
||||
|
||||
// note: almost all graphs require atleast sqrtf, so include cmath globally
|
||||
// note: almost all graphs require at least sqrtf, so include cmath globally
|
||||
#include <cmath>
|
||||
|
||||
//
|
||||
|
|
@ -44,6 +44,26 @@ struct llm_build_delta_net_base : public llm_graph_context {
|
|||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
|
||||
// use the ggml_gated_delta_net fused operator
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net_fused(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
|
||||
// choose one of two implementations above based on the number of tokens
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_delta_net(
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
ggml_tensor * g,
|
||||
ggml_tensor * b,
|
||||
ggml_tensor * s,
|
||||
int il);
|
||||
};
|
||||
|
||||
struct llm_build_rwkv6_base : public llm_graph_context {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * pos;
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -114,9 +114,18 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
|
|||
LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
ggml_tensor * ffn_inp = cur;
|
||||
ggml_tensor * inp_emb = cur;
|
||||
ggml_tensor * inp_latent = cur;
|
||||
|
||||
if (model.layers[il].ffn_latent_down) {
|
||||
inp_latent = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_down, cur);
|
||||
}
|
||||
|
||||
ggml_tensor * router_logits = build_lora_mm(model.layers[il].ffn_gate_inp, cur);
|
||||
cb(router_logits, "ffn_moe_logits", il);
|
||||
|
||||
ggml_tensor * moe_out =
|
||||
build_moe_ffn(ffn_inp,
|
||||
build_moe_ffn(inp_latent,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
nullptr, // no gate
|
||||
|
|
@ -124,12 +133,17 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
|
|||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
||||
il);
|
||||
il,
|
||||
router_logits);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
|
||||
if (model.layers[il].ffn_latent_up) {
|
||||
moe_out = ggml_mul_mat(ctx0, model.layers[il].ffn_latent_up, moe_out);
|
||||
}
|
||||
|
||||
ggml_tensor * ffn_shexp = build_ffn(inp_emb,
|
||||
model.layers[il].ffn_up_shexp, NULL, NULL,
|
||||
NULL /* no gate */ , NULL, NULL,
|
||||
model.layers[il].ffn_down_shexp, NULL, NULL,
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
//GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
//GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
template <bool iswa>
|
||||
llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
@ -92,7 +92,7 @@ llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_para
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
|
|||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SWIGLU_OAI_MOE, false,
|
||||
false, 0.0,
|
||||
hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
|
||||
il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
|
|||
|
||||
// NOTE: same with qwen2vl.cpp, but bias tensors are optional
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue