#pragma once #include "llama-model.h" #include "llama-graph.h" #include "llama-model-loader.h" // note: almost all graphs require at least sqrtf, so include cmath globally #include // // base classes // struct llm_build_mamba_base : public llm_graph_context { llm_build_mamba_base(const llm_graph_params & params); virtual ~llm_build_mamba_base() = default; ggml_tensor * build_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il); ggml_tensor * build_mamba2_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il) const; }; struct llm_build_delta_net_base : public llm_graph_context { llm_build_delta_net_base(const llm_graph_params & params); virtual ~llm_build_delta_net_base() = default; // returns pair of output and new state std::pair build_delta_net_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * b, ggml_tensor * s, int il); // returns pair of output and new state std::pair build_delta_net_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * b, ggml_tensor * s, int il); // use the ggml_gated_delta_net fused operator (K=1; state has shape (D, 1, n_seqs)) std::pair build_delta_net_fused( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * b, ggml_tensor * s, int il); // choose one of two implementations above based on the number of tokens std::pair build_delta_net( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * b, ggml_tensor * s, int il); // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token) // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs) ggml_tensor * build_conv_state( llm_graph_input_rs * inp, ggml_tensor * conv_states_all, ggml_tensor * qkv_mixed, int64_t conv_kernel_size, int64_t conv_channels, int il); // run delta-net attention and write the new recurrent state(s) back to ssm_states_all // s: (head_v_dim, head_v_dim, num_v_heads, n_seqs); returns output: (head_v_dim, num_v_heads, n_seq_tokens, n_seqs) ggml_tensor * build_recurrent_attn( llm_graph_input_rs * inp, ggml_tensor * ssm_states_all, ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * b, ggml_tensor * s, int il); }; struct llm_build_rwkv6_base : public llm_graph_context { const llama_model & model; llm_build_rwkv6_base(const llama_model & model, const llm_graph_params & params); virtual ~llm_build_rwkv6_base() = default; ggml_tensor * build_rwkv6_channel_mix(const llama_layer * layer, ggml_tensor * cur, ggml_tensor * x_prev, llm_arch arch) const; ggml_tensor * build_rwkv6_time_mix(llm_graph_input_rs * inp, ggml_tensor * cur, ggml_tensor * x_prev, const llama_ubatch & ubatch, int il) const; }; // Base class for RWKV7-related models struct llm_build_rwkv7_base : public llm_graph_context { const llama_model & model; llm_build_rwkv7_base(const llama_model & model, const llm_graph_params & params); virtual ~llm_build_rwkv7_base() = default; // RWKV7-specific graph building methods ggml_tensor * build_rwkv7_channel_mix(const llama_layer * layer, ggml_tensor * cur, ggml_tensor * x_prev, llm_arch arch) const; ggml_tensor * build_rwkv7_time_mix(llm_graph_input_rs * inp, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor *& first_layer_value, const llama_ubatch & ubatch, int il) const; }; // // models // struct llama_model_llama : public llama_model_base { llama_model_llama(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_llama4 : public llama_model_base { llama_model_llama4(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_llama_embed : public llama_model_llama { llama_model_llama_embed(const struct llama_model_params & params) : llama_model_llama(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_llama template using graph = llama_model_llama::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_maincoder : public llama_model_base { llama_model_maincoder(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_deci : public llama_model_base { llama_model_deci(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_baichuan : public llama_model_base { llama_model_baichuan(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_falcon : public llama_model_base { llama_model_falcon(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_grok : public llama_model_base { llama_model_grok(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_starcoder : public llama_model_base { llama_model_starcoder(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_refact : public llama_model_base { llama_model_refact(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_bert : public llama_model_base { llama_model_bert(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_jina_bert_v2 : public llama_model_base { llama_model_jina_bert_v2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_bert::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_jina_bert_v3 : public llama_model_base { llama_model_jina_bert_v3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_bert::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_nomic_bert : public llama_model_base { llama_model_nomic_bert(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_bert::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_nomic_bert_moe : public llama_model_base { llama_model_nomic_bert_moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_bert::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_modern_bert : public llama_model_base { llama_model_modern_bert(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_neo_bert : public llama_model_base { llama_model_neo_bert(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_eurobert : public llama_model_base { llama_model_eurobert(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_bloom : public llama_model_base { llama_model_bloom(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_mpt : public llama_model_base { llama_model_mpt(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_stablelm : public llama_model_base { llama_model_stablelm(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen : public llama_model_base { llama_model_qwen(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen2 : public llama_model_base { llama_model_qwen2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_dream : public llama_model_base { llama_model_dream(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_llada : public llama_model_base { llama_model_llada(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_llada_moe : public llama_model_base { llama_model_llada_moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_rnd1 : public llama_model_base { llama_model_rnd1(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen2vl : public llama_model_base { llama_model_qwen2vl(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen2moe : public llama_model_base { llama_model_qwen2moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen3 : public llama_model_base { llama_model_qwen3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen3moe : public llama_model_base { llama_model_qwen3moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen3vl : public llama_model_base { llama_model_qwen3vl(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen3vlmoe : public llama_model_base { llama_model_qwen3vlmoe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_phi2 : public llama_model_base { llama_model_phi2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_phi3 : public llama_model_base { llama_model_phi3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_phimoe : public llama_model_base { llama_model_phimoe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template using graph = llama_model_phi3::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_plamo : public llama_model_base { llama_model_plamo(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_plamo2 : public llama_model_base { llama_model_plamo2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_mamba_base { graph(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_plamo2_mamba_layer(llm_graph_input_rs * inp, ggml_tensor * cur, const llama_model & model, const llama_ubatch & ubatch, int il); ggml_tensor * build_plamo2_attn_layer(llm_graph_input_attn_kv * inp, ggml_tensor * inp_pos, ggml_tensor * cur, const llama_model & model, int il); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_plamo3 : public llama_model_base { llama_model_plamo3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gpt2 : public llama_model_base { llama_model_gpt2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_codeshell : public llama_model_base { llama_model_codeshell(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_orion : public llama_model_base { llama_model_orion(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_internlm2 : public llama_model_base { llama_model_internlm2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_minicpm3 : public llama_model_base { llama_model_minicpm3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gemma : public llama_model_base { llama_model_gemma(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gemma2 : public llama_model_base { llama_model_gemma2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gemma3 : public llama_model_base { llama_model_gemma3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gemma3n : public llama_model_base { llama_model_gemma3n(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { const llama_model & model; const int64_t n_embd_head; const int64_t n_embd_altup; const int64_t n_altup; const int i_altup_act; const int n_layer_sparsity = 10; // number of layers using activation sparsity const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) graph(const llama_model & model, const llm_graph_params & params); ggml_tensor * calc_magnitude(ggml_tensor * x); // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER] ggml_tensor * build_inp_per_layer(); ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer); ggml_tensor * gaussian_topk(ggml_tensor * x); ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il); ggml_tensor * altup_predict(ggml_tensor * cur, int il); ggml_tensor * laurel(ggml_tensor * cur, int il); ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gemma4 : public llama_model_base { llama_model_gemma4(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { const llama_model & model; const int64_t n_embd_per_layer; graph(const llama_model & model, const llm_graph_params & params); // TODO: refactor in common "per-layer" functionality [TAG_PER_LAYER] ggml_tensor * build_inp_per_layer(); ggml_tensor * project_per_layer_inputs(ggml_tensor * inp_batch, ggml_tensor * inp_per_layer); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gemma_embedding : public llama_model_base { llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_starcoder2 : public llama_model_base { llama_model_starcoder2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_mamba : public llama_model_base { llama_model_mamba(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_mamba_base { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_mamba2 : public llama_model_base { llama_model_mamba2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_mamba::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_jamba : public llama_model_base { llama_model_jamba(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_mamba_base { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_xverse : public llama_model_base { llama_model_xverse(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_command_r : public llama_model_base { llama_model_command_r(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_cohere2 : public llama_model_base { llama_model_cohere2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_dbrx : public llama_model_base { llama_model_dbrx(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_olmo : public llama_model_base { llama_model_olmo(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_olmo2 : public llama_model_base { llama_model_olmo2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_olmoe : public llama_model_base { llama_model_olmoe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_openelm : public llama_model_base { llama_model_openelm(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_gptneox : public llama_model_base { llama_model_gptneox(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_arctic : public llama_model_base { llama_model_arctic(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_deepseek : public llama_model_base { llama_model_deepseek(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_deepseek2 : public llama_model_base { llama_model_deepseek2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_deepseek2ocr : public llama_model_base { llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_deepseek2::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_glm_dsa : public llama_model_base { llama_model_glm_dsa(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_deepseek2::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_mistral4 : public llama_model_deepseek2 { llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_deepseek2 using graph = llama_model_deepseek2::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_chatglm : public llama_model_base { llama_model_chatglm(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_glm4 : public llama_model_base { llama_model_glm4(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_glm4_moe : public llama_model_base { llama_model_glm4_moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_bitnet : public llama_model_base { llama_model_bitnet(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_t5 : public llama_model_base { llama_model_t5(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_t5encoder : public llama_model_base { llama_model_t5encoder(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_t5::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_jais : public llama_model_base { llama_model_jais(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_jais2 : public llama_model_base { llama_model_jais2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_nemotron : public llama_model_base { llama_model_nemotron(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_nemotron_h : public llama_model_base { llama_model_nemotron_h(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_mamba_base { graph(const llama_model & model, const llm_graph_params & params); ggml_tensor * build_ffn_layer(ggml_tensor * cur, const llama_model & model, int il); ggml_tensor * build_attention_layer(ggml_tensor * cur, llm_graph_input_attn_kv * inp_attn, const llama_model & model, int64_t n_embd_head, int il); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_nemotron_h_moe : public llama_model_nemotron_h { llama_model_nemotron_h_moe(const struct llama_model_params & params) : llama_model_nemotron_h(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_nemotron_h using graph = llama_model_nemotron_h::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_exaone : public llama_model_base { llama_model_exaone(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_exaone4 : public llama_model_base { llama_model_exaone4(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_exaone_moe : public llama_model_base { llama_model_exaone_moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_rwkv6 : public llama_model_base { llama_model_rwkv6(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_rwkv6_base { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_rwkv6qwen2 : public llama_model_base { llama_model_rwkv6qwen2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_rwkv6_base { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_rwkv7 : public llama_model_base { llama_model_rwkv7(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_rwkv7_base { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_arwkv7 : public llama_model_base { llama_model_arwkv7(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_rwkv7_base { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_granite : public llama_model_base { llama_model_granite(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_attention_layer( ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, const llama_model & model, const int64_t n_embd_head, const int il); ggml_tensor * build_layer_ffn( ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_granite_moe : public llama_model_base { llama_model_granite_moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_granite::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_minicpm : public llama_model_base { llama_model_minicpm(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; using graph = llama_model_granite::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_granite_hybrid : public llama_model_base { llama_model_granite_hybrid(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_mamba_base { graph(const llama_model & model, const llm_graph_params & params); ggml_tensor * build_layer_ffn(ggml_tensor * cur, ggml_tensor * inpSA, const llama_model & model, const int il); ggml_tensor * build_attention_layer(ggml_tensor * cur, ggml_tensor * inp_pos, llm_graph_input_attn_kv * inp_attn, const llama_model & model,const int64_t n_embd_head, const int il); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_chameleon : public llama_model_base { llama_model_chameleon(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_wavtokenizer_dec : public llama_model_base { llama_model_wavtokenizer_dec(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_plm : public llama_model_base { llama_model_plm(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_bailingmoe : public llama_model_base { llama_model_bailingmoe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_bailingmoe2 : public llama_model_base { llama_model_bailingmoe2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_seed_oss : public llama_model_base { llama_model_seed_oss(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_dots1 : public llama_model_base { llama_model_dots1(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_arcee : public llama_model_base { llama_model_arcee(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_afmoe : public llama_model_base { llama_model_afmoe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_ernie4_5 : public llama_model_base { llama_model_ernie4_5(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_ernie4_5_moe : public llama_model_ernie4_5 { llama_model_ernie4_5_moe(const struct llama_model_params & params) : llama_model_ernie4_5(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_ernie4_5 struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_paddleocr : public llama_model_ernie4_5 { llama_model_paddleocr(const struct llama_model_params & params) : llama_model_ernie4_5(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_ernie4_5 struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_hunyuan_moe : public llama_model_base { llama_model_hunyuan_moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_hunyuan_vl : public llama_model_base { llama_model_hunyuan_vl(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_hunyuan_dense : public llama_model_hunyuan_vl { llama_model_hunyuan_dense(const struct llama_model_params & params) : llama_model_hunyuan_vl(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_hunyuan_vl using graph = llama_model_hunyuan_vl::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_smollm3 : public llama_model_base { llama_model_smollm3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_openai_moe : public llama_model_base { llama_model_openai_moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_falcon_h1 : public llama_model_base { llama_model_falcon_h1(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_mamba_base { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_lfm2 : public llama_model_base { llama_model_lfm2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_lfm2moe : public llama_model_base { llama_model_lfm2moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template using graph = llama_model_lfm2::graph; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_smallthinker : public llama_model_base { llama_model_smallthinker(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; template struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_grovemoe : public llama_model_base { llama_model_grovemoe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_apertus : public llama_model_base { llama_model_apertus(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_minimax_m2 : public llama_model_base { llama_model_minimax_m2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_cogvlm : public llama_model_base { llama_model_cogvlm(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_pangu_embed : public llama_model_base { llama_model_pangu_embed(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen3next : public llama_model_base { llama_model_qwen3next(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_delta_net_base { graph(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_layer_attn( llm_graph_input_attn_kv * inp_attn, ggml_tensor * cur, ggml_tensor * inp_pos, int il); ggml_tensor * build_layer_attn_linear( llm_graph_input_rs * inp, ggml_tensor * cur, int il); ggml_tensor * build_layer_ffn( ggml_tensor * cur, int il); ggml_tensor * build_norm_gated( ggml_tensor * input, ggml_tensor * weights, ggml_tensor * gate, int layer); // returns pair of qkv, z std::pair build_qkvz( ggml_tensor * input, int il); const llama_model & model; }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen35 : public llama_model_base { llama_model_qwen35(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_delta_net_base { graph(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_layer_attn( llm_graph_input_attn_kv * inp_attn, ggml_tensor * cur, ggml_tensor * inp_pos, int * sections, int il); ggml_tensor * build_layer_attn_linear( llm_graph_input_rs * inp, ggml_tensor * cur, int il); ggml_tensor * build_layer_ffn( ggml_tensor * cur, int il); ggml_tensor * build_norm_gated( ggml_tensor * input, ggml_tensor * weights, ggml_tensor * gate, int layer); // returns pair of qkv, z std::pair build_qkvz( ggml_tensor * input, int il); const llama_model & model; }; struct graph_mtp : public llm_graph_context { graph_mtp(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_qwen35moe : public llama_model_base { llama_model_qwen35moe(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_delta_net_base { graph(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_layer_attn( llm_graph_input_attn_kv * inp_attn, ggml_tensor * cur, ggml_tensor * inp_pos, int * sections, int il); ggml_tensor * build_layer_attn_linear( llm_graph_input_rs * inp, ggml_tensor * cur, int il); ggml_tensor * build_layer_ffn( ggml_tensor * cur, int il); ggml_tensor * build_norm_gated( ggml_tensor * input, ggml_tensor * weights, ggml_tensor * gate, int layer); // returns pair of qkv, z std::pair build_qkvz( ggml_tensor * input, int il); const llama_model & model; }; struct graph_mtp : public llm_graph_context { graph_mtp(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_mistral3 : public llama_model_base { llama_model_mistral3(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_mimo2 : public llama_model_base { llama_model_mimo2(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_kimi_linear : public llama_model_base { llama_model_kimi_linear(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_build_delta_net_base { graph(const llama_model & model, const llm_graph_params & params); std::pair build_kda_autoregressive( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * gk, ggml_tensor * beta, ggml_tensor * state, int il); std::pair build_kda_chunking( ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * gk, ggml_tensor * beta, ggml_tensor * state, ggml_tensor * causal_mask, ggml_tensor * identity, ggml_tensor * diag_mask, int il); const llama_model & model; }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; struct llama_model_step35 : public llama_model_base { llama_model_step35(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; void load_arch_tensors(llama_model_loader & ml) override; struct graph : public llm_graph_context { graph(const llama_model & model, const llm_graph_params & params); }; std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; };