#include "llama.h" #include "llama-impl.h" #include "llama-chat.h" #include "llama-context.h" #include "llama-mmap.h" #include "llama-vocab.h" #include "llama-model-loader.h" #include "llama-model-saver.h" #include "llama-model.h" #include "ggml.h" #include "ggml-cpp.h" #include "ggml-backend.h" #include "gguf.h" #include #include #include #include #include #include #include #include #include #include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif // // interface implementation // const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type) { switch (flash_attn_type) { case LLAMA_FLASH_ATTN_TYPE_AUTO: return "auto"; case LLAMA_FLASH_ATTN_TYPE_DISABLED: return "disabled"; case LLAMA_FLASH_ATTN_TYPE_ENABLED: return "enabled"; } GGML_ABORT("fatal error"); } struct llama_sampler_chain_params llama_sampler_chain_default_params() { struct llama_sampler_chain_params result = { /*.no_perf =*/ true, }; return result; } size_t llama_max_devices(void) { return 16; } size_t llama_max_tensor_buft_overrides() { return 4096; } bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; } bool llama_supports_mlock(void) { return llama_mlock::SUPPORTED; } bool llama_supports_gpu_offload(void) { return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr || ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr || llama_supports_rpc(); } bool llama_supports_rpc(void) { return ggml_backend_reg_by_name("RPC") != nullptr; } void llama_backend_init(void) { ggml_time_init(); // needed to initialize f16 tables { struct ggml_init_params params = { 0, NULL, false }; struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } } void llama_numa_init(enum ggml_numa_strategy numa) { if (numa != GGML_NUMA_STRATEGY_DISABLED) { auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); GGML_ASSERT(dev && "CPU backend is not loaded"); auto * reg = ggml_backend_dev_backend_reg(dev); auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); if (numa_init_fn) { numa_init_fn(numa); } } } void llama_backend_free(void) { ggml_quantize_free(); } int64_t llama_time_us(void) { return ggml_time_us(); } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, const std::string & fname, std::vector & splits, FILE * file, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; time_meas tm(model.t_load_us); model.t_start_us = tm.t_start_us; try { llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); model.hparams.vocab_only = params.vocab_only; model.hparams.no_alloc = params.no_alloc; try { model.load_arch(ml); } catch(const std::exception & e) { throw std::runtime_error("error loading model architecture: " + std::string(e.what())); } try { model.load_hparams(ml); } catch(const std::exception & e) { throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what())); } if (model.arch == LLM_ARCH_CLIP) { throw std::runtime_error("CLIP cannot be used as main model, 
static struct llama_model * llama_model_load_from_file_impl(
        struct gguf_context * metadata,
        llama_model_set_tensor_data_t set_tensor_data,
        void * set_tensor_data_ud,
        const std::string & path_model,
        std::vector<std::string> & splits,
        FILE * file,
        struct llama_model_params params) {
    {
        int n_sources_defined = 0;
        if (metadata != nullptr) {
            n_sources_defined++;
        }
        if (!path_model.empty()) {
            n_sources_defined++;
        }
        if (file != nullptr) {
            n_sources_defined++;
        }
        if (n_sources_defined != 1) {
            LLAMA_LOG_ERROR("%s: exactly one of metadata, path_model, and file must be defined\n", __func__);
            return nullptr;
        }
    }

    ggml_time_init();

    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
        return nullptr;
    }

    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                LLAMA_LOG_CONT(".");
                if (percentage >= 100) {
                    LLAMA_LOG_CONT("\n");
                }
            }
            return true;
        };
    }

    llama_model * model = new llama_model(params);

    // create list of devices to use with this model
    if (params.devices) {
        if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
            size_t n_devs = 0;
            while (params.devices[n_devs]) {
                n_devs++;
            }
            if (n_devs == 0) {
                LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
                return nullptr;
            }
            LLAMA_LOG_INFO("%s: creating a Meta device with %zu devices\n", __func__, n_devs);
            for (size_t i = 0; i < n_devs; ++i) {
                LLAMA_LOG_INFO("%s: - device %zu: %s\n", __func__, i, ggml_backend_dev_name(params.devices[i]));
            }
            model->get_split_state_ud.n_devices = n_devs;
            model->get_split_state_ud.model     = model;
            model->devices.push_back({ true, ggml_backend_meta_device(
                params.devices, n_devs, llama_meta_device_get_split_state, &model->get_split_state_ud) });
        } else {
            for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
                model->devices.push_back({false, *dev});
            }
        }
    } else {
        // default device selection

        // build list of available devices
        std::vector<llama_device> gpus;
        std::vector<llama_device> igpus;
        std::vector<llama_device> rpc_servers;

        if (params.split_mode == LLAMA_SPLIT_MODE_TENSOR) {
            std::vector<ggml_backend_dev_t> devs;
            devs.reserve(ggml_backend_dev_count());
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                auto * dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_buffer_type(dev) == ggml_backend_cpu_buffer_type()) {
                    LLAMA_LOG_INFO("%s: skipping %s (%s) for tensor parallelism\n",
                        __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
                    continue;
                }
                devs.push_back(dev);
            }
            if (devs.empty()) {
                LLAMA_LOG_ERROR("%s: LLAMA_SPLIT_MODE_TENSOR needs >= 1 devices\n", __func__);
                return nullptr;
            }
            LLAMA_LOG_INFO("%s: creating a Meta device for tensor parallelism from %zu devices:\n", __func__, devs.size());
            for (size_t i = 0; i < devs.size(); ++i) {
                LLAMA_LOG_INFO("%s: - device %zu: %s (%s)\n",
                    __func__, i, ggml_backend_dev_name(devs[i]), ggml_backend_dev_description(devs[i]));
            }
            GGML_ASSERT(!devs.empty());
            model->get_split_state_ud.n_devices = devs.size();
            model->get_split_state_ud.model     = model;
            gpus.push_back({ true, ggml_backend_meta_device(
                devs.data(), devs.size(), llama_meta_device_get_split_state, &model->get_split_state_ud) });
        } else {
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                switch (ggml_backend_dev_type(dev)) {
                    case GGML_BACKEND_DEVICE_TYPE_CPU:
                    case GGML_BACKEND_DEVICE_TYPE_ACCEL:
                        // skip CPU backends since they are handled separately
                        break;

                    case GGML_BACKEND_DEVICE_TYPE_GPU: {
                        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
                        if (ggml_backend_reg_name(reg) == std::string("RPC")) {
                            rpc_servers.push_back({false, dev});
                        } else {
                            // check if there is already a GPU with the same device id
                            ggml_backend_dev_props props;
                            ggml_backend_dev_get_props(dev, &props);
                            auto it = std::find_if(gpus.begin(), gpus.end(), [&props](const llama_device & d) {
                                ggml_backend_dev_props d_props;
                                ggml_backend_dev_get_props(d.dev, &d_props);
                                if (props.device_id && d_props.device_id) {
                                    return strcmp(props.device_id, d_props.device_id) == 0;
                                }
                                return false;
                            });
                            if (it != gpus.end()) {
                                LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
                                    __func__,
                                    ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                                    props.device_id ? props.device_id : "unknown id",
                                    ggml_backend_dev_name(it->dev), ggml_backend_dev_description(it->dev));
                            } else {
                                gpus.push_back({false, dev});
                            }
                        }
                        break;
                    }

                    case GGML_BACKEND_DEVICE_TYPE_IGPU:
                        igpus.push_back({false, dev});
                        break;

                    case GGML_BACKEND_DEVICE_TYPE_META:
                        GGML_ABORT("fatal error");
                }
            }
        }

        // add RPC servers at the front of the list to minimize network transfers
        model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());

        // add GPUs
        model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());

        // add integrated GPUs only if no other devices were found
        if (model->devices.empty()) {
            model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
        }
    }

    // if using single GPU mode, remove all except the main GPU
    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
        if (params.main_gpu < 0) {
            model->devices.clear();
        } else {
            if (params.main_gpu >= (int)model->devices.size()) {
                LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n",
                    __func__, params.main_gpu, model->devices.size());
                llama_model_free(model);
                return nullptr;
            }
            llama_device main_gpu = model->devices[params.main_gpu];
            model->devices.clear();
            model->devices.push_back(main_gpu);
        }
    }

    for (const auto & dev : model->devices) {
        ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev.dev, &props);
        LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n",
            __func__, ggml_backend_dev_name(dev.dev), ggml_backend_dev_description(dev.dev),
            props.device_id ? props.device_id : "unknown id",
            props.memory_free/1024/1024);
    }
props.device_id : "unknown id", props.memory_free/1024/1024); } const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); } else if (status == -2) { LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); } llama_model_free(model); return nullptr; } return model; } struct llama_model * llama_model_init_from_user( struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, struct llama_model_params params) { GGML_ASSERT(metadata != nullptr); std::string path_model; std::vector splits = {}; params.use_mmap = false; params.use_extra_bufts = false; return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params); } // deprecated struct llama_model * llama_load_model_from_file( const char * path_model, struct llama_model_params params) { return llama_model_load_from_file(path_model, params); } struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector splits = {}; return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params); } struct llama_model * llama_model_load_from_splits( const char ** paths, size_t n_paths, struct llama_model_params params) { std::vector splits; if (n_paths == 0) { LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__); return nullptr; } splits.reserve(n_paths); for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params); } struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) { if (!file) { LLAMA_LOG_ERROR("%s: file is NULL\n", __func__); return nullptr; } std::string path_model; std::vector splits = {}; return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params); } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { llama_model_saver ms(model); ms.add_kv_from_model(); ms.add_tensors_from_model(); ms.save(path_model); } // // chat templates // int32_t llama_chat_apply_template( const char * tmpl, const struct llama_chat_message * chat, size_t n_msg, bool add_ass, char * buf, int32_t length) { const std::string curr_tmpl(tmpl == nullptr ? 
"chatml" : tmpl); // format the chat to string std::vector chat_vec; chat_vec.resize(n_msg); for (size_t i = 0; i < n_msg; i++) { chat_vec[i] = &chat[i]; } std::string formatted_chat; llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl); if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) { return -1; } int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass); if (res < 0) { return res; } if (buf && length > 0) { strncpy(buf, formatted_chat.c_str(), length); } return res; } // // model split // int32_t llama_split_path( char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count) { static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; const int written = snprintf( split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count ); if (written < 0 || (size_t) written >= maxlen) { return 0; } return (int32_t) written; } int32_t llama_split_prefix( char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count) { const std::string str_split_path(split_path); char postfix[32]; snprintf(postfix, sizeof(postfix), "-%05d-of-%05d.gguf", split_no + 1, split_count); const std::string str_postfix(postfix); if (str_split_path.size() <= str_postfix.size()) { return 0; } const size_t size_prefix = str_split_path.size() - str_postfix.size(); if (str_split_path.compare(size_prefix, std::string::npos, str_postfix) == 0) { const size_t copy_len = std::min(size_prefix + 1, maxlen); snprintf(split_prefix, copy_len, "%s", split_path); return (int32_t) size_prefix; } return 0; } const char * llama_print_system_info(void) { static std::string s; s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls. for (size_t i = 0; i < ggml_backend_reg_count(); i++) { auto * reg = ggml_backend_reg_get(i); auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); if (get_features_fn) { ggml_backend_feature * features = get_features_fn(reg); s += ggml_backend_reg_name(reg); s += " : "; for (; features->name; features++) { s += features->name; s += " = "; s += features->value; s += " | "; } } } return s.c_str(); }