From 7c64fef91bcf0fbf473e8cb52bc1a4a14473ca8c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 19 Mar 2024 13:42:37 +0100 Subject: [PATCH 01/27] split: support in llama_model_loader --- examples/gguf-split/gguf-split.cpp | 145 +++++------- llama.cpp | 362 +++++++++++++++++++++-------- llama.h | 10 + 3 files changed, 344 insertions(+), 173 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 8e12e64937bd7..e45151ab1bc41 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,31 +1,34 @@ #include "llama.h" -#include "ggml.h" #include "common.h" #include #include -#include #include #include -#include #include #include #include -#include #include +#include +#include + +#if defined(_WIN32) + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif enum split_operation : uint8_t { SPLIT_OP_SPLIT, SPLIT_OP_MERGE, }; -static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split"; -static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count"; - -static const int SPLIT_FILENAME_MAX = 256; - -static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf"; +static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no"; +static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count"; +static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count"; struct split_params { split_operation operation = SPLIT_OP_SPLIT; @@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para try { if (!split_params_parse_ex(argc, argv, params)) { split_print_usage(argv[0]); - exit(1); + exit(EXIT_FAILURE); } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); split_print_usage(argv[0]); - exit(1); + exit(EXIT_FAILURE); } return result; } @@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) { } } -static std::string split_file_name(const std::string & path, int i_split, int n_split) { - char f_split[SPLIT_FILENAME_MAX] = {0}; - snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split); - return std::string(f_split); -} - struct split_strategy { const split_params params; std::ifstream & f_input; @@ -180,8 +177,9 @@ struct split_strategy { if (i_split == 0) { gguf_set_kv(ctx_out, ctx_gguf); } - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors); // populate the original tensors, so we get an initial metadata for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { @@ -189,10 +187,11 @@ struct split_strategy { gguf_add_tensor(ctx_out, meta); } - auto split_name = split_file_name(params.output, i_split, n_split); + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); - fprintf(stderr, "%s: %s ...", __func__, split_name.c_str()); - fout = std::ofstream(split_name, std::ios::binary); + fprintf(stderr, "%s: %s ...", __func__, split_path); + fout = std::ofstream(split_path, std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors auto meta_size = 
gguf_get_meta_size(ctx_out); @@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) { std::ifstream f_input(split_params.input.c_str(), std::ios::binary); if (!f_input.is_open()) { fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); if (!ctx_gguf) { fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + + char first_split_path[PATH_MAX] = {0}; + llama_split_path(first_split_path, sizeof(first_split_path), + split_params.output.c_str(), strategy.i_split, strategy.n_split); fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n", __func__, split_params.input.c_str(), - split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(), + first_split_path, split_params.n_split_tensors); strategy.split_start(); @@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) { std::vector ctx_metas; std::vector ctx_ggufs; - std::string split_prefix; + char split_path[PATH_MAX] = {0}; + strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); + char split_prefix[PATH_MAX] = {0}; // First pass to find KV and tensors metadata for (int i_split = 0; i_split < n_split; i_split++) { @@ -309,16 +314,15 @@ static void gguf_merge(const split_params & split_params) { /*.ctx = */ &ctx_meta, }; - auto split_name = split_params.input; if (i_split > 0) { - split_name = split_file_name(split_prefix, i_split, n_split); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); } - fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); - auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params); + auto * ctx_gguf = gguf_init_from_file(split_path, params); if (!ctx_gguf) { fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } ctx_ggufs.push_back(ctx_gguf); ctx_metas.push_back(ctx_meta); @@ -331,65 +335,43 @@ static void gguf_merge(const split_params & split_params) { __func__, LLM_KV_GENERAL_SPLIT_N_SPLIT); gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - n_split = gguf_get_val_u8(ctx_gguf, key_n_split); + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); if (n_split < 1) { fprintf(stderr, "\n%s: input file does not contain a valid split count %d\n", __func__, n_split); gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - // Do not trigger merge if we try to merge again the output - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); - - // Set metadata from the first split - gguf_set_kv(ctx_out, ctx_gguf); - } - - // Verify the file naming - { - int i_split_file = 0; - int n_split_file = 0; - const char * i_split_format = "-00000-of-00000.gguf"; - - if (split_name.size() < strlen(i_split_format)) { - fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } + // Verify the file naming and extract split_prefix + if (!llama_split_prefix(split_prefix, split_path, strlen(split_path), i_split, n_split)) { + 
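                // llama_split_prefix() succeeds only when the file name ends with the expected
                // "-%05d-of-%05d.gguf" suffix and the encoded numbers match i_split/n_split;
                // without that prefix the remaining shards cannot be located, so report the
                // offending name and exit below.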
fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d" + " n_split=%d\n", __func__, + split_path, i_split, n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format)); - - const char * split_name_c_str = split_name.c_str(); - int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file); + // Do not trigger merge if we try to merge again the output + gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); - if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) { - fprintf(stderr, "\n%s: unexpected input file name: %s" - " i_split=%d i_split_file=%d" - " n_split=%d n_split_file=%d\n", __func__, - split_params.input.c_str(), - i_split, i_split_file, - n_split, n_split_file); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } - gguf_free(ctx_out); - fout.close(); - exit(1); - } + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); } auto n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -411,18 +393,19 @@ static void gguf_merge(const split_params & split_params) { // Write tensors data for (int i_split = 0; i_split < n_split; i_split++) { - auto split_name = split_file_name(split_prefix, i_split, n_split); - std::ifstream f_input(split_name.c_str(), std::ios::binary); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + std::ifstream f_input(split_path, std::ios::binary); if (!f_input.is_open()) { - fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); + for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { + gguf_free(ctx_ggufs[i]); + ggml_free(ctx_metas[i]); } gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); auto * ctx_gguf = ctx_ggufs[i_split]; auto * ctx_meta = ctx_metas[i_split]; @@ -481,8 +464,8 @@ int main(int argc, const char ** argv) { break; case SPLIT_OP_MERGE: gguf_merge(params); break; - default:split_print_usage(argv[0]); - exit(1); + default: split_print_usage(argv[0]); + exit(EXIT_FAILURE); } return 0; diff --git a/llama.cpp b/llama.cpp index 1a9fe0c4d2cea..cd7a7b8d60cd8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -52,6 +52,9 @@ #define NOMINMAX #endif #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif #include #endif @@ -290,6 +293,10 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, + LLM_KV_SPLIT_NO, + LLM_KV_SPLIT_COUNT, + LLM_KV_SPLIT_TENSORS_COUNT, + LLM_KV_SSM_INNER_SIZE, LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, @@ -355,6 +362,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, @@ -1449,6 +1460,17 @@ struct llama_mlock { #endif }; +// Holds information on a tensor data source 
location. +struct llama_tensor_offset { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file + + llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, name); + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + } +}; + static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -2023,12 +2045,12 @@ struct llama_model { // the model memory buffers for the tensor data std::vector bufs; - // model memory mapped file - std::unique_ptr mapping; + // model memory mapped files + std::vector> mappings; // objects representing data potentially being locked in memory std::vector> mlock_bufs; - llama_mlock mlock_mmap; + std::vector> mlock_mmaps; // for quantize-stats only std::vector> tensors_by_name; @@ -2802,11 +2824,13 @@ struct llama_model_loader { bool use_mmap = false; - llama_file file; + std::vector> files; llama_ftype ftype; llama_fver fver; - std::unique_ptr mapping; + std::vector> mappings; + std::unordered_map tensors_offs; // unified tensor data offset accross files + std::unordered_map kv_overrides; struct gguf_context * ctx_gguf = NULL; @@ -2815,7 +2839,7 @@ struct llama_model_loader { std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { + llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -2836,10 +2860,103 @@ struct llama_model_loader { if (!ctx_gguf) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } + files.emplace_back(new llama_file(fname.c_str(), "rb")); get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + // Save tensors data offset of the main file. + // For subsidiary files, gguf_ctx tensor data offset must not be used, + // we build a unified tensors offset index. 
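        // (each llama_tensor_offset records which source file a tensor lives in and its absolute
        //  byte offset there, so reads and mmaps can later be directed at the correct split file)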
+ for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) { + tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf)); + } + + uint16_t n_split = 0; + get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); + + // Build virtual GGUF/GGML contexts to represent all tensors across files + if (n_split > 1) { + uint16_t idx = 0; + get_key(llm_kv(LLM_KV_SPLIT_NO), idx); + if (idx != 0) { + throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); + } + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); + + char split_prefix[4096] = {0}; + if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { + throw std::runtime_error(format("invalid split file: %s", fname.c_str())); + } + + size_t mem_size = n_tensors*ggml_tensor_overhead(); + struct ggml_init_params pdata = { + /*.mem_size = */ mem_size, + /*.mem_buffer = */ NULL, + /*.no_alloc = */ true, + }; + + auto * new_ctx_meta = ggml_init(pdata); + + if (trace > 0) { + LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); + } + + for (; idx < n_split; idx++) { + char split_path[PATH_MAX] = {0}; + struct ggml_context * split_ctx_meta = NULL; + struct gguf_context * split_ctx_gguf = NULL; + if (idx == 0) { + split_ctx_gguf = ctx_gguf; + split_ctx_meta = ctx_meta; + strcpy(split_path, fname.c_str()); + } else { + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &split_ctx_meta, + }; + split_ctx_gguf = gguf_init_from_file(split_path, split_params); + if (!split_ctx_gguf) { + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str())); + } + } + + bool ok = true; + for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) { + struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne); + ok = ok && copy != NULL; + + if (!ok) { + break; + } + + ggml_set_name(copy, tensor->name); + + // Add the tensor to the main gguf context if not already present + if (idx > 0) { + gguf_add_tensor(ctx_gguf, copy); + tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf)); + } + } + + if (!ok) { + throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__)); + } + + if (idx > 0) { + files.emplace_back(new llama_file(split_path, "rb")); + gguf_free(split_ctx_gguf); + ggml_free(split_ctx_meta); + } + } + + ggml_free(ctx_meta); + ctx_meta = new_ctx_meta; + + LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); + } + n_kv = gguf_get_n_kv(ctx_gguf); n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -3075,118 +3192,129 @@ struct llama_model_loader { } } - size_t file_offset(const char * name) const { - const int idx = gguf_find_tensor(ctx_gguf, name); - - if (idx < 0) { - throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); - } - - return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); - } - - void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { - // prefetch the whole file - all the data is needed anyway + void init_mappings(bool prefetch = true, std::vector> * mlock_mmaps = nullptr) { if (use_mmap) { - mapping.reset(new llama_mmap(&file, prefetch ? 
-1 : 0, ggml_is_numa())); + for (const auto & file : files) { + auto * mapping = new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()); + mmaps_used.emplace_back(std::make_pair(mapping->size, 0)); + mappings.emplace_back(std::unique_ptr(mapping)); + if (mlock_mmaps) { + auto * mlock_mmap = new llama_mlock(); + mlock_mmap->init(mapping->addr); + mlock_mmaps->emplace_back(std::unique_ptr(mlock_mmap)); + } + } } // compute the total size of all tensors for progress reporting - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + for (int i = 0; i < n_tensors; i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); size_data += ggml_nbytes(cur); } - - if (use_mmap && mapping) { - if (lmlock) { - lmlock->init(mapping->addr); - } - mmap_used_first = mapping->size; - } } - void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { - GGML_ASSERT(mapping); + void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { + GGML_ASSERT(!mappings.empty()); + const auto & mapping = mappings[idx]; *first = mapping->size; *last = 0; + *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const size_t offs = file_offset(ggml_get_name(tensor)); - *first = std::min(*first, offs); - *last = std::max(*last, offs + ggml_nbytes(tensor)); + const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor)); + *first = std::min(*first, tensor_off.offs); + *last = std::max(*last, tensor_off.offs + ggml_nbytes(tensor)); } } // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { - const size_t offs = file_offset(ggml_get_name(cur)); + const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); - if (use_mmap && mapping) { + if (use_mmap && t_offs.idx < mappings.size()) { + const auto & mapping = mappings.at(t_offs.idx); if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr + offs; + cur->data = (uint8_t *)mapping->addr + t_offs.offs; } else { - memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur)); } } else { GGML_ASSERT(cur->data != nullptr); - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + GGML_ASSERT(t_offs.idx < files.size()); + const auto & file = files.at(t_offs.idx); + file->seek(t_offs.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); } } size_t size_done = 0; size_t size_data = 0; - size_t mmap_used_first = -1; - size_t mmap_used_last = 0; + std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { - GGML_ASSERT(size_data != 0 && "call init_mapping() first"); + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::vector bufs_mmap, std::vector> * lmlocks) { + GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { - if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + if (!progress_callback((float)size_done / size_data, 
progress_callback_user_data)) { return false; } } - const size_t offs = file_offset(ggml_get_name(cur)); + const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); + size_t n_size = ggml_nbytes(cur); - if (use_mmap && mapping) { + if (use_mmap && t_offs.idx < mappings.size()) { + const auto & mapping = mappings.at(t_offs.idx); + ggml_backend_buffer_t buf_mmap = nullptr; + if (bufs_mmap.size() > 1) { + buf_mmap = bufs_mmap[t_offs.idx]; + } else if (!bufs_mmap.empty()) { + buf_mmap = bufs_mmap.front(); + } if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs); + if (lmlocks) { + const auto & lmlock = lmlocks->at(t_offs.idx); + lmlock->grow_to(t_offs.offs + ggml_nbytes(cur)); } - mmap_used_first = std::min(mmap_used_first, offs); - mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); + + auto & mmap_used = mmaps_used[t_offs.idx]; + mmap_used.first = std::min(mmap_used.first, t_offs.offs); + mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size); } } else { + GGML_ASSERT(t_offs.idx < files.size()); + const auto & file = files.at(t_offs.idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + file->seek(t_offs.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); + file->seek(t_offs.offs, SEEK_SET); + file->read_raw(read_buf.data(), ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); } } - size_done += ggml_nbytes(cur); + size_done += n_size; } // check if this is the last call and do final cleanup if (size_done >= size_data) { // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_used_first); - if (mmap_used_last != 0) { - mapping->unmap_fragment(mmap_used_last, mapping->size); + if (use_mmap && !mappings.empty()) { + for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) { + const auto & mmap_used = mmaps_used[file_no]; + auto & mapping = mappings.at(file_no); + mapping->unmap_fragment(0, mmap_used.first); + if (mmap_used.second != 0) { + mapping->unmap_fragment(mmap_used.second, mapping->size); + } } } if (progress_callback) { @@ -5024,56 +5152,74 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(true, use_mlock ? 
&model.mlock_mmap : nullptr); + ml.init_mappings(true, &model.mlock_mmaps); // create the backend buffers - std::vector> ctx_bufs; + std::vector>> ctx_bufs; for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = nullptr; + std::vector bufs; // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); + for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); + if (buf != nullptr) { + bufs.push_back(buf); #ifdef GGML_USE_CUBLAS - if (n_layer >= n_gpu_layers) { - ggml_backend_cuda_register_host_buffer( - ggml_backend_buffer_get_base(buf), - ggml_backend_buffer_get_size(buf)); - } + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( + ggml_backend_buffer_get_base(buf), + ggml_backend_buffer_get_size(buf)); + } #endif + } + } } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { - const size_t max_size = ggml_get_max_tensor_size(ctx); - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); + for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + const size_t max_size = ggml_get_max_tensor_size(ctx); + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); + if (buf != nullptr) { + bufs.push_back(buf); + } + } } #endif else { - buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { - model.mlock_bufs.emplace_back(new llama_mlock); - auto & mlock_buf = model.mlock_bufs.back(); - mlock_buf->init (ggml_backend_buffer_get_base(buf)); - mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf != nullptr) { + if (use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_bufs.emplace_back(new llama_mlock); + auto & mlock_buf = model.mlock_bufs.back(); + mlock_buf->init(ggml_backend_buffer_get_base(buf)); + mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); + } + bufs.push_back(buf); } } - if (buf == nullptr) { + if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf); - ctx_bufs.emplace_back(ctx, buf); + for (ggml_backend_buffer_t buf : bufs) { + 
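                // with split models there can be one mmap-backed buffer per input file, so every
                // buffer created for this context is tagged as holding weights and kept in model.bufs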
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf); + } + + ctx_bufs.emplace_back(ctx, bufs); } if (llama_supports_gpu_offload()) { @@ -5105,13 +5251,15 @@ static bool llm_load_tensors( // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; - ggml_backend_buffer_t buf = it.second; - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) { + std::vector bufs = it.second; + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, bufs, use_mlock ? &model.mlock_mmaps : NULL)) { return false; } } - model.mapping = std::move(ml.mapping); + for (auto & mapping : ml.mappings) { + model.mappings.emplace_back(std::move(mapping)); + } // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration @@ -12308,7 +12456,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s #endif llama_model_loader ml(fname_inp, use_mmap, NULL); - ml.init_mapping(false); // no prefetching? + ml.init_mappings(false); // no prefetching? llama_model model; llm_load_arch(ml, model); @@ -12582,7 +12730,7 @@ static int llama_apply_lora_from_file_internal( if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - ml->init_mapping(/*prefetch*/ false); // no prefetching + ml->init_mappings(/*prefetch*/ false); // no prefetching } struct tensor_meta { @@ -14648,6 +14796,36 @@ LLAMA_API int32_t llama_chat_apply_template( return res; } +LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { + static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; + if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { + return strlen(split_path); + } + return 0; +} + +LLAMA_API int llama_split_prefix(char * dest, const char * split_path, size_t split_path_len, int split_no, int split_count) { + char split_prefix[PATH_MAX] = {0}; + int split_no_file = 0; + int split_count_file = 0; + const char * split_format = "-00000-of-00000.gguf"; + + if (split_path_len > strlen(split_format) + 1) { + size_t prefix_len = split_path_len - strlen(split_format); + if (prefix_len >= sizeof(split_prefix)) { + prefix_len = sizeof(split_prefix) - 1; // leave room for null terminator + } + strncpy(split_prefix, split_path, prefix_len); + + int n = sscanf(&split_path[0] + strlen(split_prefix), "-%d-of-%d", &split_no_file, &split_count_file); + if (n == 2 && split_no_file - 1 == split_no && split_count_file == split_count) { + strcpy(dest, split_prefix); + return strlen(split_prefix); + } + } + return 0; +} + struct llama_timings llama_get_timings(struct llama_context * ctx) { struct llama_timings result = { /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, diff --git a/llama.h b/llama.h index 40dcf54e394f8..c23172c55e328 100644 --- a/llama.h +++ b/llama.h @@ -960,6 +960,16 @@ extern "C" { int32_t n_past, int32_t n_predict); + /// @details Build a split GGUF final path for this chunk. + /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" + // Returns the split_path length. 
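    /// Illustrative sketch only (not part of the upstream header); the buffer size and model
    /// prefix below are arbitrary:
    ///     char path[512];
    ///     for (int i = 0; i < 4; i++) {
    ///         llama_split_path(path, sizeof(path), "/models/ggml-model-q4_0", i, 4); // path of split i
    ///     }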
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); + + /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. + /// llama_split_prefix(split_prefix, "/models/ggml-model-q4_0-00002-of-00004.gguf", 43, 2, 4) => split_prefix = "/models/ggml-model-q4_0" + // Returns the split_prefix length. + LLAMA_API int llama_split_prefix(char * split_prefix, const char * split_path, size_t split_path_len, int split_no, int split_count); + // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); From b8feff411fc170de48c592cced8660f9e692db2b Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 21 Mar 2024 04:36:06 +0100 Subject: [PATCH 02/27] Avoir copying the entire vector Co-authored-by: slaren --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cd7a7b8d60cd8..a8e6b4208c85b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5251,7 +5251,7 @@ static bool llm_load_tensors( // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; - std::vector bufs = it.second; + auto & bufs = it.second; if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, bufs, use_mlock ? &model.mlock_mmaps : NULL)) { return false; } From 18ff6ca8473fcc874157ed5dffe1cc21a11c1e5f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 07:06:14 +0100 Subject: [PATCH 03/27] split: move llama_tensor_offset to llama_model_loader --- llama.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index a8e6b4208c85b..168ef4ee5467e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1460,17 +1460,6 @@ struct llama_mlock { #endif }; -// Holds information on a tensor data source location. -struct llama_tensor_offset { - uint16_t idx; // source file index - size_t offs; // tensor data offset in the original file - - llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { - const int tensor_idx = gguf_find_tensor(gguf_ctx, name); - offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); - } -}; - static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -2829,7 +2818,18 @@ struct llama_model_loader { llama_fver fver; std::vector> mappings; - std::unordered_map tensors_offs; // unified tensor data offset accross files + + // Holds information on a tensor data source location. 
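    // (offs below is absolute within its source file: the GGUF data-section offset plus the
    //  per-tensor offset, as computed in the constructor from that file's gguf_context)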
+ struct llama_tensor_offset { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file + + llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, name); + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + } + }; + std::unordered_map tensors_offs; // unified tensor data offset across files std::unordered_map kv_overrides; @@ -2884,7 +2884,7 @@ struct llama_model_loader { } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - char split_prefix[4096] = {0}; + char split_prefix[PATH_MAX] = {0}; if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { throw std::runtime_error(format("invalid split file: %s", fname.c_str())); } From 1892ae7eb1844f6704c0dd2ec0a4fe9508b77eb1 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 19:11:37 +0100 Subject: [PATCH 04/27] llama_model_loader: PR feedbacks: - use only one gguf_context for metadata only - store all ggml_context in a vector as the files and mappings - store all weights in a vector along with the source tensor - rename ctx_gguf to meta - rename ctx_meta to contexts --- examples/gguf-split/gguf-split.cpp | 18 +- llama.cpp | 257 +++++++++++++---------------- 2 files changed, 124 insertions(+), 151 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index e45151ab1bc41..3f582506da86e 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -26,9 +26,9 @@ enum split_operation : uint8_t { SPLIT_OP_MERGE, }; -static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no"; -static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count"; -static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count"; +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; struct split_params { split_operation operation = SPLIT_OP_SPLIT; @@ -177,9 +177,9 @@ struct split_strategy { if (i_split == 0) { gguf_set_kv(ctx_out, ctx_gguf); } - gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); - gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); - gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split); + gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); // populate the original tensors, so we get an initial metadata for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { @@ -328,12 +328,12 @@ static void gguf_merge(const split_params & split_params) { ctx_metas.push_back(ctx_meta); if (i_split == 0) { - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT); + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); if (key_n_split < 0) { fprintf(stderr, "\n%s: input file does not contain %s metadata\n", __func__, - LLM_KV_GENERAL_SPLIT_N_SPLIT); + LLM_KV_SPLIT_COUNT); gguf_free(ctx_gguf); ggml_free(ctx_meta); gguf_free(ctx_out); @@ -368,7 +368,7 @@ static void gguf_merge(const split_params & split_params) { } // Do not trigger merge if we try to merge again the output - gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); + 
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); // Set metadata from the first split gguf_set_kv(ctx_out, ctx_gguf); diff --git a/llama.cpp b/llama.cpp index 168ef4ee5467e..ecfc905f3d3a0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2819,22 +2819,24 @@ struct llama_model_loader { std::vector> mappings; - // Holds information on a tensor data source location. - struct llama_tensor_offset { - uint16_t idx; // source file index - size_t offs; // tensor data offset in the original file + // Holds information on a model weights + struct llama_tensor_weights { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file - llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { + ggml_tensor * tensor; + + llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { const int tensor_idx = gguf_find_tensor(gguf_ctx, name); offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); } }; - std::unordered_map tensors_offs; // unified tensor data offset across files + std::vector weights; std::unordered_map kv_overrides; - struct gguf_context * ctx_gguf = NULL; - struct ggml_context * ctx_meta = NULL; + struct gguf_context * meta = NULL; + std::vector contexts; std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); @@ -2845,128 +2847,91 @@ struct llama_model_loader { trace = atoi(getenv("LLAMA_TRACE")); } - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - if (param_overrides_p != nullptr) { for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) { kv_overrides.insert({std::string(p->key), *p}); } } - ctx_gguf = gguf_init_from_file(fname.c_str(), params); - if (!ctx_gguf) { + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + meta = gguf_init_from_file(fname.c_str(), params); + if (!meta) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } - files.emplace_back(new llama_file(fname.c_str(), "rb")); get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); // Save tensors data offset of the main file. - // For subsidiary files, gguf_ctx tensor data offset must not be used, - // we build a unified tensors offset index. - for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) { - tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf)); + // For subsidiary files, `meta` tensor data offset must not be used, + // so we build a unified tensors index for weights. 
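        // (tensors from this first file are recorded with source index 0; the additional split
        //  files loaded below append their tensors with their own file index, giving one flat list)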
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur)); } + files.emplace_back(new llama_file(fname.c_str(), "rb")); + contexts.emplace_back(ctx); uint16_t n_split = 0; get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); - // Build virtual GGUF/GGML contexts to represent all tensors across files + // Load additional GGML contexts if (n_split > 1) { uint16_t idx = 0; get_key(llm_kv(LLM_KV_SPLIT_NO), idx); if (idx != 0) { throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); } - get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); char split_prefix[PATH_MAX] = {0}; if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { throw std::runtime_error(format("invalid split file: %s", fname.c_str())); } - size_t mem_size = n_tensors*ggml_tensor_overhead(); - struct ggml_init_params pdata = { - /*.mem_size = */ mem_size, - /*.mem_buffer = */ NULL, - /*.no_alloc = */ true, - }; - - auto * new_ctx_meta = ggml_init(pdata); - if (trace > 0) { LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); } - for (; idx < n_split; idx++) { - char split_path[PATH_MAX] = {0}; - struct ggml_context * split_ctx_meta = NULL; - struct gguf_context * split_ctx_gguf = NULL; - if (idx == 0) { - split_ctx_gguf = ctx_gguf; - split_ctx_meta = ctx_meta; - strcpy(split_path, fname.c_str()); - } else { - llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); - struct gguf_init_params split_params = { - /*.no_alloc = */ true, - /*.ctx = */ &split_ctx_meta, - }; - split_ctx_gguf = gguf_init_from_file(split_path, split_params); - if (!split_ctx_gguf) { - throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str())); - } - } - - bool ok = true; - for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) { - struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne); - ok = ok && copy != NULL; - - if (!ok) { - break; - } + char split_path[PATH_MAX] = {0}; + for (idx = 1; idx < n_split; idx++) { + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); - ggml_set_name(copy, tensor->name); - - // Add the tensor to the main gguf context if not already present - if (idx > 0) { - gguf_add_tensor(ctx_gguf, copy); - tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf)); - } + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params); + if (!ctx_gguf) { + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); } - if (!ok) { - throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__)); + // Save tensors data offset info of the shard. 
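                // (only the offsets and ggml tensor metadata are kept per shard: the shard's
                //  gguf context is freed right after this loop, while its ggml context and the
                //  opened llama_file are stored in `contexts` and `files` respectively)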
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur)); } + files.emplace_back(new llama_file(split_path, "rb")); + contexts.emplace_back(ctx); - if (idx > 0) { - files.emplace_back(new llama_file(split_path, "rb")); - gguf_free(split_ctx_gguf); - ggml_free(split_ctx_meta); - } + gguf_free(ctx_gguf); } - - ggml_free(ctx_meta); - ctx_meta = new_ctx_meta; + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); + GGML_ASSERT(n_tensors == (int) weights.size()); LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); } - n_kv = gguf_get_n_kv(ctx_gguf); - n_tensors = gguf_get_n_tensors(ctx_gguf); + n_kv = gguf_get_n_kv(meta); + n_tensors = weights.size(); - fver = (enum llama_fver ) gguf_get_version(ctx_gguf); + fver = (enum llama_fver ) gguf_get_version(meta); - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); - n_elements += ggml_nelements(t); - n_bytes += ggml_nbytes(t); + for (auto & w : weights) { + n_elements += ggml_nelements(w.tensor); + n_bytes += ggml_nbytes(w.tensor); } LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", @@ -2981,7 +2946,8 @@ struct llama_model_loader { enum ggml_type type_max = GGML_TYPE_F32; for (int i = 0; i < n_tensors; i++) { - enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i); + const ggml_tensor * tensor = weights.at(i).tensor; + enum ggml_type type = tensor->type; n_type[type]++; @@ -2991,8 +2957,7 @@ struct llama_model_loader { } if (trace > 0) { - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str()); + LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } @@ -3028,22 +2993,22 @@ struct llama_model_loader { ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); { - const int kid = gguf_find_key(ctx_gguf, "general.file_type"); + const int kid = gguf_find_key(meta, "general.file_type"); if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); + ftype = (llama_ftype) gguf_get_val_u32(meta, kid); } } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(ctx_gguf, i); - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + const char * name = gguf_get_key(meta, i); + const enum gguf_type type = gguf_get_kv_type(meta, i); const std::string type_name = type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i)) + ? 
format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(ctx_gguf, i); + std::string value = gguf_kv_to_str(meta, i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -3072,18 +3037,18 @@ struct llama_model_loader { } ~llama_model_loader() { - if (ctx_gguf) { - gguf_free(ctx_gguf); + if (meta) { + gguf_free(meta); } - if (ctx_meta) { - ggml_free(ctx_meta); + for (auto & ctx : contexts) { + ggml_free(ctx); } } template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(ctx_gguf, key.c_str()); + const int kid = gguf_find_key(meta, key.c_str()); if (kid < 0) { if (required) { @@ -3093,7 +3058,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(ctx_gguf, kid); + GGUFMeta::GKV::get_kv(meta, kid); result = arr_info.length; @@ -3113,7 +3078,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? &it->second : nullptr; - const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + const bool found = GGUFMeta::GKV::set(meta, key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -3136,20 +3101,29 @@ struct llama_model_loader { } const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); + return weights.at(i).tensor->name; + } + + const llama_tensor_weights & get_weights(const char * name) const { + for (const auto & weight : weights) { + if (strcmp(name, weight.tensor->name) == 0) { + return weight; + } + } + throw std::runtime_error(format("tensor %s not found", name)); } struct ggml_tensor * get_tensor_meta(const char * name) const { - return ggml_get_tensor(ctx_meta, name); + return get_weights(name).tensor; } struct ggml_tensor * get_tensor_meta(int i) const { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - ggml_set_name(tensor, ggml_get_name(meta)); + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) { + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); + ggml_set_name(tensor, ggml_get_name(cur)); n_created++; @@ -3157,7 +3131,7 @@ struct llama_model_loader { } struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); if (cur == NULL) { if (!required) { @@ -3207,9 +3181,8 @@ struct llama_model_loader { } // compute the total size of all tensors for progress reporting - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); + for (auto & w : weights) { + size_data += ggml_nbytes(w.tensor); } } @@ -3221,28 +3194,28 @@ struct llama_model_loader { *last = 0; *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor)); - 
*first = std::min(*first, tensor_off.offs); - *last = std::max(*last, tensor_off.offs + ggml_nbytes(tensor)); + const auto & w = get_weights(ggml_get_name(tensor)); + *first = std::min(*first, w.offs); + *last = std::max(*last, w.offs + ggml_nbytes(tensor)); } } // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { - const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); + const auto & w = get_weights(ggml_get_name(cur)); - if (use_mmap && t_offs.idx < mappings.size()) { - const auto & mapping = mappings.at(t_offs.idx); + if (use_mmap && w.idx < mappings.size()) { + const auto & mapping = mappings.at(w.idx); if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr + t_offs.offs; + cur->data = (uint8_t *)mapping->addr + w.offs; } else { - memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur)); + memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur)); } } else { GGML_ASSERT(cur->data != nullptr); - GGML_ASSERT(t_offs.idx < files.size()); - const auto & file = files.at(t_offs.idx); - file->seek(t_offs.offs, SEEK_SET); + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); + file->seek(w.offs, SEEK_SET); file->read_raw(cur->data, ggml_nbytes(cur)); } } @@ -3263,39 +3236,39 @@ struct llama_model_loader { } } - const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); + const auto & w = get_weights(ggml_get_name(cur)); size_t n_size = ggml_nbytes(cur); - if (use_mmap && t_offs.idx < mappings.size()) { - const auto & mapping = mappings.at(t_offs.idx); + if (use_mmap && w.idx < mappings.size()) { + const auto & mapping = mappings.at(w.idx); ggml_backend_buffer_t buf_mmap = nullptr; if (bufs_mmap.size() > 1) { - buf_mmap = bufs_mmap[t_offs.idx]; + buf_mmap = bufs_mmap[w.idx]; } else if (!bufs_mmap.empty()) { buf_mmap = bufs_mmap.front(); } if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs); + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); if (lmlocks) { - const auto & lmlock = lmlocks->at(t_offs.idx); - lmlock->grow_to(t_offs.offs + ggml_nbytes(cur)); + const auto & lmlock = lmlocks->at(w.idx); + lmlock->grow_to(w.offs + ggml_nbytes(cur)); } - auto & mmap_used = mmaps_used[t_offs.idx]; - mmap_used.first = std::min(mmap_used.first, t_offs.offs); - mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size); + auto & mmap_used = mmaps_used[w.idx]; + mmap_used.first = std::min(mmap_used.first, w.offs); + mmap_used.second = std::max(mmap_used.second, w.offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size); + ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size); } } else { - GGML_ASSERT(t_offs.idx < files.size()); - const auto & file = files.at(t_offs.idx); + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - file->seek(t_offs.offs, SEEK_SET); + file->seek(w.offs, SEEK_SET); file->read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); - file->seek(t_offs.offs, SEEK_SET); + file->seek(w.offs, SEEK_SET); file->read_raw(read_buf.data(), ggml_nbytes(cur)); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); } @@ -3447,7 +3420,7 @@ static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { auto & hparams = model.hparams; - const gguf_context * ctx = ml.ctx_gguf; + const 
gguf_context * ctx = ml.meta; // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { @@ -3837,7 +3810,7 @@ static void llm_load_vocab( llama_model & model) { auto & vocab = model.vocab; - struct gguf_context * ctx = ml.ctx_gguf; + struct gguf_context * ctx = ml.meta; const auto kv = LLM_KV(model.arch); @@ -4447,7 +4420,7 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { + if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) { layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } @@ -12480,12 +12453,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s struct gguf_context * ctx_out = gguf_init_empty(); // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml.ctx_gguf); + gguf_set_kv (ctx_out, ml.meta); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); + const struct ggml_tensor * meta = ml.get_tensor_meta(i); const std::string name = ggml_get_name(meta); @@ -12525,7 +12498,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // populate the original tensors so we get an initial meta data for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); + const struct ggml_tensor * meta = ml.get_tensor_meta(i); gguf_add_tensor(ctx_out, meta); } @@ -12851,7 +12824,7 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * base_t; if (ml) { - if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + if (!ml->get_tensor_meta(base_name.c_str())) { LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } From 00381b07bbb40c3db504defe27cc730b50e9c2a5 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 19:18:39 +0100 Subject: [PATCH 05/27] avoid copying the entire vector --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index ecfc905f3d3a0..c69d414fab817 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3243,7 +3243,7 @@ struct llama_model_loader { const auto & mapping = mappings.at(w.idx); ggml_backend_buffer_t buf_mmap = nullptr; if (bufs_mmap.size() > 1) { - buf_mmap = bufs_mmap[w.idx]; + buf_mmap = bufs_mmap.at(w.idx); } else if (!bufs_mmap.empty()) { buf_mmap = bufs_mmap.front(); } From c34a5deee8980fd8d0059dc8a5562fe6fa68c3da Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 21 Mar 2024 20:50:11 +0100 Subject: [PATCH 06/27] Simplify this by making these optional, switch some layer creation tensor optional Co-authored-by: Georgi Gerganov --- llama.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index c69d414fab817..21f6ad76152bc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4420,10 +4420,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, 
"bias", i), {n_embd}); - if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) { - layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); - layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); - } + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); From 1c931f3d4f7b7b296d588e19e738ae65006791eb Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 21 Mar 2024 20:50:28 +0100 Subject: [PATCH 07/27] Handle optional tensors Co-authored-by: Georgi Gerganov --- llama.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 21f6ad76152bc..b1e2e062c206c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3114,7 +3114,11 @@ struct llama_model_loader { } struct ggml_tensor * get_tensor_meta(const char * name) const { - return get_weights(name).tensor; + try { + return get_weights(name).tensor; + } catch (const std::runtime_error & e) { + return NULL; + } } struct ggml_tensor * get_tensor_meta(int i) const { From d8b567d254ade1cff4ce32eb33e1a8e237a98280 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 21:05:15 +0100 Subject: [PATCH 08/27] llama_model_loader: fail if backend cannot allocate buffer --- llama.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama.cpp b/llama.cpp index b1e2e062c206c..cd20ad7a4e8b4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5155,6 +5155,8 @@ static bool llm_load_tensors( ggml_backend_buffer_get_size(buf)); } #endif + } else { + throw std::runtime_error("failed to allocate cpu buffer"); } } } @@ -5168,6 +5170,8 @@ static bool llm_load_tensors( ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { bufs.push_back(buf); + } else { + throw std::runtime_error("failed to allocate metal buffer"); } } } @@ -5182,6 +5186,8 @@ static bool llm_load_tensors( mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } bufs.push_back(buf); + } else { + throw std::runtime_error("failed to allocate backend buffer"); } } if (bufs.empty()) { From 02020b0463d161ecbdea3995d0b4f11813e3ac8a Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Mar 2024 22:06:37 +0100 Subject: [PATCH 09/27] fix mmap buffer management --- llama.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index cd20ad7a4e8b4..53b5a06088e6e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3199,6 +3199,9 @@ struct llama_model_loader { *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { const auto & w = get_weights(ggml_get_name(tensor)); + if (w.idx != idx) { + continue; + } *first = std::min(*first, w.offs); *last = std::max(*last, w.offs + ggml_nbytes(tensor)); } @@ -3245,12 +3248,8 @@ struct llama_model_loader { if (use_mmap && w.idx < mappings.size()) { const auto & mapping = mappings.at(w.idx); - ggml_backend_buffer_t buf_mmap = nullptr; - if (bufs_mmap.size() > 1) { - buf_mmap = bufs_mmap.at(w.idx); - } else if (!bufs_mmap.empty()) { - buf_mmap = 
bufs_mmap.front(); - } + ggml_backend_buffer_t buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr; + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); if (lmlocks) { @@ -5145,6 +5144,10 @@ static bool llm_load_tensors( void * addr = nullptr; size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + if (first >= last) { + bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync + continue; + } ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); if (buf != nullptr) { bufs.push_back(buf); @@ -5167,6 +5170,10 @@ static bool llm_load_tensors( void * addr = nullptr; size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + if (first >= last) { + bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync + continue; + } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { bufs.push_back(buf); @@ -5196,6 +5203,9 @@ static bool llm_load_tensors( // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight for (ggml_backend_buffer_t buf : bufs) { + if (buf == nullptr) { + continue; + } ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); model.bufs.push_back(buf); } From 078a1aca0648204c2abaec097b04c1bac8cf3795 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 21:33:14 +0100 Subject: [PATCH 10/27] llama_model_loader: map file to backend buffer if the allocation succeeds only --- llama.cpp | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/llama.cpp b/llama.cpp index 53b5a06088e6e..a7945ef092131 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3192,7 +3192,7 @@ struct llama_model_loader { void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { GGML_ASSERT(!mappings.empty()); - const auto & mapping = mappings[idx]; + const auto & mapping = mappings.at(idx); *first = mapping->size; *last = 0; @@ -3211,7 +3211,7 @@ struct llama_model_loader { void load_data_for(struct ggml_tensor * cur) const { const auto & w = get_weights(ggml_get_name(cur)); - if (use_mmap && w.idx < mappings.size()) { + if (use_mmap) { const auto & mapping = mappings.at(w.idx); if (cur->data == nullptr) { cur->data = (uint8_t *)mapping->addr + w.offs; @@ -3232,7 +3232,7 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::vector bufs_mmap, std::vector> * lmlocks) { + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map bufs_mmap, std::vector> * lmlocks) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -3246,9 +3246,12 @@ struct llama_model_loader { const auto & w = get_weights(ggml_get_name(cur)); size_t n_size = ggml_nbytes(cur); - if (use_mmap && w.idx < mappings.size()) { + if (use_mmap) { const auto & mapping = mappings.at(w.idx); - ggml_backend_buffer_t 
buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr; + ggml_backend_buffer_t buf_mmap = nullptr; + if (bufs_mmap.count(w.idx)) { + buf_mmap = bufs_mmap.at(w.idx); + } GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); @@ -3283,7 +3286,7 @@ struct llama_model_loader { // check if this is the last call and do final cleanup if (size_done >= size_data) { // unmap offloaded tensors and metadata - if (use_mmap && !mappings.empty()) { + if (use_mmap) { for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) { const auto & mmap_used = mmaps_used[file_no]; auto & mapping = mappings.at(file_no); @@ -5129,12 +5132,12 @@ static bool llm_load_tensors( ml.init_mappings(true, &model.mlock_mmaps); // create the backend buffers - std::vector>> ctx_bufs; + std::vector>> ctx_bufs; for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; - std::vector bufs; + std::map bufs; // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers @@ -5145,12 +5148,11 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); if (first >= last) { - bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync continue; } ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); if (buf != nullptr) { - bufs.push_back(buf); + bufs.emplace(file_no, buf); #ifdef GGML_USE_CUBLAS if (n_layer >= n_gpu_layers) { ggml_backend_cuda_register_host_buffer( @@ -5158,8 +5160,6 @@ static bool llm_load_tensors( ggml_backend_buffer_get_size(buf)); } #endif - } else { - throw std::runtime_error("failed to allocate cpu buffer"); } } } @@ -5176,9 +5176,7 @@ static bool llm_load_tensors( } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { - bufs.push_back(buf); - } else { - throw std::runtime_error("failed to allocate metal buffer"); + bufs.emplace(file_no, buf); } } } @@ -5192,9 +5190,9 @@ static bool llm_load_tensors( mlock_buf->init(ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } - bufs.push_back(buf); - } else { - throw std::runtime_error("failed to allocate backend buffer"); + for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + bufs.emplace(file_no, buf); + } } } if (bufs.empty()) { @@ -5202,12 +5200,9 @@ static bool llm_load_tensors( } // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight - for (ggml_backend_buffer_t buf : bufs) { - if (buf == nullptr) { - continue; - } - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf); + for (auto & buf : bufs) { + ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf.second); } ctx_bufs.emplace_back(ctx, bufs); From 69bdee939a8604f7648d322f6ec5b0f202605fb9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 21:42:30 +0100 Subject: [PATCH 11/27] llama_model_loader: only 
map tensors included in the context --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a7945ef092131..c3b97471c7943 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5171,7 +5171,6 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); if (first >= last) { - bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync continue; } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); From 6df9757ad62972731dc48b53efcaaa4a01f15dec Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 23:26:45 +0100 Subject: [PATCH 12/27] llama_model_loader: minor, use same variable name for consistency, fix spacing in types cast --- llama.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index c3b97471c7943..2105824af012f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3238,7 +3238,7 @@ struct llama_model_loader { std::vector> read_buf; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { - if (!progress_callback((float)size_done / size_data, progress_callback_user_data)) { + if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { return false; } } @@ -3254,7 +3254,7 @@ struct llama_model_loader { } GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + w.offs); if (lmlocks) { const auto & lmlock = lmlocks->at(w.idx); lmlock->grow_to(w.offs + ggml_nbytes(cur)); @@ -3264,7 +3264,7 @@ struct llama_model_loader { mmap_used.first = std::min(mmap_used.first, w.offs); mmap_used.second = std::max(mmap_used.second, w.offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size); + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size); } } else { GGML_ASSERT(w.idx < files.size()); @@ -3287,9 +3287,9 @@ struct llama_model_loader { if (size_done >= size_data) { // unmap offloaded tensors and metadata if (use_mmap) { - for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) { - const auto & mmap_used = mmaps_used[file_no]; - auto & mapping = mappings.at(file_no); + for (uint32_t idx = 0; idx < mappings.size(); idx++) { + const auto & mmap_used = mmaps_used[idx]; + auto & mapping = mappings.at(idx); mapping->unmap_fragment(0, mmap_used.first); if (mmap_used.second != 0) { mapping->unmap_fragment(mmap_used.second, mapping->size); @@ -5143,16 +5143,16 @@ static bool llm_load_tensors( // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { - for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { void * addr = nullptr; size_t first, last; - ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ml.get_mapping_range(&first, &last, &addr, idx, ctx); if (first >= last) { continue; } - ggml_backend_buffer_t buf = 
ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); + ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); if (buf != nullptr) { - bufs.emplace(file_no, buf); + bufs.emplace(idx, buf); #ifdef GGML_USE_CUBLAS if (n_layer >= n_gpu_layers) { ggml_backend_cuda_register_host_buffer( @@ -5165,17 +5165,17 @@ static bool llm_load_tensors( } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { - for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { const size_t max_size = ggml_get_max_tensor_size(ctx); void * addr = nullptr; size_t first, last; - ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ml.get_mapping_range(&first, &last, &addr, idx, ctx); if (first >= last) { continue; } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { - bufs.emplace(file_no, buf); + bufs.emplace(idx, buf); } } } @@ -5189,8 +5189,8 @@ static bool llm_load_tensors( mlock_buf->init(ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } - for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { - bufs.emplace(file_no, buf); + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + bufs.emplace(idx, buf); } } } From f9a29735fc6239614afab198fca4ee9b4577923e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 00:25:11 +0100 Subject: [PATCH 13/27] llama_model_loader: fail if any of backend buffer cannot be allocated --- llama.cpp | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2105824af012f..a0f917ce2122f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3232,7 +3232,7 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map bufs_mmap, std::vector> * lmlocks) { + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map & bufs_mmap, std::vector> * lmlocks) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -5151,16 +5151,17 @@ static bool llm_load_tensors( continue; } ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); - if (buf != nullptr) { - bufs.emplace(idx, buf); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend CPU buffer"); + } + bufs.emplace(idx, buf); #ifdef GGML_USE_CUBLAS - if (n_layer >= n_gpu_layers) { - ggml_backend_cuda_register_host_buffer( - ggml_backend_buffer_get_base(buf), - ggml_backend_buffer_get_size(buf)); - } -#endif + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( + ggml_backend_buffer_get_base(buf), + ggml_backend_buffer_get_size(buf)); } +#endif } } #ifdef GGML_USE_METAL @@ -5174,32 +5175,34 @@ static bool llm_load_tensors( continue; } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); - if (buf != nullptr) { - bufs.emplace(idx, buf); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend metal buffer"); } + bufs.emplace(idx, buf); } } #endif else { ggml_backend_buffer_t buf = 
ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf != nullptr) { - if (use_mlock && ggml_backend_buffer_is_host(buf)) { - model.mlock_bufs.emplace_back(new llama_mlock); - auto & mlock_buf = model.mlock_bufs.back(); - mlock_buf->init(ggml_backend_buffer_get_base(buf)); - mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); - } - for (uint32_t idx = 0; idx < ml.files.size(); idx++) { - bufs.emplace(idx, buf); - } + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + if (use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_bufs.emplace_back(new llama_mlock); + auto & mlock_buf = model.mlock_bufs.back(); + mlock_buf->init(ggml_backend_buffer_get_base(buf)); + mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); + } + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + bufs.emplace(idx, buf); } } if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } - // indicate that this buffer contains weights - // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight for (auto & buf : bufs) { + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); model.bufs.push_back(buf.second); } From 0fd652eba746179d8299d53463ae836e569c9cf7 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 22 Mar 2024 00:37:01 +0100 Subject: [PATCH 14/27] spacing Co-authored-by: slaren --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a0f917ce2122f..f9c75cd47745a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2820,7 +2820,7 @@ struct llama_model_loader { std::vector> mappings; // Holds information on a model weights - struct llama_tensor_weights { + struct llama_tensor_weights { uint16_t idx; // source file index size_t offs; // tensor data offset in the original file From 1a179bfc4e6079079e6ab7dbc7d1ddb8c5d74d5b Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 22 Mar 2024 00:38:23 +0100 Subject: [PATCH 15/27] fix loop over pointer Co-authored-by: slaren --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index f9c75cd47745a..f0c187f9a9db1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3040,7 +3040,7 @@ struct llama_model_loader { if (meta) { gguf_free(meta); } - for (auto & ctx : contexts) { + for (auto * ctx : contexts) { ggml_free(ctx); } } From 7cbe1eac78588f2b7f9a6ee0f7f56d0dc68611d7 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:48:15 +0100 Subject: [PATCH 16/27] llama_model_loader: if n_tensors declared not equals to loaded tensors in split, throw an exception instead of asserting --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index f0c187f9a9db1..c129b16404926 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2919,7 +2919,10 @@ struct llama_model_loader { gguf_free(ctx_gguf); } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - GGML_ASSERT(n_tensors == (int) weights.size()); + int n_tensors_loaded = (int) weights.size(); + if (n_tensors != n_tensors_loaded) { + throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + } LLAMA_LOG_INFO("%s: additional %d GGUFs 
metadata loaded.\n", __func__, n_split); } From 9940df4f11382c740846ad792134fdcab998e94c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:51:21 +0100 Subject: [PATCH 17/27] llama_model_loader: ensure mappings vector has the expected size --- llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.cpp b/llama.cpp index c129b16404926..d2d74b025453b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3175,6 +3175,8 @@ struct llama_model_loader { void init_mappings(bool prefetch = true, std::vector> * mlock_mmaps = nullptr) { if (use_mmap) { + mappings.reserve(files.size()); + mmaps_used.reserve(files.size()); for (const auto & file : files) { auto * mapping = new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()); mmaps_used.emplace_back(std::make_pair(mapping->size, 0)); From ec372c66a4e79d152b89de010e7ca44557cc236f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:52:00 +0100 Subject: [PATCH 18/27] llama_model_loader: use at instead of operator[] if this should never add to the map. --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index d2d74b025453b..decb895f3eefa 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3293,7 +3293,7 @@ struct llama_model_loader { // unmap offloaded tensors and metadata if (use_mmap) { for (uint32_t idx = 0; idx < mappings.size(); idx++) { - const auto & mmap_used = mmaps_used[idx]; + const auto & mmap_used = mmaps_used.at(idx); auto & mapping = mappings.at(idx); mapping->unmap_fragment(0, mmap_used.first); if (mmap_used.second != 0) { From a9e88c6e57311b36a7f0e62c65b0ee2420fced1b Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:59:04 +0100 Subject: [PATCH 19/27] llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size. 
--- llama.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index decb895f3eefa..891892f251234 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5137,12 +5137,17 @@ static bool llm_load_tensors( ml.init_mappings(true, &model.mlock_mmaps); // create the backend buffers - std::vector>> ctx_bufs; + std::vector>> ctx_bufs; + + // Ensure we have enough capacity for the maximum backend buffer we will potentially create + size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); + model.bufs.reserve(n_max_backend_buffer); for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; - std::map bufs; + std::unordered_map bufs; + bufs.reserve(n_max_backend_buffer); // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers @@ -5159,6 +5164,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error("unable to allocate backend CPU buffer"); } + model.bufs.push_back(buf); bufs.emplace(idx, buf); #ifdef GGML_USE_CUBLAS if (n_layer >= n_gpu_layers) { @@ -5183,6 +5189,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error("unable to allocate backend metal buffer"); } + model.bufs.push_back(buf); bufs.emplace(idx, buf); } } @@ -5192,6 +5199,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error("unable to allocate backend buffer"); } + model.bufs.push_back(buf); if (use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); @@ -5209,7 +5217,6 @@ static bool llm_load_tensors( // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf.second); } ctx_bufs.emplace_back(ctx, bufs); From b19af3643f68d2d1b9b27dd7cfd829f9dd33928e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 07:03:14 +0100 Subject: [PATCH 20/27] llama_model_loader: be sure the model mappings has enough capacity before allocating backend buffer --- llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.cpp b/llama.cpp index 891892f251234..3e0aec8f6c266 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5135,9 +5135,11 @@ static bool llm_load_tensors( ml.done_getting_tensors(); ml.init_mappings(true, &model.mlock_mmaps); + model.mappings.reserve(ml.mappings.size()); // create the backend buffers std::vector>> ctx_bufs; + ctx_bufs.reserve(ctx_map.size()); // Ensure we have enough capacity for the maximum backend buffer we will potentially create size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); From 4c04400969b8f81be0a4795781acca510b5d2b74 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 07:07:00 +0100 Subject: [PATCH 21/27] llama_model_loader: fix map -> unordered map --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 3e0aec8f6c266..092eae8f6a120 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3237,7 +3237,7 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, 
llama_progress_callback progress_callback, void * progress_callback_user_data, std::map & bufs_mmap, std::vector> * lmlocks) { + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::unordered_map & bufs_mmap, std::vector> * lmlocks) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; From e474e456ebaa5a169d7ea6d12ddb9a7c4087d971 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 07:48:50 +0100 Subject: [PATCH 22/27] llama_split_prefix: use a clearer version, not pass split path len but dest max len. Co-authored-by: Xuan Son Nguyen --- examples/gguf-split/gguf-split.cpp | 2 +- llama.cpp | 30 ++++++++++++------------------ llama.h | 4 ++-- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 3f582506da86e..f703588e164f6 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -355,7 +355,7 @@ static void gguf_merge(const split_params & split_params) { } // Verify the file naming and extract split_prefix - if (!llama_split_prefix(split_prefix, split_path, strlen(split_path), i_split, n_split)) { + if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { fprintf(stderr, "\n%s: unexpected input file name: %s" " i_split=%d" " n_split=%d\n", __func__, diff --git a/llama.cpp b/llama.cpp index 092eae8f6a120..ee0318feb473a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2888,7 +2888,7 @@ struct llama_model_loader { } char split_prefix[PATH_MAX] = {0}; - if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) { throw std::runtime_error(format("invalid split file: %s", fname.c_str())); } @@ -14806,25 +14806,19 @@ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * pa return 0; } -LLAMA_API int llama_split_prefix(char * dest, const char * split_path, size_t split_path_len, int split_no, int split_count) { - char split_prefix[PATH_MAX] = {0}; - int split_no_file = 0; - int split_count_file = 0; - const char * split_format = "-00000-of-00000.gguf"; +int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) { + std::string str_split_path(split_path); + char postfix[32]; + sprintf(postfix, "-%05d-of-%05d.gguf", split_no + 1, split_count); + std::string str_postfix(postfix); - if (split_path_len > strlen(split_format) + 1) { - size_t prefix_len = split_path_len - strlen(split_format); - if (prefix_len >= sizeof(split_prefix)) { - prefix_len = sizeof(split_prefix) - 1; // leave room for null terminator - } - strncpy(split_prefix, split_path, prefix_len); - - int n = sscanf(&split_path[0] + strlen(split_prefix), "-%d-of-%d", &split_no_file, &split_count_file); - if (n == 2 && split_no_file - 1 == split_no && split_count_file == split_count) { - strcpy(dest, split_prefix); - return strlen(split_prefix); - } + // check if dest ends with postfix + auto size_prefix = str_split_path.size() - str_postfix.size(); + if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { + strncpy(dest, split_path, std::min(size_prefix, maxlen)); + return size_prefix; } + return 0; } diff --git a/llama.h b/llama.h index c23172c55e328..7e8ac4b62beb5 100644 --- a/llama.h +++ b/llama.h @@ -966,9 +966,9 @@ extern "C" { LLAMA_API 
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. - /// llama_split_prefix(split_prefix, "/models/ggml-model-q4_0-00002-of-00004.gguf", 43, 2, 4) => split_prefix = "/models/ggml-model-q4_0" + /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" // Returns the split_prefix length. - LLAMA_API int llama_split_prefix(char * split_prefix, const char * split_path, size_t split_path_len, int split_no, int split_count); + LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); From 8326607cfe3bb416f40471836af3cc023078ad26 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Mar 2024 10:17:34 +0200 Subject: [PATCH 23/27] llama : minor ggml-ci --- llama.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index ee0318feb473a..b725a1a0095b3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2918,10 +2918,15 @@ struct llama_model_loader { gguf_free(ctx_gguf); } + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - int n_tensors_loaded = (int) weights.size(); - if (n_tensors != n_tensors_loaded) { - throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + + // sanity check + { + const int n_tensors_loaded = (int) weights.size(); + if (n_tensors != n_tensors_loaded) { + throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + } } LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); @@ -2930,7 +2935,7 @@ struct llama_model_loader { n_kv = gguf_get_n_kv(meta); n_tensors = weights.size(); - fver = (enum llama_fver ) gguf_get_version(meta); + fver = (enum llama_fver) gguf_get_version(meta); for (auto & w : weights) { n_elements += ggml_nelements(w.tensor); @@ -2960,7 +2965,8 @@ struct llama_model_loader { } if (trace > 0) { - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); + const uint16_t sid = weights.at(i).idx; + LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } From dbc35acff0a6e6d546dadb1288a02ff169428a84 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Mar 2024 10:58:42 +0200 Subject: [PATCH 24/27] llama : introduce some typedef helpers --- llama.cpp | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/llama.cpp b/llama.cpp index b725a1a0095b3..2332e7eccec15 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1110,6 +1110,7 @@ struct llama_file { } } }; +using llama_files = std::vector>; struct llama_mmap { void * addr; @@ -1310,6 +1311,7 @@ struct llama_mmap { } #endif }; +using llama_mmaps = std::vector>; // Represents some region of memory being locked using mlock or VirtualLock; // will automatically unlock on destruction. 
@@ -1459,6 +1461,7 @@ struct llama_mlock { static void raw_unlock(const void * addr, size_t len) {} #endif }; +using llama_mlocks = std::vector>; static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); @@ -2035,11 +2038,11 @@ struct llama_model { std::vector bufs; // model memory mapped files - std::vector> mappings; + llama_mmaps mappings; // objects representing data potentially being locked in memory - std::vector> mlock_bufs; - std::vector> mlock_mmaps; + llama_mlocks mlock_bufs; + llama_mlocks mlock_mmaps; // for quantize-stats only std::vector> tensors_by_name; @@ -2803,6 +2806,8 @@ namespace GGUFMeta { }; } +using llama_buf_map = std::unordered_map; + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -2813,11 +2818,11 @@ struct llama_model_loader { bool use_mmap = false; - std::vector> files; + llama_files files; llama_ftype ftype; llama_fver fver; - std::vector> mappings; + llama_mmaps mappings; // Holds information on a model weights struct llama_tensor_weights { @@ -3009,6 +3014,7 @@ struct llama_model_loader { } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(meta, i); const enum gguf_type type = gguf_get_kv_type(meta, i); @@ -3179,7 +3185,7 @@ struct llama_model_loader { } } - void init_mappings(bool prefetch = true, std::vector> * mlock_mmaps = nullptr) { + void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) { if (use_mmap) { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); @@ -3214,7 +3220,7 @@ struct llama_model_loader { continue; } *first = std::min(*first, w.offs); - *last = std::max(*last, w.offs + ggml_nbytes(tensor)); + *last = std::max(*last, w.offs + ggml_nbytes(tensor)); } } @@ -3243,7 +3249,12 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::unordered_map & bufs_mmap, std::vector> * lmlocks) { + bool load_all_data( + struct ggml_context * ctx, + llama_buf_map & bufs_mmap, + llama_mlocks * lmlocks, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -3272,7 +3283,7 @@ struct llama_model_loader { } auto & mmap_used = mmaps_used[w.idx]; - mmap_used.first = std::min(mmap_used.first, w.offs); + mmap_used.first = std::min(mmap_used.first, w.offs); mmap_used.second = std::max(mmap_used.second, w.offs + n_size); } else { ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size); @@ -5144,7 +5155,7 @@ static bool llm_load_tensors( model.mappings.reserve(ml.mappings.size()); // create the backend buffers - std::vector>> ctx_bufs; + std::vector> ctx_bufs; ctx_bufs.reserve(ctx_map.size()); // Ensure we have enough capacity for the maximum backend buffer we will potentially create @@ -5153,8 +5164,9 @@ static bool llm_load_tensors( for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - std::unordered_map bufs; + ggml_context * ctx = it.second; + + llama_buf_map bufs; bufs.reserve(n_max_backend_buffer); // only the mmap region containing the tensors in the model is mapped to the backend buffer @@ -5211,16 +5223,18 @@ static bool llm_load_tensors( if 
(use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); - mlock_buf->init(ggml_backend_buffer_get_base(buf)); + mlock_buf->init (ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } for (uint32_t idx = 0; idx < ml.files.size(); idx++) { bufs.emplace(idx, buf); } } + if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } + for (auto & buf : bufs) { // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight @@ -5260,7 +5274,7 @@ static bool llm_load_tensors( for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; auto & bufs = it.second; - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, bufs, use_mlock ? &model.mlock_mmaps : NULL)) { + if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) { return false; } } From f616b38b6bc0e169b0bb79508d69ac697eb7e7d9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 12:12:13 +0100 Subject: [PATCH 25/27] docs: add model shard in hot topic --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c2f3342f0ff42..a4990e5ad3d75 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 +- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187 ---- From 1f3875995f599f70c9785f6971afa0f7638c61c6 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 14:44:07 +0100 Subject: [PATCH 26/27] llama_model_loader: put mapping in a unique_ptr from the moment it is allocated Co-authored-by: slaren --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2332e7eccec15..7da11d6be6f3e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3190,14 +3190,14 @@ struct llama_model_loader { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); for (const auto & file : files) { - auto * mapping = new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()); + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? 
-1 : 0, ggml_is_numa())); mmaps_used.emplace_back(std::make_pair(mapping->size, 0)); - mappings.emplace_back(std::unique_ptr(mapping)); if (mlock_mmaps) { - auto * mlock_mmap = new llama_mlock(); + std::unique_ptr mlock_mmap(new llama_mlock()); mlock_mmap->init(mapping->addr); - mlock_mmaps->emplace_back(std::unique_ptr(mlock_mmap)); + mlock_mmaps->emplace_back(std::move(mlock_mmap)); } + mappings.emplace_back(std::move(mapping)); } } From 764c7afee781c2d2d1966df6214073d467446767 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 22 Mar 2024 15:10:52 +0100 Subject: [PATCH 27/27] fix llama_split_prefix --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7da11d6be6f3e..0af78c6a3899f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14829,13 +14829,13 @@ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * pa int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) { std::string str_split_path(split_path); char postfix[32]; - sprintf(postfix, "-%05d-of-%05d.gguf", split_no + 1, split_count); + snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); std::string str_postfix(postfix); // check if dest ends with postfix - auto size_prefix = str_split_path.size() - str_postfix.size(); + int size_prefix = str_split_path.size() - str_postfix.size(); if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { - strncpy(dest, split_path, std::min(size_prefix, maxlen)); + snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path); return size_prefix; }