From 7c64fef91bcf0fbf473e8cb52bc1a4a14473ca8c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 19 Mar 2024 13:42:37 +0100 Subject: [PATCH 01/27] split: support in llama_model_loader --- examples/gguf-split/gguf-split.cpp | 145 +++++------- llama.cpp | 362 +++++++++++++++++++++-------- llama.h | 10 + 3 files changed, 344 insertions(+), 173 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 8e12e64937bd7..e45151ab1bc41 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,31 +1,34 @@ #include "llama.h" -#include "ggml.h" #include "common.h" #include #include -#include #include #include -#include #include #include #include -#include #include +#include +#include + +#if defined(_WIN32) + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif enum split_operation : uint8_t { SPLIT_OP_SPLIT, SPLIT_OP_MERGE, }; -static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split"; -static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count"; - -static const int SPLIT_FILENAME_MAX = 256; - -static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf"; +static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no"; +static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count"; +static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count"; struct split_params { split_operation operation = SPLIT_OP_SPLIT; @@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para try { if (!split_params_parse_ex(argc, argv, params)) { split_print_usage(argv[0]); - exit(1); + exit(EXIT_FAILURE); } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); split_print_usage(argv[0]); - exit(1); + exit(EXIT_FAILURE); } return result; } @@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) { } } -static std::string split_file_name(const std::string & path, int i_split, int n_split) { - char f_split[SPLIT_FILENAME_MAX] = {0}; - snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split); - return std::string(f_split); -} - struct split_strategy { const split_params params; std::ifstream & f_input; @@ -180,8 +177,9 @@ struct split_strategy { if (i_split == 0) { gguf_set_kv(ctx_out, ctx_gguf); } - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors); // populate the original tensors, so we get an initial metadata for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { @@ -189,10 +187,11 @@ struct split_strategy { gguf_add_tensor(ctx_out, meta); } - auto split_name = split_file_name(params.output, i_split, n_split); + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); - fprintf(stderr, "%s: %s ...", __func__, split_name.c_str()); - fout = std::ofstream(split_name, std::ios::binary); + fprintf(stderr, "%s: %s ...", __func__, split_path); + fout = std::ofstream(split_path, std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors auto meta_size = 
gguf_get_meta_size(ctx_out); @@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) { std::ifstream f_input(split_params.input.c_str(), std::ios::binary); if (!f_input.is_open()) { fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); if (!ctx_gguf) { fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + + char first_split_path[PATH_MAX] = {0}; + llama_split_path(first_split_path, sizeof(first_split_path), + split_params.output.c_str(), strategy.i_split, strategy.n_split); fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n", __func__, split_params.input.c_str(), - split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(), + first_split_path, split_params.n_split_tensors); strategy.split_start(); @@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) { std::vector ctx_metas; std::vector ctx_ggufs; - std::string split_prefix; + char split_path[PATH_MAX] = {0}; + strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); + char split_prefix[PATH_MAX] = {0}; // First pass to find KV and tensors metadata for (int i_split = 0; i_split < n_split; i_split++) { @@ -309,16 +314,15 @@ static void gguf_merge(const split_params & split_params) { /*.ctx = */ &ctx_meta, }; - auto split_name = split_params.input; if (i_split > 0) { - split_name = split_file_name(split_prefix, i_split, n_split); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); } - fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); - auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params); + auto * ctx_gguf = gguf_init_from_file(split_path, params); if (!ctx_gguf) { fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } ctx_ggufs.push_back(ctx_gguf); ctx_metas.push_back(ctx_meta); @@ -331,65 +335,43 @@ static void gguf_merge(const split_params & split_params) { __func__, LLM_KV_GENERAL_SPLIT_N_SPLIT); gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - n_split = gguf_get_val_u8(ctx_gguf, key_n_split); + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); if (n_split < 1) { fprintf(stderr, "\n%s: input file does not contain a valid split count %d\n", __func__, n_split); gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - // Do not trigger merge if we try to merge again the output - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); - - // Set metadata from the first split - gguf_set_kv(ctx_out, ctx_gguf); - } - - // Verify the file naming - { - int i_split_file = 0; - int n_split_file = 0; - const char * i_split_format = "-00000-of-00000.gguf"; - - if (split_name.size() < strlen(i_split_format)) { - fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } + // Verify the file naming and extract split_prefix + if (!llama_split_prefix(split_prefix, split_path, strlen(split_path), i_split, n_split)) { + 
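                // llama_split_prefix() succeeds only when the file name ends with the expected
                // "-%05d-of-%05d.gguf" suffix and the encoded numbers match i_split/n_split;
                // without that prefix the remaining shards cannot be located, so report the
                // offending name and exit below.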
fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d" + " n_split=%d\n", __func__, + split_path, i_split, n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format)); - - const char * split_name_c_str = split_name.c_str(); - int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file); + // Do not trigger merge if we try to merge again the output + gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); - if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) { - fprintf(stderr, "\n%s: unexpected input file name: %s" - " i_split=%d i_split_file=%d" - " n_split=%d n_split_file=%d\n", __func__, - split_params.input.c_str(), - i_split, i_split_file, - n_split, n_split_file); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } - gguf_free(ctx_out); - fout.close(); - exit(1); - } + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); } auto n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -411,18 +393,19 @@ static void gguf_merge(const split_params & split_params) { // Write tensors data for (int i_split = 0; i_split < n_split; i_split++) { - auto split_name = split_file_name(split_prefix, i_split, n_split); - std::ifstream f_input(split_name.c_str(), std::ios::binary); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + std::ifstream f_input(split_path, std::ios::binary); if (!f_input.is_open()) { - fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); + for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { + gguf_free(ctx_ggufs[i]); + ggml_free(ctx_metas[i]); } gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); auto * ctx_gguf = ctx_ggufs[i_split]; auto * ctx_meta = ctx_metas[i_split]; @@ -481,8 +464,8 @@ int main(int argc, const char ** argv) { break; case SPLIT_OP_MERGE: gguf_merge(params); break; - default:split_print_usage(argv[0]); - exit(1); + default: split_print_usage(argv[0]); + exit(EXIT_FAILURE); } return 0; diff --git a/llama.cpp b/llama.cpp index 1a9fe0c4d2cea..cd7a7b8d60cd8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -52,6 +52,9 @@ #define NOMINMAX #endif #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif #include #endif @@ -290,6 +293,10 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, + LLM_KV_SPLIT_NO, + LLM_KV_SPLIT_COUNT, + LLM_KV_SPLIT_TENSORS_COUNT, + LLM_KV_SSM_INNER_SIZE, LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, @@ -355,6 +362,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, @@ -1449,6 +1460,17 @@ struct llama_mlock { #endif }; +// Holds information on a tensor data source 
location. +struct llama_tensor_offset { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file + + llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, name); + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + } +}; + static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -2023,12 +2045,12 @@ struct llama_model { // the model memory buffers for the tensor data std::vector bufs; - // model memory mapped file - std::unique_ptr mapping; + // model memory mapped files + std::vector> mappings; // objects representing data potentially being locked in memory std::vector> mlock_bufs; - llama_mlock mlock_mmap; + std::vector> mlock_mmaps; // for quantize-stats only std::vector> tensors_by_name; @@ -2802,11 +2824,13 @@ struct llama_model_loader { bool use_mmap = false; - llama_file file; + std::vector> files; llama_ftype ftype; llama_fver fver; - std::unique_ptr mapping; + std::vector> mappings; + std::unordered_map tensors_offs; // unified tensor data offset accross files + std::unordered_map kv_overrides; struct gguf_context * ctx_gguf = NULL; @@ -2815,7 +2839,7 @@ struct llama_model_loader { std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { + llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -2836,10 +2860,103 @@ struct llama_model_loader { if (!ctx_gguf) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } + files.emplace_back(new llama_file(fname.c_str(), "rb")); get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + // Save tensors data offset of the main file. + // For subsidiary files, gguf_ctx tensor data offset must not be used, + // we build a unified tensors offset index. 
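        // (each llama_tensor_offset records which source file a tensor lives in and its absolute
        //  byte offset there, so reads and mmaps can later be directed at the correct split file)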
+ for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) { + tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf)); + } + + uint16_t n_split = 0; + get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); + + // Build virtual GGUF/GGML contexts to represent all tensors across files + if (n_split > 1) { + uint16_t idx = 0; + get_key(llm_kv(LLM_KV_SPLIT_NO), idx); + if (idx != 0) { + throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); + } + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); + + char split_prefix[4096] = {0}; + if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { + throw std::runtime_error(format("invalid split file: %s", fname.c_str())); + } + + size_t mem_size = n_tensors*ggml_tensor_overhead(); + struct ggml_init_params pdata = { + /*.mem_size = */ mem_size, + /*.mem_buffer = */ NULL, + /*.no_alloc = */ true, + }; + + auto * new_ctx_meta = ggml_init(pdata); + + if (trace > 0) { + LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); + } + + for (; idx < n_split; idx++) { + char split_path[PATH_MAX] = {0}; + struct ggml_context * split_ctx_meta = NULL; + struct gguf_context * split_ctx_gguf = NULL; + if (idx == 0) { + split_ctx_gguf = ctx_gguf; + split_ctx_meta = ctx_meta; + strcpy(split_path, fname.c_str()); + } else { + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &split_ctx_meta, + }; + split_ctx_gguf = gguf_init_from_file(split_path, split_params); + if (!split_ctx_gguf) { + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str())); + } + } + + bool ok = true; + for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) { + struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne); + ok = ok && copy != NULL; + + if (!ok) { + break; + } + + ggml_set_name(copy, tensor->name); + + // Add the tensor to the main gguf context if not already present + if (idx > 0) { + gguf_add_tensor(ctx_gguf, copy); + tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf)); + } + } + + if (!ok) { + throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__)); + } + + if (idx > 0) { + files.emplace_back(new llama_file(split_path, "rb")); + gguf_free(split_ctx_gguf); + ggml_free(split_ctx_meta); + } + } + + ggml_free(ctx_meta); + ctx_meta = new_ctx_meta; + + LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); + } + n_kv = gguf_get_n_kv(ctx_gguf); n_tensors = gguf_get_n_tensors(ctx_gguf); @@ -3075,118 +3192,129 @@ struct llama_model_loader { } } - size_t file_offset(const char * name) const { - const int idx = gguf_find_tensor(ctx_gguf, name); - - if (idx < 0) { - throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); - } - - return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); - } - - void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { - // prefetch the whole file - all the data is needed anyway + void init_mappings(bool prefetch = true, std::vector> * mlock_mmaps = nullptr) { if (use_mmap) { - mapping.reset(new llama_mmap(&file, prefetch ? 
-1 : 0, ggml_is_numa())); + for (const auto & file : files) { + auto * mapping = new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()); + mmaps_used.emplace_back(std::make_pair(mapping->size, 0)); + mappings.emplace_back(std::unique_ptr(mapping)); + if (mlock_mmaps) { + auto * mlock_mmap = new llama_mlock(); + mlock_mmap->init(mapping->addr); + mlock_mmaps->emplace_back(std::unique_ptr(mlock_mmap)); + } + } } // compute the total size of all tensors for progress reporting - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + for (int i = 0; i < n_tensors; i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); size_data += ggml_nbytes(cur); } - - if (use_mmap && mapping) { - if (lmlock) { - lmlock->init(mapping->addr); - } - mmap_used_first = mapping->size; - } } - void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { - GGML_ASSERT(mapping); + void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { + GGML_ASSERT(!mappings.empty()); + const auto & mapping = mappings[idx]; *first = mapping->size; *last = 0; + *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const size_t offs = file_offset(ggml_get_name(tensor)); - *first = std::min(*first, offs); - *last = std::max(*last, offs + ggml_nbytes(tensor)); + const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor)); + *first = std::min(*first, tensor_off.offs); + *last = std::max(*last, tensor_off.offs + ggml_nbytes(tensor)); } } // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { - const size_t offs = file_offset(ggml_get_name(cur)); + const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); - if (use_mmap && mapping) { + if (use_mmap && t_offs.idx < mappings.size()) { + const auto & mapping = mappings.at(t_offs.idx); if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr + offs; + cur->data = (uint8_t *)mapping->addr + t_offs.offs; } else { - memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur)); } } else { GGML_ASSERT(cur->data != nullptr); - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + GGML_ASSERT(t_offs.idx < files.size()); + const auto & file = files.at(t_offs.idx); + file->seek(t_offs.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); } } size_t size_done = 0; size_t size_data = 0; - size_t mmap_used_first = -1; - size_t mmap_used_last = 0; + std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { - GGML_ASSERT(size_data != 0 && "call init_mapping() first"); + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::vector bufs_mmap, std::vector> * lmlocks) { + GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; - for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { - if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + if (!progress_callback((float)size_done / size_data, 
progress_callback_user_data)) { return false; } } - const size_t offs = file_offset(ggml_get_name(cur)); + const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); + size_t n_size = ggml_nbytes(cur); - if (use_mmap && mapping) { + if (use_mmap && t_offs.idx < mappings.size()) { + const auto & mapping = mappings.at(t_offs.idx); + ggml_backend_buffer_t buf_mmap = nullptr; + if (bufs_mmap.size() > 1) { + buf_mmap = bufs_mmap[t_offs.idx]; + } else if (!bufs_mmap.empty()) { + buf_mmap = bufs_mmap.front(); + } if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs); + if (lmlocks) { + const auto & lmlock = lmlocks->at(t_offs.idx); + lmlock->grow_to(t_offs.offs + ggml_nbytes(cur)); } - mmap_used_first = std::min(mmap_used_first, offs); - mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); + + auto & mmap_used = mmaps_used[t_offs.idx]; + mmap_used.first = std::min(mmap_used.first, t_offs.offs); + mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size); } } else { + GGML_ASSERT(t_offs.idx < files.size()); + const auto & file = files.at(t_offs.idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + file->seek(t_offs.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); + file->seek(t_offs.offs, SEEK_SET); + file->read_raw(read_buf.data(), ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); } } - size_done += ggml_nbytes(cur); + size_done += n_size; } // check if this is the last call and do final cleanup if (size_done >= size_data) { // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_used_first); - if (mmap_used_last != 0) { - mapping->unmap_fragment(mmap_used_last, mapping->size); + if (use_mmap && !mappings.empty()) { + for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) { + const auto & mmap_used = mmaps_used[file_no]; + auto & mapping = mappings.at(file_no); + mapping->unmap_fragment(0, mmap_used.first); + if (mmap_used.second != 0) { + mapping->unmap_fragment(mmap_used.second, mapping->size); + } } } if (progress_callback) { @@ -5024,56 +5152,74 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(true, use_mlock ? 
&model.mlock_mmap : nullptr); + ml.init_mappings(true, &model.mlock_mmaps); // create the backend buffers - std::vector> ctx_bufs; + std::vector>> ctx_bufs; for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = nullptr; + std::vector bufs; // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); + for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); + if (buf != nullptr) { + bufs.push_back(buf); #ifdef GGML_USE_CUBLAS - if (n_layer >= n_gpu_layers) { - ggml_backend_cuda_register_host_buffer( - ggml_backend_buffer_get_base(buf), - ggml_backend_buffer_get_size(buf)); - } + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( + ggml_backend_buffer_get_base(buf), + ggml_backend_buffer_get_size(buf)); + } #endif + } + } } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { - const size_t max_size = ggml_get_max_tensor_size(ctx); - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); + for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + const size_t max_size = ggml_get_max_tensor_size(ctx); + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); + if (buf != nullptr) { + bufs.push_back(buf); + } + } } #endif else { - buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { - model.mlock_bufs.emplace_back(new llama_mlock); - auto & mlock_buf = model.mlock_bufs.back(); - mlock_buf->init (ggml_backend_buffer_get_base(buf)); - mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf != nullptr) { + if (use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_bufs.emplace_back(new llama_mlock); + auto & mlock_buf = model.mlock_bufs.back(); + mlock_buf->init(ggml_backend_buffer_get_base(buf)); + mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); + } + bufs.push_back(buf); } } - if (buf == nullptr) { + if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf); - ctx_bufs.emplace_back(ctx, buf); + for (ggml_backend_buffer_t buf : bufs) { + 
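                // with split models there can be one mmap-backed buffer per input file, so every
                // buffer created for this context is tagged as holding weights and kept in model.bufs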
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf); + } + + ctx_bufs.emplace_back(ctx, bufs); } if (llama_supports_gpu_offload()) { @@ -5105,13 +5251,15 @@ static bool llm_load_tensors( // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; - ggml_backend_buffer_t buf = it.second; - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) { + std::vector bufs = it.second; + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, bufs, use_mlock ? &model.mlock_mmaps : NULL)) { return false; } } - model.mapping = std::move(ml.mapping); + for (auto & mapping : ml.mappings) { + model.mappings.emplace_back(std::move(mapping)); + } // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration @@ -12308,7 +12456,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s #endif llama_model_loader ml(fname_inp, use_mmap, NULL); - ml.init_mapping(false); // no prefetching? + ml.init_mappings(false); // no prefetching? llama_model model; llm_load_arch(ml, model); @@ -12582,7 +12730,7 @@ static int llama_apply_lora_from_file_internal( if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - ml->init_mapping(/*prefetch*/ false); // no prefetching + ml->init_mappings(/*prefetch*/ false); // no prefetching } struct tensor_meta { @@ -14648,6 +14796,36 @@ LLAMA_API int32_t llama_chat_apply_template( return res; } +LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { + static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; + if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { + return strlen(split_path); + } + return 0; +} + +LLAMA_API int llama_split_prefix(char * dest, const char * split_path, size_t split_path_len, int split_no, int split_count) { + char split_prefix[PATH_MAX] = {0}; + int split_no_file = 0; + int split_count_file = 0; + const char * split_format = "-00000-of-00000.gguf"; + + if (split_path_len > strlen(split_format) + 1) { + size_t prefix_len = split_path_len - strlen(split_format); + if (prefix_len >= sizeof(split_prefix)) { + prefix_len = sizeof(split_prefix) - 1; // leave room for null terminator + } + strncpy(split_prefix, split_path, prefix_len); + + int n = sscanf(&split_path[0] + strlen(split_prefix), "-%d-of-%d", &split_no_file, &split_count_file); + if (n == 2 && split_no_file - 1 == split_no && split_count_file == split_count) { + strcpy(dest, split_prefix); + return strlen(split_prefix); + } + } + return 0; +} + struct llama_timings llama_get_timings(struct llama_context * ctx) { struct llama_timings result = { /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, diff --git a/llama.h b/llama.h index 40dcf54e394f8..c23172c55e328 100644 --- a/llama.h +++ b/llama.h @@ -960,6 +960,16 @@ extern "C" { int32_t n_past, int32_t n_predict); + /// @details Build a split GGUF final path for this chunk. + /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" + // Returns the split_path length. 
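    /// Illustrative sketch only (not part of the upstream header); the buffer size and model
    /// prefix below are arbitrary:
    ///     char path[512];
    ///     for (int i = 0; i < 4; i++) {
    ///         llama_split_path(path, sizeof(path), "/models/ggml-model-q4_0", i, 4); // path of split i
    ///     }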
+ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); + + /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. + /// llama_split_prefix(split_prefix, "/models/ggml-model-q4_0-00002-of-00004.gguf", 43, 2, 4) => split_prefix = "/models/ggml-model-q4_0" + // Returns the split_prefix length. + LLAMA_API int llama_split_prefix(char * split_prefix, const char * split_path, size_t split_path_len, int split_no, int split_count); + // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); From b8feff411fc170de48c592cced8660f9e692db2b Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 21 Mar 2024 04:36:06 +0100 Subject: [PATCH 02/27] Avoir copying the entire vector Co-authored-by: slaren --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cd7a7b8d60cd8..a8e6b4208c85b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5251,7 +5251,7 @@ static bool llm_load_tensors( // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; - std::vector bufs = it.second; + auto & bufs = it.second; if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, bufs, use_mlock ? &model.mlock_mmaps : NULL)) { return false; } From 18ff6ca8473fcc874157ed5dffe1cc21a11c1e5f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 07:06:14 +0100 Subject: [PATCH 03/27] split: move llama_tensor_offset to llama_model_loader --- llama.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llama.cpp b/llama.cpp index a8e6b4208c85b..168ef4ee5467e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1460,17 +1460,6 @@ struct llama_mlock { #endif }; -// Holds information on a tensor data source location. -struct llama_tensor_offset { - uint16_t idx; // source file index - size_t offs; // tensor data offset in the original file - - llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { - const int tensor_idx = gguf_find_tensor(gguf_ctx, name); - offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); - } -}; - static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -2829,7 +2818,18 @@ struct llama_model_loader { llama_fver fver; std::vector> mappings; - std::unordered_map tensors_offs; // unified tensor data offset accross files + + // Holds information on a tensor data source location. 
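    // (offs below is absolute within its source file: the GGUF data-section offset plus the
    //  per-tensor offset, as computed in the constructor from that file's gguf_context)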
+ struct llama_tensor_offset { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file + + llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, name); + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + } + }; + std::unordered_map tensors_offs; // unified tensor data offset across files std::unordered_map kv_overrides; @@ -2884,7 +2884,7 @@ struct llama_model_loader { } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - char split_prefix[4096] = {0}; + char split_prefix[PATH_MAX] = {0}; if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { throw std::runtime_error(format("invalid split file: %s", fname.c_str())); } From 1892ae7eb1844f6704c0dd2ec0a4fe9508b77eb1 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 19:11:37 +0100 Subject: [PATCH 04/27] llama_model_loader: PR feedbacks: - use only one gguf_context for metadata only - store all ggml_context in a vector as the files and mappings - store all weights in a vector along with the source tensor - rename ctx_gguf to meta - rename ctx_meta to contexts --- examples/gguf-split/gguf-split.cpp | 18 +- llama.cpp | 257 +++++++++++++---------------- 2 files changed, 124 insertions(+), 151 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index e45151ab1bc41..3f582506da86e 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -26,9 +26,9 @@ enum split_operation : uint8_t { SPLIT_OP_MERGE, }; -static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "split.no"; -static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "split.count"; -static const char * const LLM_KV_GENERAL_SPLIT_N_TENSORS = "split.tensors.count"; +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; struct split_params { split_operation operation = SPLIT_OP_SPLIT; @@ -177,9 +177,9 @@ struct split_strategy { if (i_split == 0) { gguf_set_kv(ctx_out, ctx_gguf); } - gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); - gguf_set_val_u16(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); - gguf_set_val_i32(ctx_out, LLM_KV_GENERAL_SPLIT_N_TENSORS,n_tensors); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split); + gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); // populate the original tensors, so we get an initial metadata for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { @@ -328,12 +328,12 @@ static void gguf_merge(const split_params & split_params) { ctx_metas.push_back(ctx_meta); if (i_split == 0) { - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT); + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); if (key_n_split < 0) { fprintf(stderr, "\n%s: input file does not contain %s metadata\n", __func__, - LLM_KV_GENERAL_SPLIT_N_SPLIT); + LLM_KV_SPLIT_COUNT); gguf_free(ctx_gguf); ggml_free(ctx_meta); gguf_free(ctx_out); @@ -368,7 +368,7 @@ static void gguf_merge(const split_params & split_params) { } // Do not trigger merge if we try to merge again the output - gguf_set_val_u16(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); + 
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); // Set metadata from the first split gguf_set_kv(ctx_out, ctx_gguf); diff --git a/llama.cpp b/llama.cpp index 168ef4ee5467e..ecfc905f3d3a0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2819,22 +2819,24 @@ struct llama_model_loader { std::vector> mappings; - // Holds information on a tensor data source location. - struct llama_tensor_offset { - uint16_t idx; // source file index - size_t offs; // tensor data offset in the original file + // Holds information on a model weights + struct llama_tensor_weights { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file - llama_tensor_offset(uint16_t idx, const char * name, struct gguf_context * gguf_ctx) : idx(idx) { + ggml_tensor * tensor; + + llama_tensor_weights(uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { const int tensor_idx = gguf_find_tensor(gguf_ctx, name); offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); } }; - std::unordered_map tensors_offs; // unified tensor data offset across files + std::vector weights; std::unordered_map kv_overrides; - struct gguf_context * ctx_gguf = NULL; - struct ggml_context * ctx_meta = NULL; + struct gguf_context * meta = NULL; + std::vector contexts; std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); @@ -2845,128 +2847,91 @@ struct llama_model_loader { trace = atoi(getenv("LLAMA_TRACE")); } - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - if (param_overrides_p != nullptr) { for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) { kv_overrides.insert({std::string(p->key), *p}); } } - ctx_gguf = gguf_init_from_file(fname.c_str(), params); - if (!ctx_gguf) { + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + meta = gguf_init_from_file(fname.c_str(), params); + if (!meta) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } - files.emplace_back(new llama_file(fname.c_str(), "rb")); get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); // Save tensors data offset of the main file. - // For subsidiary files, gguf_ctx tensor data offset must not be used, - // we build a unified tensors offset index. - for (ggml_tensor * tensor = ggml_get_first_tensor(ctx_meta); tensor; tensor = ggml_get_next_tensor(ctx_meta, tensor)) { - tensors_offs.emplace(tensor->name, llama_tensor_offset(0, tensor->name, ctx_gguf)); + // For subsidiary files, `meta` tensor data offset must not be used, + // so we build a unified tensors index for weights. 
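        // (tensors from this first file are recorded with source index 0; the additional split
        //  files loaded below append their tensors with their own file index, giving one flat list)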
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(llama_tensor_weights(0, cur->name, meta, cur)); } + files.emplace_back(new llama_file(fname.c_str(), "rb")); + contexts.emplace_back(ctx); uint16_t n_split = 0; get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); - // Build virtual GGUF/GGML contexts to represent all tensors across files + // Load additional GGML contexts if (n_split > 1) { uint16_t idx = 0; get_key(llm_kv(LLM_KV_SPLIT_NO), idx); if (idx != 0) { throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); } - get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); char split_prefix[PATH_MAX] = {0}; if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { throw std::runtime_error(format("invalid split file: %s", fname.c_str())); } - size_t mem_size = n_tensors*ggml_tensor_overhead(); - struct ggml_init_params pdata = { - /*.mem_size = */ mem_size, - /*.mem_buffer = */ NULL, - /*.no_alloc = */ true, - }; - - auto * new_ctx_meta = ggml_init(pdata); - if (trace > 0) { LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); } - for (; idx < n_split; idx++) { - char split_path[PATH_MAX] = {0}; - struct ggml_context * split_ctx_meta = NULL; - struct gguf_context * split_ctx_gguf = NULL; - if (idx == 0) { - split_ctx_gguf = ctx_gguf; - split_ctx_meta = ctx_meta; - strcpy(split_path, fname.c_str()); - } else { - llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); - struct gguf_init_params split_params = { - /*.no_alloc = */ true, - /*.ctx = */ &split_ctx_meta, - }; - split_ctx_gguf = gguf_init_from_file(split_path, split_params); - if (!split_ctx_gguf) { - throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname.c_str())); - } - } - - bool ok = true; - for (ggml_tensor * tensor = ggml_get_first_tensor(split_ctx_meta); tensor; tensor = ggml_get_next_tensor(split_ctx_meta, tensor)) { - struct ggml_tensor * copy = ggml_new_tensor(new_ctx_meta, tensor->type, ggml_n_dims(tensor), tensor->ne); - ok = ok && copy != NULL; - - if (!ok) { - break; - } + char split_path[PATH_MAX] = {0}; + for (idx = 1; idx < n_split; idx++) { + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); - ggml_set_name(copy, tensor->name); - - // Add the tensor to the main gguf context if not already present - if (idx > 0) { - gguf_add_tensor(ctx_gguf, copy); - tensors_offs.emplace(tensor->name, llama_tensor_offset(idx, tensor->name, split_ctx_gguf)); - } + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params); + if (!ctx_gguf) { + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); } - if (!ok) { - throw std::runtime_error(format("%s: failed to read the tensor metadata\n", __func__)); + // Save tensors data offset info of the shard. 
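                // (only the offsets and ggml tensor metadata are kept per shard: the shard's
                //  gguf context is freed right after this loop, while its ggml context and the
                //  opened llama_file are stored in `contexts` and `files` respectively)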
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(llama_tensor_weights(idx, cur->name, ctx_gguf, cur)); } + files.emplace_back(new llama_file(split_path, "rb")); + contexts.emplace_back(ctx); - if (idx > 0) { - files.emplace_back(new llama_file(split_path, "rb")); - gguf_free(split_ctx_gguf); - ggml_free(split_ctx_meta); - } + gguf_free(ctx_gguf); } - - ggml_free(ctx_meta); - ctx_meta = new_ctx_meta; + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); + GGML_ASSERT(n_tensors == (int) weights.size()); LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); } - n_kv = gguf_get_n_kv(ctx_gguf); - n_tensors = gguf_get_n_tensors(ctx_gguf); + n_kv = gguf_get_n_kv(meta); + n_tensors = weights.size(); - fver = (enum llama_fver ) gguf_get_version(ctx_gguf); + fver = (enum llama_fver ) gguf_get_version(meta); - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); - n_elements += ggml_nelements(t); - n_bytes += ggml_nbytes(t); + for (auto & w : weights) { + n_elements += ggml_nelements(w.tensor); + n_bytes += ggml_nbytes(w.tensor); } LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", @@ -2981,7 +2946,8 @@ struct llama_model_loader { enum ggml_type type_max = GGML_TYPE_F32; for (int i = 0; i < n_tensors; i++) { - enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i); + const ggml_tensor * tensor = weights.at(i).tensor; + enum ggml_type type = tensor->type; n_type[type]++; @@ -2991,8 +2957,7 @@ struct llama_model_loader { } if (trace > 0) { - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str()); + LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } @@ -3028,22 +2993,22 @@ struct llama_model_loader { ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); { - const int kid = gguf_find_key(ctx_gguf, "general.file_type"); + const int kid = gguf_find_key(meta, "general.file_type"); if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); + ftype = (llama_ftype) gguf_get_val_u32(meta, kid); } } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(ctx_gguf, i); - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + const char * name = gguf_get_key(meta, i); + const enum gguf_type type = gguf_get_kv_type(meta, i); const std::string type_name = type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i)) + ? 
format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(ctx_gguf, i); + std::string value = gguf_kv_to_str(meta, i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -3072,18 +3037,18 @@ struct llama_model_loader { } ~llama_model_loader() { - if (ctx_gguf) { - gguf_free(ctx_gguf); + if (meta) { + gguf_free(meta); } - if (ctx_meta) { - ggml_free(ctx_meta); + for (auto & ctx : contexts) { + ggml_free(ctx); } } template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(ctx_gguf, key.c_str()); + const int kid = gguf_find_key(meta, key.c_str()); if (kid < 0) { if (required) { @@ -3093,7 +3058,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(ctx_gguf, kid); + GGUFMeta::GKV::get_kv(meta, kid); result = arr_info.length; @@ -3113,7 +3078,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? &it->second : nullptr; - const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + const bool found = GGUFMeta::GKV::set(meta, key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -3136,20 +3101,29 @@ struct llama_model_loader { } const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); + return weights.at(i).tensor->name; + } + + const llama_tensor_weights & get_weights(const char * name) const { + for (const auto & weight : weights) { + if (strcmp(name, weight.tensor->name) == 0) { + return weight; + } + } + throw std::runtime_error(format("tensor %s not found", name)); } struct ggml_tensor * get_tensor_meta(const char * name) const { - return ggml_get_tensor(ctx_meta, name); + return get_weights(name).tensor; } struct ggml_tensor * get_tensor_meta(int i) const { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - ggml_set_name(tensor, ggml_get_name(meta)); + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) { + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); + ggml_set_name(tensor, ggml_get_name(cur)); n_created++; @@ -3157,7 +3131,7 @@ struct llama_model_loader { } struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); if (cur == NULL) { if (!required) { @@ -3207,9 +3181,8 @@ struct llama_model_loader { } // compute the total size of all tensors for progress reporting - for (int i = 0; i < n_tensors; i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); + for (auto & w : weights) { + size_data += ggml_nbytes(w.tensor); } } @@ -3221,28 +3194,28 @@ struct llama_model_loader { *last = 0; *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const auto & tensor_off = tensors_offs.at(ggml_get_name(tensor)); - 
*first = std::min(*first, tensor_off.offs); - *last = std::max(*last, tensor_off.offs + ggml_nbytes(tensor)); + const auto & w = get_weights(ggml_get_name(tensor)); + *first = std::min(*first, w.offs); + *last = std::max(*last, w.offs + ggml_nbytes(tensor)); } } // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { - const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); + const auto & w = get_weights(ggml_get_name(cur)); - if (use_mmap && t_offs.idx < mappings.size()) { - const auto & mapping = mappings.at(t_offs.idx); + if (use_mmap && w.idx < mappings.size()) { + const auto & mapping = mappings.at(w.idx); if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr + t_offs.offs; + cur->data = (uint8_t *)mapping->addr + w.offs; } else { - memcpy(cur->data, (uint8_t *)mapping->addr + t_offs.offs, ggml_nbytes(cur)); + memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur)); } } else { GGML_ASSERT(cur->data != nullptr); - GGML_ASSERT(t_offs.idx < files.size()); - const auto & file = files.at(t_offs.idx); - file->seek(t_offs.offs, SEEK_SET); + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); + file->seek(w.offs, SEEK_SET); file->read_raw(cur->data, ggml_nbytes(cur)); } } @@ -3263,39 +3236,39 @@ struct llama_model_loader { } } - const auto & t_offs = tensors_offs.at(ggml_get_name(cur)); + const auto & w = get_weights(ggml_get_name(cur)); size_t n_size = ggml_nbytes(cur); - if (use_mmap && t_offs.idx < mappings.size()) { - const auto & mapping = mappings.at(t_offs.idx); + if (use_mmap && w.idx < mappings.size()) { + const auto & mapping = mappings.at(w.idx); ggml_backend_buffer_t buf_mmap = nullptr; if (bufs_mmap.size() > 1) { - buf_mmap = bufs_mmap[t_offs.idx]; + buf_mmap = bufs_mmap[w.idx]; } else if (!bufs_mmap.empty()) { buf_mmap = bufs_mmap.front(); } if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + t_offs.offs); + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); if (lmlocks) { - const auto & lmlock = lmlocks->at(t_offs.idx); - lmlock->grow_to(t_offs.offs + ggml_nbytes(cur)); + const auto & lmlock = lmlocks->at(w.idx); + lmlock->grow_to(w.offs + ggml_nbytes(cur)); } - auto & mmap_used = mmaps_used[t_offs.idx]; - mmap_used.first = std::min(mmap_used.first, t_offs.offs); - mmap_used.second = std::max(mmap_used.second, t_offs.offs + n_size); + auto & mmap_used = mmaps_used[w.idx]; + mmap_used.first = std::min(mmap_used.first, w.offs); + mmap_used.second = std::max(mmap_used.second, w.offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + t_offs.offs, 0, n_size); + ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size); } } else { - GGML_ASSERT(t_offs.idx < files.size()); - const auto & file = files.at(t_offs.idx); + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - file->seek(t_offs.offs, SEEK_SET); + file->seek(w.offs, SEEK_SET); file->read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); - file->seek(t_offs.offs, SEEK_SET); + file->seek(w.offs, SEEK_SET); file->read_raw(read_buf.data(), ggml_nbytes(cur)); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); } @@ -3447,7 +3420,7 @@ static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { auto & hparams = model.hparams; - const gguf_context * ctx = ml.ctx_gguf; + const 
gguf_context * ctx = ml.meta; // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { @@ -3837,7 +3810,7 @@ static void llm_load_vocab( llama_model & model) { auto & vocab = model.vocab; - struct gguf_context * ctx = ml.ctx_gguf; + struct gguf_context * ctx = ml.meta; const auto kv = LLM_KV(model.arch); @@ -4447,7 +4420,7 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { + if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) { layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } @@ -12480,12 +12453,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s struct gguf_context * ctx_out = gguf_init_empty(); // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml.ctx_gguf); + gguf_set_kv (ctx_out, ml.meta); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); + const struct ggml_tensor * meta = ml.get_tensor_meta(i); const std::string name = ggml_get_name(meta); @@ -12525,7 +12498,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // populate the original tensors so we get an initial meta data for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); + const struct ggml_tensor * meta = ml.get_tensor_meta(i); gguf_add_tensor(ctx_out, meta); } @@ -12851,7 +12824,7 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * base_t; if (ml) { - if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + if (!ml->get_tensor_meta(base_name.c_str())) { LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } From 00381b07bbb40c3db504defe27cc730b50e9c2a5 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 19:18:39 +0100 Subject: [PATCH 05/27] avoid copying the entire vector --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index ecfc905f3d3a0..c69d414fab817 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3243,7 +3243,7 @@ struct llama_model_loader { const auto & mapping = mappings.at(w.idx); ggml_backend_buffer_t buf_mmap = nullptr; if (bufs_mmap.size() > 1) { - buf_mmap = bufs_mmap[w.idx]; + buf_mmap = bufs_mmap.at(w.idx); } else if (!bufs_mmap.empty()) { buf_mmap = bufs_mmap.front(); } From c34a5deee8980fd8d0059dc8a5562fe6fa68c3da Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 21 Mar 2024 20:50:11 +0100 Subject: [PATCH 06/27] Simplify this by making these optional, switch some layer creation tensor optional Co-authored-by: Georgi Gerganov --- llama.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index c69d414fab817..21f6ad76152bc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4420,10 +4420,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, 
"bias", i), {n_embd}); - if (ml.get_tensor_meta(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str())) { - layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); - layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); - } + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); From 1c931f3d4f7b7b296d588e19e738ae65006791eb Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Thu, 21 Mar 2024 20:50:28 +0100 Subject: [PATCH 07/27] Handle optional tensors Co-authored-by: Georgi Gerganov --- llama.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 21f6ad76152bc..b1e2e062c206c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3114,7 +3114,11 @@ struct llama_model_loader { } struct ggml_tensor * get_tensor_meta(const char * name) const { - return get_weights(name).tensor; + try { + return get_weights(name).tensor; + } catch (const std::runtime_error & e) { + return NULL; + } } struct ggml_tensor * get_tensor_meta(int i) const { From d8b567d254ade1cff4ce32eb33e1a8e237a98280 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 21:05:15 +0100 Subject: [PATCH 08/27] llama_model_loader: fail if backend cannot allocate buffer --- llama.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llama.cpp b/llama.cpp index b1e2e062c206c..cd20ad7a4e8b4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5155,6 +5155,8 @@ static bool llm_load_tensors( ggml_backend_buffer_get_size(buf)); } #endif + } else { + throw std::runtime_error("failed to allocate cpu buffer"); } } } @@ -5168,6 +5170,8 @@ static bool llm_load_tensors( ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { bufs.push_back(buf); + } else { + throw std::runtime_error("failed to allocate metal buffer"); } } } @@ -5182,6 +5186,8 @@ static bool llm_load_tensors( mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } bufs.push_back(buf); + } else { + throw std::runtime_error("failed to allocate backend buffer"); } } if (bufs.empty()) { From 02020b0463d161ecbdea3995d0b4f11813e3ac8a Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 21 Mar 2024 22:06:37 +0100 Subject: [PATCH 09/27] fix mmap buffer management --- llama.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index cd20ad7a4e8b4..53b5a06088e6e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3199,6 +3199,9 @@ struct llama_model_loader { *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { const auto & w = get_weights(ggml_get_name(tensor)); + if (w.idx != idx) { + continue; + } *first = std::min(*first, w.offs); *last = std::max(*last, w.offs + ggml_nbytes(tensor)); } @@ -3245,12 +3248,8 @@ struct llama_model_loader { if (use_mmap && w.idx < mappings.size()) { const auto & mapping = mappings.at(w.idx); - ggml_backend_buffer_t buf_mmap = nullptr; - if (bufs_mmap.size() > 1) { - buf_mmap = bufs_mmap.at(w.idx); - } else if (!bufs_mmap.empty()) { - buf_mmap = 
bufs_mmap.front(); - } + ggml_backend_buffer_t buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr; + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); if (lmlocks) { @@ -5145,6 +5144,10 @@ static bool llm_load_tensors( void * addr = nullptr; size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + if (first >= last) { + bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync + continue; + } ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); if (buf != nullptr) { bufs.push_back(buf); @@ -5167,6 +5170,10 @@ static bool llm_load_tensors( void * addr = nullptr; size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + if (first >= last) { + bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync + continue; + } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { bufs.push_back(buf); @@ -5196,6 +5203,9 @@ static bool llm_load_tensors( // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight for (ggml_backend_buffer_t buf : bufs) { + if (buf == nullptr) { + continue; + } ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); model.bufs.push_back(buf); } From 078a1aca0648204c2abaec097b04c1bac8cf3795 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 21:33:14 +0100 Subject: [PATCH 10/27] llama_model_loader: map file to backend buffer if the allocation succeeds only --- llama.cpp | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/llama.cpp b/llama.cpp index 53b5a06088e6e..a7945ef092131 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3192,7 +3192,7 @@ struct llama_model_loader { void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { GGML_ASSERT(!mappings.empty()); - const auto & mapping = mappings[idx]; + const auto & mapping = mappings.at(idx); *first = mapping->size; *last = 0; @@ -3211,7 +3211,7 @@ struct llama_model_loader { void load_data_for(struct ggml_tensor * cur) const { const auto & w = get_weights(ggml_get_name(cur)); - if (use_mmap && w.idx < mappings.size()) { + if (use_mmap) { const auto & mapping = mappings.at(w.idx); if (cur->data == nullptr) { cur->data = (uint8_t *)mapping->addr + w.offs; @@ -3232,7 +3232,7 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::vector bufs_mmap, std::vector> * lmlocks) { + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map bufs_mmap, std::vector> * lmlocks) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -3246,9 +3246,12 @@ struct llama_model_loader { const auto & w = get_weights(ggml_get_name(cur)); size_t n_size = ggml_nbytes(cur); - if (use_mmap && w.idx < mappings.size()) { + if (use_mmap) { const auto & mapping = mappings.at(w.idx); - ggml_backend_buffer_t 
buf_mmap = bufs_mmap.size() > w.idx ? bufs_mmap.at(w.idx) : nullptr; + ggml_backend_buffer_t buf_mmap = nullptr; + if (bufs_mmap.count(w.idx)) { + buf_mmap = bufs_mmap.at(w.idx); + } GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); @@ -3283,7 +3286,7 @@ struct llama_model_loader { // check if this is the last call and do final cleanup if (size_done >= size_data) { // unmap offloaded tensors and metadata - if (use_mmap && !mappings.empty()) { + if (use_mmap) { for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) { const auto & mmap_used = mmaps_used[file_no]; auto & mapping = mappings.at(file_no); @@ -5129,12 +5132,12 @@ static bool llm_load_tensors( ml.init_mappings(true, &model.mlock_mmaps); // create the backend buffers - std::vector>> ctx_bufs; + std::vector>> ctx_bufs; for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; - std::vector bufs; + std::map bufs; // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers @@ -5145,12 +5148,11 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); if (first >= last) { - bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync continue; } ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); if (buf != nullptr) { - bufs.push_back(buf); + bufs.emplace(file_no, buf); #ifdef GGML_USE_CUBLAS if (n_layer >= n_gpu_layers) { ggml_backend_cuda_register_host_buffer( @@ -5158,8 +5160,6 @@ static bool llm_load_tensors( ggml_backend_buffer_get_size(buf)); } #endif - } else { - throw std::runtime_error("failed to allocate cpu buffer"); } } } @@ -5176,9 +5176,7 @@ static bool llm_load_tensors( } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { - bufs.push_back(buf); - } else { - throw std::runtime_error("failed to allocate metal buffer"); + bufs.emplace(file_no, buf); } } } @@ -5192,9 +5190,9 @@ static bool llm_load_tensors( mlock_buf->init(ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } - bufs.push_back(buf); - } else { - throw std::runtime_error("failed to allocate backend buffer"); + for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + bufs.emplace(file_no, buf); + } } } if (bufs.empty()) { @@ -5202,12 +5200,9 @@ static bool llm_load_tensors( } // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight - for (ggml_backend_buffer_t buf : bufs) { - if (buf == nullptr) { - continue; - } - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf); + for (auto & buf : bufs) { + ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf.second); } ctx_bufs.emplace_back(ctx, bufs); From 69bdee939a8604f7648d322f6ec5b0f202605fb9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 21:42:30 +0100 Subject: [PATCH 11/27] llama_model_loader: only 
map tensors included in the context --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a7945ef092131..c3b97471c7943 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5171,7 +5171,6 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, &addr, file_no, ctx); if (first >= last) { - bufs.push_back(nullptr); // add a dummy buffer to keep the indices in sync continue; } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); From 6df9757ad62972731dc48b53efcaaa4a01f15dec Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Thu, 21 Mar 2024 23:26:45 +0100 Subject: [PATCH 12/27] llama_model_loader: minor, use same variable name for consistency, fix spacing in types cast --- llama.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index c3b97471c7943..2105824af012f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3238,7 +3238,7 @@ struct llama_model_loader { std::vector> read_buf; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { if (progress_callback) { - if (!progress_callback((float)size_done / size_data, progress_callback_user_data)) { + if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { return false; } } @@ -3254,7 +3254,7 @@ struct llama_model_loader { } GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *)mapping->addr + w.offs); + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + w.offs); if (lmlocks) { const auto & lmlock = lmlocks->at(w.idx); lmlock->grow_to(w.offs + ggml_nbytes(cur)); @@ -3264,7 +3264,7 @@ struct llama_model_loader { mmap_used.first = std::min(mmap_used.first, w.offs); mmap_used.second = std::max(mmap_used.second, w.offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + w.offs, 0, n_size); + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size); } } else { GGML_ASSERT(w.idx < files.size()); @@ -3287,9 +3287,9 @@ struct llama_model_loader { if (size_done >= size_data) { // unmap offloaded tensors and metadata if (use_mmap) { - for (uint32_t file_no = 0; file_no < mappings.size(); file_no++) { - const auto & mmap_used = mmaps_used[file_no]; - auto & mapping = mappings.at(file_no); + for (uint32_t idx = 0; idx < mappings.size(); idx++) { + const auto & mmap_used = mmaps_used[idx]; + auto & mapping = mappings.at(idx); mapping->unmap_fragment(0, mmap_used.first); if (mmap_used.second != 0) { mapping->unmap_fragment(mmap_used.second, mapping->size); @@ -5143,16 +5143,16 @@ static bool llm_load_tensors( // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { - for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { void * addr = nullptr; size_t first, last; - ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ml.get_mapping_range(&first, &last, &addr, idx, ctx); if (first >= last) { continue; } - ggml_backend_buffer_t buf = 
ggml_backend_cpu_buffer_from_ptr((char *)addr + first, last - first); + ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); if (buf != nullptr) { - bufs.emplace(file_no, buf); + bufs.emplace(idx, buf); #ifdef GGML_USE_CUBLAS if (n_layer >= n_gpu_layers) { ggml_backend_cuda_register_host_buffer( @@ -5165,17 +5165,17 @@ static bool llm_load_tensors( } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { - for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { const size_t max_size = ggml_get_max_tensor_size(ctx); void * addr = nullptr; size_t first, last; - ml.get_mapping_range(&first, &last, &addr, file_no, ctx); + ml.get_mapping_range(&first, &last, &addr, idx, ctx); if (first >= last) { continue; } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); if (buf != nullptr) { - bufs.emplace(file_no, buf); + bufs.emplace(idx, buf); } } } @@ -5189,8 +5189,8 @@ static bool llm_load_tensors( mlock_buf->init(ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } - for (uint32_t file_no = 0; file_no < ml.files.size(); file_no++) { - bufs.emplace(file_no, buf); + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + bufs.emplace(idx, buf); } } } From f9a29735fc6239614afab198fca4ee9b4577923e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 00:25:11 +0100 Subject: [PATCH 13/27] llama_model_loader: fail if any of backend buffer cannot be allocated --- llama.cpp | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2105824af012f..a0f917ce2122f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3232,7 +3232,7 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map bufs_mmap, std::vector> * lmlocks) { + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::map & bufs_mmap, std::vector> * lmlocks) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -5151,16 +5151,17 @@ static bool llm_load_tensors( continue; } ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); - if (buf != nullptr) { - bufs.emplace(idx, buf); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend CPU buffer"); + } + bufs.emplace(idx, buf); #ifdef GGML_USE_CUBLAS - if (n_layer >= n_gpu_layers) { - ggml_backend_cuda_register_host_buffer( - ggml_backend_buffer_get_base(buf), - ggml_backend_buffer_get_size(buf)); - } -#endif + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( + ggml_backend_buffer_get_base(buf), + ggml_backend_buffer_get_size(buf)); } +#endif } } #ifdef GGML_USE_METAL @@ -5174,32 +5175,34 @@ static bool llm_load_tensors( continue; } ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); - if (buf != nullptr) { - bufs.emplace(idx, buf); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend metal buffer"); } + bufs.emplace(idx, buf); } } #endif else { ggml_backend_buffer_t buf = 
ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf != nullptr) { - if (use_mlock && ggml_backend_buffer_is_host(buf)) { - model.mlock_bufs.emplace_back(new llama_mlock); - auto & mlock_buf = model.mlock_bufs.back(); - mlock_buf->init(ggml_backend_buffer_get_base(buf)); - mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); - } - for (uint32_t idx = 0; idx < ml.files.size(); idx++) { - bufs.emplace(idx, buf); - } + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + if (use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_bufs.emplace_back(new llama_mlock); + auto & mlock_buf = model.mlock_bufs.back(); + mlock_buf->init(ggml_backend_buffer_get_base(buf)); + mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); + } + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + bufs.emplace(idx, buf); } } if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } - // indicate that this buffer contains weights - // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight for (auto & buf : bufs) { + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); model.bufs.push_back(buf.second); } From 0fd652eba746179d8299d53463ae836e569c9cf7 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 22 Mar 2024 00:37:01 +0100 Subject: [PATCH 14/27] spacing Co-authored-by: slaren --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index a0f917ce2122f..f9c75cd47745a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2820,7 +2820,7 @@ struct llama_model_loader { std::vector> mappings; // Holds information on a model weights - struct llama_tensor_weights { + struct llama_tensor_weights { uint16_t idx; // source file index size_t offs; // tensor data offset in the original file From 1a179bfc4e6079079e6ab7dbc7d1ddb8c5d74d5b Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 22 Mar 2024 00:38:23 +0100 Subject: [PATCH 15/27] fix loop over pointer Co-authored-by: slaren --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index f9c75cd47745a..f0c187f9a9db1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3040,7 +3040,7 @@ struct llama_model_loader { if (meta) { gguf_free(meta); } - for (auto & ctx : contexts) { + for (auto * ctx : contexts) { ggml_free(ctx); } } From 7cbe1eac78588f2b7f9a6ee0f7f56d0dc68611d7 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:48:15 +0100 Subject: [PATCH 16/27] llama_model_loader: if n_tensors declared not equals to loaded tensors in split, throw an exception instead of asserting --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index f0c187f9a9db1..c129b16404926 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2919,7 +2919,10 @@ struct llama_model_loader { gguf_free(ctx_gguf); } get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - GGML_ASSERT(n_tensors == (int) weights.size()); + int n_tensors_loaded = (int) weights.size(); + if (n_tensors != n_tensors_loaded) { + throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + } LLAMA_LOG_INFO("%s: additional %d GGUFs 
metadata loaded.\n", __func__, n_split); } From 9940df4f11382c740846ad792134fdcab998e94c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:51:21 +0100 Subject: [PATCH 17/27] llama_model_loader: ensure mappings vector has the expected size --- llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.cpp b/llama.cpp index c129b16404926..d2d74b025453b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3175,6 +3175,8 @@ struct llama_model_loader { void init_mappings(bool prefetch = true, std::vector> * mlock_mmaps = nullptr) { if (use_mmap) { + mappings.reserve(files.size()); + mmaps_used.reserve(files.size()); for (const auto & file : files) { auto * mapping = new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()); mmaps_used.emplace_back(std::make_pair(mapping->size, 0)); From ec372c66a4e79d152b89de010e7ca44557cc236f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:52:00 +0100 Subject: [PATCH 18/27] llama_model_loader: use at instead of operator[] if this should never add to the map. --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index d2d74b025453b..decb895f3eefa 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3293,7 +3293,7 @@ struct llama_model_loader { // unmap offloaded tensors and metadata if (use_mmap) { for (uint32_t idx = 0; idx < mappings.size(); idx++) { - const auto & mmap_used = mmaps_used[idx]; + const auto & mmap_used = mmaps_used.at(idx); auto & mapping = mappings.at(idx); mapping->unmap_fragment(0, mmap_used.first); if (mmap_used.second != 0) { From a9e88c6e57311b36a7f0e62c65b0ee2420fced1b Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 06:59:04 +0100 Subject: [PATCH 19/27] llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size. 
--- llama.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index decb895f3eefa..891892f251234 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5137,12 +5137,17 @@ static bool llm_load_tensors( ml.init_mappings(true, &model.mlock_mmaps); // create the backend buffers - std::vector>> ctx_bufs; + std::vector>> ctx_bufs; + + // Ensure we have enough capacity for the maximum backend buffer we will potentially create + size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); + model.bufs.reserve(n_max_backend_buffer); for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; ggml_context * ctx = it.second; - std::map bufs; + std::unordered_map bufs; + bufs.reserve(n_max_backend_buffer); // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers @@ -5159,6 +5164,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error("unable to allocate backend CPU buffer"); } + model.bufs.push_back(buf); bufs.emplace(idx, buf); #ifdef GGML_USE_CUBLAS if (n_layer >= n_gpu_layers) { @@ -5183,6 +5189,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error("unable to allocate backend metal buffer"); } + model.bufs.push_back(buf); bufs.emplace(idx, buf); } } @@ -5192,6 +5199,7 @@ static bool llm_load_tensors( if (buf == nullptr) { throw std::runtime_error("unable to allocate backend buffer"); } + model.bufs.push_back(buf); if (use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); @@ -5209,7 +5217,6 @@ static bool llm_load_tensors( // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf.second); } ctx_bufs.emplace_back(ctx, bufs); From b19af3643f68d2d1b9b27dd7cfd829f9dd33928e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 07:03:14 +0100 Subject: [PATCH 20/27] llama_model_loader: be sure the model mappings has enough capacity before allocating backend buffer --- llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.cpp b/llama.cpp index 891892f251234..3e0aec8f6c266 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5135,9 +5135,11 @@ static bool llm_load_tensors( ml.done_getting_tensors(); ml.init_mappings(true, &model.mlock_mmaps); + model.mappings.reserve(ml.mappings.size()); // create the backend buffers std::vector>> ctx_bufs; + ctx_bufs.reserve(ctx_map.size()); // Ensure we have enough capacity for the maximum backend buffer we will potentially create size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); From 4c04400969b8f81be0a4795781acca510b5d2b74 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 07:07:00 +0100 Subject: [PATCH 21/27] llama_model_loader: fix map -> unordered map --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 3e0aec8f6c266..092eae8f6a120 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3237,7 +3237,7 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, 
llama_progress_callback progress_callback, void * progress_callback_user_data, std::map & bufs_mmap, std::vector> * lmlocks) { + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::unordered_map & bufs_mmap, std::vector> * lmlocks) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; From e474e456ebaa5a169d7ea6d12ddb9a7c4087d971 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 07:48:50 +0100 Subject: [PATCH 22/27] llama_split_prefix: use a clearer version, not pass split path len but dest max len. Co-authored-by: Xuan Son Nguyen --- examples/gguf-split/gguf-split.cpp | 2 +- llama.cpp | 30 ++++++++++++------------------ llama.h | 4 ++-- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 3f582506da86e..f703588e164f6 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -355,7 +355,7 @@ static void gguf_merge(const split_params & split_params) { } // Verify the file naming and extract split_prefix - if (!llama_split_prefix(split_prefix, split_path, strlen(split_path), i_split, n_split)) { + if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { fprintf(stderr, "\n%s: unexpected input file name: %s" " i_split=%d" " n_split=%d\n", __func__, diff --git a/llama.cpp b/llama.cpp index 092eae8f6a120..ee0318feb473a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2888,7 +2888,7 @@ struct llama_model_loader { } char split_prefix[PATH_MAX] = {0}; - if (!llama_split_prefix(split_prefix, fname.c_str(), fname.size(), idx, n_split)) { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) { throw std::runtime_error(format("invalid split file: %s", fname.c_str())); } @@ -14806,25 +14806,19 @@ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * pa return 0; } -LLAMA_API int llama_split_prefix(char * dest, const char * split_path, size_t split_path_len, int split_no, int split_count) { - char split_prefix[PATH_MAX] = {0}; - int split_no_file = 0; - int split_count_file = 0; - const char * split_format = "-00000-of-00000.gguf"; +int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) { + std::string str_split_path(split_path); + char postfix[32]; + sprintf(postfix, "-%05d-of-%05d.gguf", split_no + 1, split_count); + std::string str_postfix(postfix); - if (split_path_len > strlen(split_format) + 1) { - size_t prefix_len = split_path_len - strlen(split_format); - if (prefix_len >= sizeof(split_prefix)) { - prefix_len = sizeof(split_prefix) - 1; // leave room for null terminator - } - strncpy(split_prefix, split_path, prefix_len); - - int n = sscanf(&split_path[0] + strlen(split_prefix), "-%d-of-%d", &split_no_file, &split_count_file); - if (n == 2 && split_no_file - 1 == split_no && split_count_file == split_count) { - strcpy(dest, split_prefix); - return strlen(split_prefix); - } + // check if dest ends with postfix + auto size_prefix = str_split_path.size() - str_postfix.size(); + if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { + strncpy(dest, split_path, std::min(size_prefix, maxlen)); + return size_prefix; } + return 0; } diff --git a/llama.h b/llama.h index c23172c55e328..7e8ac4b62beb5 100644 --- a/llama.h +++ b/llama.h @@ -966,9 +966,9 @@ extern "C" { LLAMA_API 
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. - /// llama_split_prefix(split_prefix, "/models/ggml-model-q4_0-00002-of-00004.gguf", 43, 2, 4) => split_prefix = "/models/ggml-model-q4_0" + /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" // Returns the split_prefix length. - LLAMA_API int llama_split_prefix(char * split_prefix, const char * split_path, size_t split_path_len, int split_no, int split_count); + LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); From 8326607cfe3bb416f40471836af3cc023078ad26 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Mar 2024 10:17:34 +0200 Subject: [PATCH 23/27] llama : minor ggml-ci --- llama.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index ee0318feb473a..b725a1a0095b3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2918,10 +2918,15 @@ struct llama_model_loader { gguf_free(ctx_gguf); } + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); - int n_tensors_loaded = (int) weights.size(); - if (n_tensors != n_tensors_loaded) { - throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + + // sanity check + { + const int n_tensors_loaded = (int) weights.size(); + if (n_tensors != n_tensors_loaded) { + throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + } } LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split); @@ -2930,7 +2935,7 @@ struct llama_model_loader { n_kv = gguf_get_n_kv(meta); n_tensors = weights.size(); - fver = (enum llama_fver ) gguf_get_version(meta); + fver = (enum llama_fver) gguf_get_version(meta); for (auto & w : weights) { n_elements += ggml_nelements(w.tensor); @@ -2960,7 +2965,8 @@ struct llama_model_loader { } if (trace > 0) { - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); + const uint16_t sid = weights.at(i).idx; + LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } From dbc35acff0a6e6d546dadb1288a02ff169428a84 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Mar 2024 10:58:42 +0200 Subject: [PATCH 24/27] llama : introduce some typedef helpers --- llama.cpp | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/llama.cpp b/llama.cpp index b725a1a0095b3..2332e7eccec15 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1110,6 +1110,7 @@ struct llama_file { } } }; +using llama_files = std::vector>; struct llama_mmap { void * addr; @@ -1310,6 +1311,7 @@ struct llama_mmap { } #endif }; +using llama_mmaps = std::vector>; // Represents some region of memory being locked using mlock or VirtualLock; // will automatically unlock on destruction. 
@@ -1459,6 +1461,7 @@ struct llama_mlock { static void raw_unlock(const void * addr, size_t len) {} #endif }; +using llama_mlocks = std::vector>; static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); @@ -2035,11 +2038,11 @@ struct llama_model { std::vector bufs; // model memory mapped files - std::vector> mappings; + llama_mmaps mappings; // objects representing data potentially being locked in memory - std::vector> mlock_bufs; - std::vector> mlock_mmaps; + llama_mlocks mlock_bufs; + llama_mlocks mlock_mmaps; // for quantize-stats only std::vector> tensors_by_name; @@ -2803,6 +2806,8 @@ namespace GGUFMeta { }; } +using llama_buf_map = std::unordered_map; + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -2813,11 +2818,11 @@ struct llama_model_loader { bool use_mmap = false; - std::vector> files; + llama_files files; llama_ftype ftype; llama_fver fver; - std::vector> mappings; + llama_mmaps mappings; // Holds information on a model weights struct llama_tensor_weights { @@ -3009,6 +3014,7 @@ struct llama_model_loader { } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(meta, i); const enum gguf_type type = gguf_get_kv_type(meta, i); @@ -3179,7 +3185,7 @@ struct llama_model_loader { } } - void init_mappings(bool prefetch = true, std::vector> * mlock_mmaps = nullptr) { + void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) { if (use_mmap) { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); @@ -3214,7 +3220,7 @@ struct llama_model_loader { continue; } *first = std::min(*first, w.offs); - *last = std::max(*last, w.offs + ggml_nbytes(tensor)); + *last = std::max(*last, w.offs + ggml_nbytes(tensor)); } } @@ -3243,7 +3249,12 @@ struct llama_model_loader { std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, std::unordered_map & bufs_mmap, std::vector> * lmlocks) { + bool load_all_data( + struct ggml_context * ctx, + llama_buf_map & bufs_mmap, + llama_mlocks * lmlocks, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; @@ -3272,7 +3283,7 @@ struct llama_model_loader { } auto & mmap_used = mmaps_used[w.idx]; - mmap_used.first = std::min(mmap_used.first, w.offs); + mmap_used.first = std::min(mmap_used.first, w.offs); mmap_used.second = std::max(mmap_used.second, w.offs + n_size); } else { ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + w.offs, 0, n_size); @@ -5144,7 +5155,7 @@ static bool llm_load_tensors( model.mappings.reserve(ml.mappings.size()); // create the backend buffers - std::vector>> ctx_bufs; + std::vector> ctx_bufs; ctx_bufs.reserve(ctx_map.size()); // Ensure we have enough capacity for the maximum backend buffer we will potentially create @@ -5153,8 +5164,9 @@ static bool llm_load_tensors( for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - std::unordered_map bufs; + ggml_context * ctx = it.second; + + llama_buf_map bufs; bufs.reserve(n_max_backend_buffer); // only the mmap region containing the tensors in the model is mapped to the backend buffer @@ -5211,16 +5223,18 @@ static bool llm_load_tensors( if 
(use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); - mlock_buf->init(ggml_backend_buffer_get_base(buf)); + mlock_buf->init (ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } for (uint32_t idx = 0; idx < ml.files.size(); idx++) { bufs.emplace(idx, buf); } } + if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } + for (auto & buf : bufs) { // indicate that this buffer contains weights // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight @@ -5260,7 +5274,7 @@ static bool llm_load_tensors( for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; auto & bufs = it.second; - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, bufs, use_mlock ? &model.mlock_mmaps : NULL)) { + if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) { return false; } } From f616b38b6bc0e169b0bb79508d69ac697eb7e7d9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 12:12:13 +0100 Subject: [PATCH 25/27] docs: add model shard in hot topic --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c2f3342f0ff42..a4990e5ad3d75 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 +- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187 ---- From 1f3875995f599f70c9785f6971afa0f7638c61c6 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Fri, 22 Mar 2024 14:44:07 +0100 Subject: [PATCH 26/27] llama_model_loader: put mapping in a unique_ptr from the moment it is allocated Co-authored-by: slaren --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 2332e7eccec15..7da11d6be6f3e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3190,14 +3190,14 @@ struct llama_model_loader { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); for (const auto & file : files) { - auto * mapping = new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()); + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? 
-1 : 0, ggml_is_numa())); mmaps_used.emplace_back(std::make_pair(mapping->size, 0)); - mappings.emplace_back(std::unique_ptr(mapping)); if (mlock_mmaps) { - auto * mlock_mmap = new llama_mlock(); + std::unique_ptr mlock_mmap(new llama_mlock()); mlock_mmap->init(mapping->addr); - mlock_mmaps->emplace_back(std::unique_ptr(mlock_mmap)); + mlock_mmaps->emplace_back(std::move(mlock_mmap)); } + mappings.emplace_back(std::move(mapping)); } } From 764c7afee781c2d2d1966df6214073d467446767 Mon Sep 17 00:00:00 2001 From: ngxson Date: Fri, 22 Mar 2024 15:10:52 +0100 Subject: [PATCH 27/27] fix llama_split_prefix --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7da11d6be6f3e..0af78c6a3899f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14829,13 +14829,13 @@ LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * pa int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) { std::string str_split_path(split_path); char postfix[32]; - sprintf(postfix, "-%05d-of-%05d.gguf", split_no + 1, split_count); + snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); std::string str_postfix(postfix); // check if dest ends with postfix - auto size_prefix = str_split_path.size() - str_postfix.size(); + int size_prefix = str_split_path.size() - str_postfix.size(); if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { - strncpy(dest, split_path, std::min(size_prefix, maxlen)); + snprintf(dest, std::min((size_t) size_prefix, maxlen), "%s", split_path); return size_prefix; }