llama_model_loader: support multiple split/shard GGUFs (ggerganov#6187)

* split: support in llama_model_loader
* avoid copying the entire vector
  Co-authored-by: slaren <[email protected]>
* split: move llama_tensor_offset to llama_model_loader
* llama_model_loader: PR feedback:
  - use only one gguf_context for metadata only
  - store all ggml_context in a vector as the files and mappings
  - store all weights in a vector along with the source tensor
  - rename ctx_gguf to meta
  - rename ctx_meta to contexts
* avoid copying the entire vector
* Simplify this by making these optional, switch some layer creation tensor optional
  Co-authored-by: Georgi Gerganov <[email protected]>
* Handle optional tensors
  Co-authored-by: Georgi Gerganov <[email protected]>
* llama_model_loader: fail if backend cannot allocate buffer
* fix mmap buffer management
* llama_model_loader: map file to backend buffer only if the allocation succeeds
* llama_model_loader: only map tensors included in the context
* llama_model_loader: minor, use same variable name for consistency, fix spacing in types cast
* llama_model_loader: fail if any backend buffer cannot be allocated
* spacing
  Co-authored-by: slaren <[email protected]>
* fix loop over pointer
  Co-authored-by: slaren <[email protected]>
* llama_model_loader: if the declared n_tensors does not equal the number of tensors loaded in the split, throw an exception instead of asserting
* llama_model_loader: ensure mappings vector has the expected size
* llama_model_loader: use at instead of operator[] if this should never add to the map.
* llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size.
* llama_model_loader: make sure the model mappings have enough capacity before allocating backend buffer
* llama_model_loader: fix map -> unordered map
* llama_split_prefix: use a clearer version, pass the dest max len instead of the split path len.
  Co-authored-by: Xuan Son Nguyen <[email protected]>
* llama : minor ggml-ci
* llama : introduce some typedef helpers
* docs: add model shard in hot topic
* llama_model_loader: put mapping in a unique_ptr from the moment it is allocated
  Co-authored-by: slaren <[email protected]>
* fix llama_split_prefix

---------

Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Xuan Son Nguyen <[email protected]>
hodlen · Apr 3, 2024 · 366624b · 366624b
1 parent d174faf
commit 366624b
Show file tree

Hide file tree

Showing 4 changed files with 411 additions and 223 deletions.
diff --git a/README.md b/README.md
@@ -22,6 +22,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- Support loading sharded model, using `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
 
 ----
 

diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp
@@ -1,31 +1,34 @@
 #include "llama.h"
-#include "ggml.h"
 #include "common.h"
 
 #include <algorithm>
 #include <cmath>
-#include <cstdint>
 #include <cstdlib>
 #include <fstream>
-#include <ios>
 #include <string>
 #include <vector>
 
 #include <stdio.h>
-#include <fcntl.h>
 #include <string.h>
+#include <climits>
+#include <stdexcept>
+
+#if defined(_WIN32)
+    #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
+    #include <io.h>
+#endif
 
 enum split_operation : uint8_t {
     SPLIT_OP_SPLIT,
     SPLIT_OP_MERGE,
 };
 
-static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
-static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
-
-static const int SPLIT_FILENAME_MAX = 256;
-
-static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
@@ -116,13 +119,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
     try {
         if (!split_params_parse_ex(argc, argv, params)) {
             split_print_usage(argv[0]);
-            exit(1);
+            exit(EXIT_FAILURE);
         }
     }
     catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         split_print_usage(argv[0]);
-        exit(1);
+        exit(EXIT_FAILURE);
     }
     return result;
 }
@@ -134,12 +137,6 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-static std::string split_file_name(const std::string & path, int i_split, int n_split) {
-    char f_split[SPLIT_FILENAME_MAX] = {0};
-    snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
-    return std::string(f_split);
-}
-
 struct split_strategy {
     const split_params params;
     std::ifstream & f_input;
@@ -180,19 +177,21 @@ struct split_strategy {
         if (i_split == 0) {
             gguf_set_kv(ctx_out, ctx_gguf);
         }
-        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
-        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
+        gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
 
         // populate the original tensors, so we get an initial metadata
         for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
             struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
             gguf_add_tensor(ctx_out, meta);
         }
 
-        auto split_name = split_file_name(params.output, i_split, n_split);
+        char split_path[PATH_MAX] = {0};
+        llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
 
-        fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
-        fout = std::ofstream(split_name, std::ios::binary);
+        fprintf(stderr, "%s: %s ...", __func__, split_path);
+        fout = std::ofstream(split_path, std::ios::binary);
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
 
         auto meta_size = gguf_get_meta_size(ctx_out);
@@ -250,19 +249,23 @@ static void gguf_split(const split_params & split_params) {
     std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
     if (!f_input.is_open()) {
         fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
-        exit(1);
+        exit(EXIT_FAILURE);
     }
 
     auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
     if (!ctx_gguf) {
         fprintf(stderr, "%s:  failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
-        exit(1);
+        exit(EXIT_FAILURE);
     }
 
     split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
+
+    char first_split_path[PATH_MAX] = {0};
+    llama_split_path(first_split_path, sizeof(first_split_path),
+                     split_params.output.c_str(), strategy.i_split, strategy.n_split);
     fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
             __func__, split_params.input.c_str(),
-            split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
+            first_split_path,
             split_params.n_split_tensors);
 
     strategy.split_start();
@@ -298,7 +301,9 @@ static void gguf_merge(const split_params & split_params) {
     std::vector<ggml_context *> ctx_metas;
     std::vector<gguf_context *> ctx_ggufs;
 
-    std::string split_prefix;
+    char split_path[PATH_MAX] = {0};
+    strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
+    char split_prefix[PATH_MAX] = {0};
 
     // First pass to find KV and tensors metadata
     for (int i_split = 0; i_split < n_split; i_split++) {
@@ -309,87 +314,64 @@ static void gguf_merge(const split_params & split_params) {
             /*.ctx      = */ &ctx_meta,
         };
 
-        auto split_name = split_params.input;
         if (i_split > 0) {
-            split_name = split_file_name(split_prefix, i_split, n_split);
+            llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
         }
-        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
+        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
 
-        auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
+        auto * ctx_gguf = gguf_init_from_file(split_path, params);
         if (!ctx_gguf) {
             fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
-            exit(1);
+            exit(EXIT_FAILURE);
         }
         ctx_ggufs.push_back(ctx_gguf);
         ctx_metas.push_back(ctx_meta);
 
         if (i_split == 0) {
-            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
             if (key_n_split < 0) {
                 fprintf(stderr,
                         "\n%s: input file does not contain %s metadata\n",
                         __func__,
-                        LLM_KV_GENERAL_SPLIT_N_SPLIT);
+                        LLM_KV_SPLIT_COUNT);
                 gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
                 gguf_free(ctx_out);
                 fout.close();
-                exit(1);
+                exit(EXIT_FAILURE);
             }
 
-            n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
             if (n_split < 1) {
                 fprintf(stderr,
                         "\n%s: input file does not contain a valid split count %d\n",
                         __func__,
                         n_split);
                 gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
                 gguf_free(ctx_out);
                 fout.close();
-                exit(1);
+                exit(EXIT_FAILURE);
             }
 
-            // Do not trigger merge if we try to merge again the output
-            gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
-
-            // Set metadata from the first split
-            gguf_set_kv(ctx_out, ctx_gguf);
-        }
-
-        // Verify the file naming
-        {
-            int i_split_file = 0;
-            int n_split_file = 0;
-            const char * i_split_format = "-00000-of-00000.gguf";
-
-            if (split_name.size() < strlen(i_split_format)) {
-                fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
-                for (auto * _ctx_gguf : ctx_ggufs) {
-                    gguf_free(_ctx_gguf);
-                }
+            // Verify the file naming and extract split_prefix
+            if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
+                fprintf(stderr, "\n%s: unexpected input file name: %s"
+                                " i_split=%d"
+                                " n_split=%d\n", __func__,
+                        split_path, i_split, n_split);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
                 gguf_free(ctx_out);
                 fout.close();
-                exit(1);
+                exit(EXIT_FAILURE);
             }
 
-            split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
-
-            const char * split_name_c_str = split_name.c_str();
-            int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
+            // Do not trigger merge if we try to merge again the output
+            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
 
-            if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
-                fprintf(stderr, "\n%s: unexpected input file name: %s"
-                                " i_split=%d i_split_file=%d"
-                                " n_split=%d n_split_file=%d\n", __func__,
-                        split_params.input.c_str(),
-                        i_split, i_split_file,
-                        n_split, n_split_file);
-                for (auto * _ctx_gguf : ctx_ggufs) {
-                    gguf_free(_ctx_gguf);
-                }
-                gguf_free(ctx_out);
-                fout.close();
-                exit(1);
-            }
+            // Set metadata from the first split
+            gguf_set_kv(ctx_out, ctx_gguf);
         }
 
         auto n_tensors = gguf_get_n_tensors(ctx_gguf);
@@ -411,18 +393,19 @@ static void gguf_merge(const split_params & split_params) {
 
     // Write tensors data
     for (int i_split = 0; i_split < n_split; i_split++) {
-        auto split_name = split_file_name(split_prefix, i_split, n_split);
-        std::ifstream f_input(split_name.c_str(), std::ios::binary);
+        llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
+        std::ifstream f_input(split_path, std::ios::binary);
         if (!f_input.is_open()) {
-            fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_name.c_str());
-            for (auto * _ctx_gguf : ctx_ggufs) {
-                gguf_free(_ctx_gguf);
+            fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_path);
+            for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
+                gguf_free(ctx_ggufs[i]);
+                ggml_free(ctx_metas[i]);
             }
             gguf_free(ctx_out);
             fout.close();
-            exit(1);
+            exit(EXIT_FAILURE);
         }
-        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
+        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
 
         auto * ctx_gguf = ctx_ggufs[i_split];
         auto * ctx_meta = ctx_metas[i_split];
@@ -481,8 +464,8 @@ int main(int argc, const char ** argv) {
             break;
         case SPLIT_OP_MERGE: gguf_merge(params);
             break;
-        default:split_print_usage(argv[0]);
-            exit(1);
+        default: split_print_usage(argv[0]);
+            exit(EXIT_FAILURE);
     }
 
     return 0;