diff --git a/main.cpp b/main.cpp
index 6dc9ae98036ed..a6660021f3049 100644
--- a/main.cpp
+++ b/main.cpp
@@ -14,6 +14,9 @@
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
 #include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 #endif
 
 #define ANSI_COLOR_RED "\x1b[31m"
@@ -83,14 +86,131 @@ struct llama_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };
 
+#ifndef USE_MMAP
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#define USE_MMAP 1
+#else
+#define USE_MMAP 0
+#endif
+#endif
+
+#if USE_MMAP
+// since std::istrstream is deprecated, reimplement it.
+struct membuf : std::streambuf {
+    membuf(char const* base, size_t size) {
+        char* gptr(const_cast<char*>(base));
+        this->setg(gptr, gptr, gptr + size);
+    }
+};
+struct llama_istream: virtual membuf, std::istream {
+    size_t mapped_size;
+
+    llama_istream(const std::string & fname, std::ios::openmode mode = std::ios::binary) :
+        llama_istream(mmap_file(fname)) {}
+
+    llama_istream(std::tuple<char const*, size_t, size_t> t) :
+        llama_istream(std::get<0>(t), std::get<1>(t), std::get<2>(t)) {}
+
+    llama_istream(char const* base, size_t size, size_t mapped_size) :
+        membuf(base, size),
+        std::istream(static_cast<std::streambuf*>(this)),
+        mapped_size(mapped_size) {
+        if (base == errcontent)
+            setstate(std::ios::failbit);
+    }
+
+    std::char_traits<char>::pos_type seekoff(
+            std::char_traits<char>::off_type off,
+            std::ios_base::seekdir dir,
+            std::ios_base::openmode which = std::ios_base::in) override {
+        if (dir == std::ios_base::cur)
+            gbump(off);
+        else if (dir == std::ios_base::end)
+            setg(eback(), egptr() + off, egptr());
+        else if (dir == std::ios_base::beg)
+            setg(eback(), eback() + off, egptr());
+        return gptr() - eback();
+    }
+
+    std::char_traits<char>::pos_type seekpos(
+            std::char_traits<char>::pos_type sp,
+            std::ios_base::openmode which = std::ios_base::binary) override {
+        return seekoff(sp - std::char_traits<char>::pos_type(std::char_traits<char>::off_type(0)), std::ios_base::beg, which);
+    }
+
+    void close() {
+        char* gptr = const_cast<char*>(this->gptr());
+        if (gptr == errcontent) {
+            fprintf(stderr, "Closing an invalid llama_istream.\n");
+            return;
+        }
+        munmap(gptr, mapped_size);
+    }
+  private:
+    constexpr static char const* errcontent = "";
+
+    static std::tuple<char const*, size_t, size_t> mmap_file(const std::string & fname) {
+        static long pagesize;
+        if (!pagesize)
+            pagesize = sysconf(_SC_PAGESIZE);
+        if (pagesize == -1 || pagesize == 0) {
+            fprintf(stderr, "%s: could not get the OS page size.\n", __func__);
+            return {errcontent, 1, 0};
+        }
+
+        int fd = open(fname.c_str(), O_RDONLY);
+        if (fd == -1) {
+            fprintf(stderr, "%s: failed to open() '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+
+        struct stat st;
+        if (fstat(fd, &st) == -1) {
+            fprintf(stderr, "%s: failed to stat '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+
+        size_t file_size = st.st_size;
+        size_t map_size = (file_size + pagesize - 1) & -pagesize;
+        int prot = PROT_READ;
+        int map = MAP_SHARED;
+        char* file_contents = (char*)mmap(NULL, map_size, prot, map, fd, 0);
+        if (!file_contents || file_contents == MAP_FAILED) {
+            fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+
+#if 1
+        int advice = MADV_SEQUENTIAL | MADV_WILLNEED;
+    #if defined(MADV_HUGEPAGE)
+        advice |= MADV_HUGEPAGE;
+    #endif
+        if (madvise(file_contents, map_size, advice) == -1) {
+            fprintf(stderr, "%s: failed to madvise '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+#endif
+
+        ::close(fd);
+
+        return std::make_tuple(file_contents, file_size, map_size);
+    }
+
+};
+#else
+using llama_istream = std::ifstream;
+#endif
+
+
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
+    llama_istream fin{fname, std::ios::binary};
+#if !USE_MMAP
     std::vector<char> f_buf(1024*1024);
-
-    auto fin = std::ifstream(fname, std::ios::binary);
     fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
@@ -327,8 +447,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
-        fin = std::ifstream(fname_part, std::ios::binary);
+        llama_istream fin{fname_part, std::ios::binary};
+#if !USE_MMAP
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
         fin.seekg(file_offset);
 
         // load weights
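
For reference, here is a minimal standalone sketch (not part of the patch) of the same technique the diff relies on: mapping a file read-only with mmap and reading it through a std::istream backed by a custom streambuf, as llama_istream does above. The memory_buf helper name and the default "model.bin" path are illustrative placeholders, not names from the patch.

// Minimal sketch: read the first 4 bytes of a file through an mmap-backed istream.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <istream>
#include <streambuf>
#include <string>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Expose an in-memory byte range through the std::streambuf read interface.
struct memory_buf : std::streambuf {
    memory_buf(char* base, size_t size) { setg(base, base, base + size); }
};

int main(int argc, char** argv) {
    const std::string fname = argc > 1 ? argv[1] : "model.bin";  // placeholder path

    int fd = open(fname.c_str(), O_RDONLY);
    if (fd == -1) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) == -1) { perror("fstat"); ::close(fd); return 1; }

    // Map the whole file read-only; the kernel pages it in on demand.
    char* data = (char*)mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) { perror("mmap"); ::close(fd); return 1; }
    ::close(fd);  // the mapping stays valid after the descriptor is closed

    // Read through the mapping with normal istream operations.
    memory_buf buf(data, (size_t)st.st_size);
    std::istream in(&buf);

    uint32_t magic = 0;
    in.read((char*)&magic, sizeof(magic));
    if (in) printf("first 4 bytes of %s: 0x%08x\n", fname.c_str(), magic);

    munmap(data, (size_t)st.st_size);
    return 0;
}

Because the streambuf only hands out pointers into the mapping, reads copy directly from the page cache instead of going through read() into a userspace buffer, which is the point of replacing std::ifstream in the loader.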