diff --git a/main.cpp b/main.cpp
index 6dc9ae98036ed..a6660021f3049 100644
--- a/main.cpp
+++ b/main.cpp
@@ -14,6 +14,9 @@
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
 #include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 #endif
 
 #define ANSI_COLOR_RED "\x1b[31m"
@@ -83,14 +86,131 @@ struct llama_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };
 
+#ifndef USE_MMAP
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#define USE_MMAP 1
+#else
+#define USE_MMAP 0
+#endif
+#endif
+
+#if USE_MMAP
+// since std::istrstream is deprecated, reimplement it.
+struct membuf : std::streambuf {
+    membuf(char const* base, size_t size) {
+        char* gptr(const_cast<char*>(base));
+        this->setg(gptr, gptr, gptr + size);
+    }
+};
+struct llama_istream: virtual membuf, std::istream {
+    size_t mapped_size;
+
+    llama_istream(const std::string & fname, std::ios::openmode mode = std::ios::binary) :
+        llama_istream(mmap_file(fname)) {}
+
+    llama_istream(std::tuple<char const*, size_t, size_t> t) :
+        llama_istream(std::get<0>(t), std::get<1>(t), std::get<2>(t)) {}
+
+    llama_istream(char const* base, size_t size, size_t mapped_size) :
+        membuf(base, size),
+        std::istream(static_cast<std::streambuf*>(this)),
+        mapped_size(mapped_size) {
+        if (base == errcontent)
+            setstate(std::ios::failbit);
+    }
+
+    std::char_traits<char>::pos_type seekoff(
+            std::char_traits<char>::off_type off,
+            std::ios_base::seekdir dir,
+            std::ios_base::openmode which = std::ios_base::in) override {
+        if (dir == std::ios_base::cur)
+            gbump(off);
+        else if (dir == std::ios_base::end)
+            setg(eback(), egptr() + off, egptr());
+        else if (dir == std::ios_base::beg)
+            setg(eback(), eback() + off, egptr());
+        return gptr() - eback();
+    }
+
+    std::char_traits<char>::pos_type seekpos(
+            std::char_traits<char>::pos_type sp,
+            std::ios_base::openmode which = std::ios_base::binary) override {
+        return seekoff(sp - std::char_traits<char>::pos_type(std::char_traits<char>::off_type(0)), std::ios_base::beg, which);
+    }
+
+    void close() {
+        char* gptr = const_cast<char*>(this->gptr());
+        if (gptr == errcontent) {
+            fprintf(stderr, "Closing an invalid llama_istream.\n");
+            return;
+        }
+        munmap(gptr, mapped_size);
+    }
+  private:
+    constexpr static char const* errcontent = "";
+
+    static std::tuple<char const*, size_t, size_t> mmap_file(const std::string & fname) {
+        static long pagesize;
+        if (!pagesize)
+            pagesize = sysconf(_SC_PAGESIZE);
+        if (pagesize == -1 || pagesize == 0) {
+            fprintf(stderr, "%s: could not get the OS page size.\n", __func__);
+            return {errcontent, 1, 0};
+        }
+
+        int fd = open(fname.c_str(), O_RDONLY);
+        if (fd == -1) {
+            fprintf(stderr, "%s: failed to open() '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+
+        struct stat st;
+        if (fstat(fd, &st) == -1) {
+            fprintf(stderr, "%s: failed to stat '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+
+        size_t file_size = st.st_size;
+        size_t map_size = (file_size + pagesize - 1) & -pagesize;
+        int prot = PROT_READ;
+        int map = MAP_SHARED;
+        char* file_contents = (char*)mmap(NULL, map_size, prot, map, fd, 0);
+        if (!file_contents || file_contents == MAP_FAILED) {
+            fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+
+#if 1
+        int advice = MADV_SEQUENTIAL | MADV_WILLNEED;
+    #if defined(MADV_HUGEPAGE)
+        advice |= MADV_HUGEPAGE;
+    #endif
+        if (madvise(file_contents, map_size, advice) == -1) {
+            fprintf(stderr, "%s: failed to madvise '%s'\n", __func__, fname.c_str());
+            return {errcontent, 1, 0};
+        }
+#endif
+
+        ::close(fd);
+
+        return std::make_tuple(file_contents, file_size, map_size);
+    }
+
+};
+#else
+using llama_istream = std::ifstream;
+#endif
+
+
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
+    llama_istream fin{fname, std::ios::binary};
+#if !USE_MMAP
     std::vector<char> f_buf(1024*1024);
-
-    auto fin = std::ifstream(fname, std::ios::binary);
     fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
@@ -327,8 +447,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
-        fin = std::ifstream(fname_part, std::ios::binary);
+        llama_istream fin{fname_part, std::ios::binary};
+#if !USE_MMAP
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
         fin.seekg(file_offset);
 
         // load weights
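
For reference, here is a minimal standalone sketch (not part of the patch) of the same technique the diff relies on: mapping a file read-only with mmap and reading it through a std::istream backed by a custom streambuf, as llama_istream does above. The memory_buf helper name and the default "model.bin" path are illustrative placeholders, not names from the patch.

// Minimal sketch: read the first 4 bytes of a file through an mmap-backed istream.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <istream>
#include <streambuf>
#include <string>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Expose an in-memory byte range through the std::streambuf read interface.
struct memory_buf : std::streambuf {
    memory_buf(char* base, size_t size) { setg(base, base, base + size); }
};

int main(int argc, char** argv) {
    const std::string fname = argc > 1 ? argv[1] : "model.bin";  // placeholder path

    int fd = open(fname.c_str(), O_RDONLY);
    if (fd == -1) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) == -1) { perror("fstat"); ::close(fd); return 1; }

    // Map the whole file read-only; the kernel pages it in on demand.
    char* data = (char*)mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) { perror("mmap"); ::close(fd); return 1; }
    ::close(fd);  // the mapping stays valid after the descriptor is closed

    // Read through the mapping with normal istream operations.
    memory_buf buf(data, (size_t)st.st_size);
    std::istream in(&buf);

    uint32_t magic = 0;
    in.read((char*)&magic, sizeof(magic));
    if (in) printf("first 4 bytes of %s: 0x%08x\n", fname.c_str(), magic);

    munmap(data, (size_t)st.st_size);
    return 0;
}

Because the streambuf only hands out pointers into the mapping, reads copy directly from the page cache instead of going through read() into a userspace buffer, which is the point of replacing std::ifstream in the loader.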