diff --git a/ggml.c b/ggml.c
index ea72778959dc9f..78395b08c1a07f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2696,6 +2696,10 @@ bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
 #endif // GGML_MLOCK_SUPPORT
 
 ////////////////////////////////////////////////////////////////////////////////
+int g_nomem = 0;
+void ggml_nomem(int nomem) {
+    g_nomem = nomem;
+}
 
 struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
@@ -2712,7 +2716,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t size_needed = 0;
 
-    if (data == NULL) {
+    if (data == NULL && !g_nomem) {
         size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
         for (int i = 1; i < n_dims; i++) {
             size_needed *= ne[i];
@@ -2796,7 +2800,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_runs    =*/ 0,
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
-        /*.data         =*/ data == NULL ? (void *)(result + 1) : data,
+        /*.data         =*/ (data == NULL && !g_nomem) ? (void *)(result + 1) : data,
         /*.pad          =*/ { 0 },
     };
 
diff --git a/ggml.h b/ggml.h
index 335230f9f0bb26..351dcb4cdbbccc 100644
--- a/ggml.h
+++ b/ggml.h
@@ -346,6 +346,8 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 bool ggml_mlock_supported(void);
 bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
 
+void ggml_nomem(int nomem);
+
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
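The two hunks above add a process-wide `g_nomem` flag: while it is set, `ggml_new_tensor_impl` neither reserves space in the context nor points `tensor->data` at the context's buffer, so the caller can aim the pointer at memory it manages itself. A condensed sketch of that calling pattern (the helper below is hypothetical, not part of the patch):

```c
#include <stddef.h>

#include "ggml.h"

// Hypothetical helper, for illustration only: create a 2-D tensor whose data
// lives in an externally managed buffer (e.g. an mmapped model file) rather
// than inside the ggml context.
static struct ggml_tensor * new_mapped_tensor_2d(
        struct ggml_context * ctx, enum ggml_type type,
        int ne0, int ne1, char * mm, size_t offset) {
    ggml_nomem(1);  // stop ggml from allocating tensor data in ctx
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, type, ne0, ne1);
    ggml_nomem(0);  // restore normal allocation for subsequent tensors

    t->data = mm + offset;  // point the tensor at the mapped region
    return t;
}
```

Because the flag is plain global state rather than a per-context option, callers have to remember to reset it, which is what `llama_model_load` does in the diff below.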
diff --git a/llama.cpp b/llama.cpp
index ee7eb8ea7cf576..3d08377786371f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12,6 +12,10 @@
 #include <cassert>
 #include <cstring>
 
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -452,36 +456,37 @@ static bool llama_model_load(
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
-
     {
         const auto & hparams = model.hparams;
 
         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
+        if (n_parts > 1) {
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
+            ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
 
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+        }
 
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
+        // this is no longer stored in this context
+        //ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
+        //ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
@@ -533,6 +538,9 @@ static bool llama_model_load(
 
         model.layers.resize(n_layer);
 
+        if (n_parts == 1)
+            ggml_nomem(1); // hack to stop ggml from allocating memory for these tensors
+
         model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
 
         model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -576,6 +584,9 @@ static bool llama_model_load(
         }
     }
 
+    if (n_parts == 1)
+        ggml_nomem(0);
+
     const size_t file_offset = fin.tellg();
 
     fin.close();
@@ -600,6 +611,17 @@ static bool llama_model_load(
         fin = std::ifstream(fname_part, std::ios::binary);
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
 
+        // mmap support
+        int fd = open(fname.c_str(), O_RDONLY);
+        size_t len = lseek(fd, 0, SEEK_END);
+        char* mm = (char*)mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+        if (mm == MAP_FAILED) {
+            perror("mmap");
+            mm = NULL;
+        }
+        close(fd);
+        //
+
         fin.seekg(0, fin.end);
 
         const size_t file_size = fin.tellg();
@@ -736,13 +758,23 @@ static bool llama_model_load(
                 }
 
                 if (part_id == 0) {
-                    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    if (mm == NULL) {
+                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    }
+                    else {
+                        fprintf(stderr, "tensor mmaped: %s\n", name.c_str());
+                        off_t offset = fin.tellg();
+                        tensor->data = mm + offset;
+                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                    }
                 } else {
+                    fprintf(stderr, "tensor skipped: %s\n", name.c_str());
                     fin.seekg(ggml_nbytes(tensor), std::ios::cur);
                 }
 
-                total_size += ggml_nbytes(tensor);
+                //total_size += ggml_nbytes(tensor);
             } else {
+                fprintf(stderr, "tensor not mmaped: %s\n", name.c_str());
                 if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
                     fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                             __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
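The loading change itself is the classic read-only file-mapping pattern: map the whole model file once, then point each tensor's `data` at the offset where its payload starts instead of copying the bytes through `std::ifstream`. A minimal standalone sketch of that pattern (hypothetical helper name; error handling kept to roughly what the patch does):

```c
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

// Hypothetical helper mirroring the patch: map an entire file read-only and
// return the base pointer, or NULL on failure.
static char * map_model_file(const char * path, size_t * out_len) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) {
        perror("open");
        return NULL;
    }
    off_t len = lseek(fd, 0, SEEK_END);        // file size via seek-to-end
    char * mm = (char *) mmap(NULL, (size_t) len, PROT_READ, MAP_SHARED, fd, 0);
    close(fd);                                 // the mapping stays valid after close
    if (mm == MAP_FAILED) {
        perror("mmap");
        return NULL;
    }
    if (out_len) {
        *out_len = (size_t) len;
    }
    return mm;
}
```

Because the mapping is `PROT_READ`/`MAP_SHARED`, pages are faulted in lazily and shared between processes, which is why the patched load appears near-instant; the trade-offs are that the mapped tensor data is read-only and that the mapping is never released (the patch adds no `munmap`).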