feat: sync llama.cpp #31

Merged 6 commits on Nov 10, 2023.
android/src/main/CMakeLists.txt (4 changes: 2 additions, 2 deletions)
@@ -12,7 +12,7 @@ set(
${RNLLAMA_LIB_DIR}/ggml-alloc.c
${RNLLAMA_LIB_DIR}/ggml-backend.c
${RNLLAMA_LIB_DIR}/ggml.c
-${RNLLAMA_LIB_DIR}/k_quants.c
+${RNLLAMA_LIB_DIR}/ggml-quants.c
${RNLLAMA_LIB_DIR}/common.cpp
${RNLLAMA_LIB_DIR}/grammar-parser.cpp
${RNLLAMA_LIB_DIR}/sampling.cpp
@@ -32,7 +32,7 @@ function(build_library target_name)

target_link_libraries(${target_name} ${LOG_LIB} android)

-target_compile_options(${target_name} PRIVATE -DLM_GGML_USE_K_QUANTS -pthread)
+target_compile_options(${target_name} PRIVATE -pthread)

if (${target_name} STREQUAL "rnllama_v8fp16_va")
target_compile_options(${target_name} PRIVATE -march=armv8.4-a+fp16+dotprod)
cpp/build-info.h (9 changes: 0 additions, 9 deletions)

This file was deleted.
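
Note: with cpp/build-info.h removed, the build metadata is expected to come from the extern declarations added to cpp/common.h further down (LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET), which a build-generated source file would define. The snippet below is a minimal, self-contained sketch of what such a generated file could look like, with a throwaway main() that mirrors the updated print_build_info() macro; the values and the main() are placeholders for illustration, not the project's actual generated file.

```cpp
// Sketch of a generated build-info translation unit (placeholder values only).
// The extern declarations it satisfies are the ones added to cpp/common.h in this PR.
#include <cstdio>

int         LLAMA_BUILD_NUMBER = 0;          // placeholder: real value injected by the build system
char const *LLAMA_COMMIT       = "unknown";  // placeholder commit hash
char const *LLAMA_COMPILER     = "unknown";  // placeholder compiler string
char const *LLAMA_BUILD_TARGET = "unknown";  // placeholder target triple

int main() {
    // The same two lines the updated print_build_info() macro expands to.
    std::fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
    std::fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
    return 0;
}
```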

cpp/common.cpp (163 changes: 130 additions, 33 deletions)

Large diffs are not rendered by default.

cpp/common.h (61 changes: 40 additions, 21 deletions)
@@ -9,6 +9,7 @@
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"

+#include <cmath>
#include <string>
#include <vector>
#include <random>
@@ -25,35 +26,51 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

-#define print_build_info() do { \
-fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
-fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
+#define print_build_info() do { \
+fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)

+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;

//
// CLI argument parsing
//
int32_t get_num_physical_cores();

struct gpt_params {
uint32_t seed = -1; // RNG seed

int32_t n_threads = get_num_physical_cores();
-int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-int32_t n_predict = -1; // new tokens to predict
-int32_t n_ctx = 512; // context size
-int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-int32_t n_keep = 0; // number of tokens to keep from initial prompt
-int32_t n_draft = 16; // number of tokens to draft during speculative decoding
-int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
-int32_t n_parallel = 1; // number of parallel sequences to decode
-int32_t n_sequences = 1; // number of sequences to decode
-int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-int32_t n_beams = 0; // if non-zero then use beam search of given width.
-float rope_freq_base = 0.0f; // RoPE base frequency
-float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+int32_t n_predict = -1; // new tokens to predict
+int32_t n_ctx = 512; // context size
+int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+int32_t n_keep = 0; // number of tokens to keep from initial prompt
+int32_t n_draft = 16; // number of tokens to draft during speculative decoding
+int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+int32_t n_parallel = 1; // number of parallel sequences to decode
+int32_t n_sequences = 1; // number of sequences to decode
+float p_accept = 0.5f; // speculative decoding accept probability
+float p_split = 0.1f; // speculative decoding split probability
+int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+int32_t n_beams = 0; // if non-zero then use beam search of given width.
+float rope_freq_base = 0.0f; // RoPE base frequency
+float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+float yarn_beta_fast = 32.0f; // YaRN low correction dim
+float yarn_beta_slow = 1.0f; // YaRN high correction dim
+int32_t yarn_orig_ctx = 0; // YaRN original context length
+int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+// pinging @cebtenzzre

// // sampling parameters
struct llama_sampling_params sparams;
@@ -77,7 +94,7 @@ struct gpt_params {
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
@@ -110,6 +127,8 @@ struct gpt_params {
std::string image = ""; // path to an image file
};

+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
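
For orientation, the sketch below shows how the new gpt_params members added in this diff (the speculative-decoding probabilities and the YaRN fields) might be set by calling code. It assumes this repo's cpp/common.h is on the include path and that the project's common sources are linked; the numeric values are illustrative only, not recommendations from this PR.

```cpp
// Illustrative only: touch the gpt_params fields introduced by this sync.
#include "common.h"  // assumes the cpp/ directory of this repo is on the include path

#include <cstdio>

int main() {
    gpt_params params;

    // Speculative decoding knobs added in this diff.
    params.p_accept = 0.5f;   // accept probability
    params.p_split  = 0.1f;   // split probability

    // YaRN context-extension parameters added in this diff.
    params.yarn_ext_factor  = 1.0f;   // extrapolation mix factor
    params.yarn_attn_factor = 1.0f;   // magnitude scaling factor
    params.yarn_beta_fast   = 32.0f;  // low correction dim
    params.yarn_beta_slow   = 1.0f;   // high correction dim
    params.yarn_orig_ctx    = 4096;   // original context length (illustrative value)

    std::printf("yarn_orig_ctx = %d\n", params.yarn_orig_ctx);
    return 0;
}
```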
cpp/ggml-alloc.c (21 changes: 12 additions, 9 deletions)
@@ -378,9 +378,13 @@ static bool lm_ggml_op_can_inplace(enum lm_ggml_op op) {
}
}

-static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view) {
+static void init_view(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor * view, bool update_backend) {
assert(view->view_src != NULL && view->view_src->data != NULL);
-view->backend = view->view_src->backend;
+
+if (update_backend) {
+view->backend = view->view_src->backend;
+}

view->buffer = view->view_src->buffer;
view->data = (char *)view->view_src->data + view->view_offs;

@@ -394,7 +398,7 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (lm_ggml_is_view(node)) {
-init_view(alloc, node);
+init_view(alloc, node, true);
} else {
// see if we can reuse a parent's buffer (inplace)
if (lm_ggml_op_can_inplace(node->op)) {
@@ -424,15 +428,14 @@ static void allocate_node(struct lm_ggml_allocr * alloc, struct lm_ggml_tensor *
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->view_src = view_src;
view_src_hn->n_views += 1;
-init_view(alloc, node);
+init_view(alloc, node, false);
return;
}
-}
-else {
+} else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->view_src = parent;
p_hn->n_views += 1;
-init_view(alloc, node);
+init_view(alloc, node, false);
return;
}
}
@@ -463,7 +466,7 @@ size_t lm_ggml_allocr_alloc_graph_n(
hash_get(ht, view_src)->n_views += 1;
if (node->buffer == NULL && node->data != NULL) {
// view of a pre-allocated tensor, didn't call init_view() yet
-init_view(alloc, node);
+init_view(alloc, node, true);
}
}

@@ -474,7 +477,7 @@
}
hash_get(ht, parent)->n_children += 1;
if (lm_ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-init_view(alloc, parent);
+init_view(alloc, parent, true);
}
}
}
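
The ggml-alloc.c change above threads an update_backend flag through init_view(): views created explicitly inherit the backend of their source tensor, while nodes that merely reuse a parent's buffer in place keep the backend already assigned to them. The self-contained sketch below models that distinction with hypothetical Tensor and init_view names; it is not llama.cpp code, just an illustration of the flag's effect.

```cpp
// Minimal model (not llama.cpp code) of the update_backend behaviour added in this PR.
#include <cassert>
#include <cstdio>

struct Tensor {
    int     backend  = 0;        // which backend the tensor is assigned to
    Tensor *view_src = nullptr;  // tensor this one is a view of (if any)
    void   *data     = nullptr;  // underlying storage
};

// Mirrors the patched signature: the caller decides whether the view
// should inherit the backend of its source tensor.
static void init_view(Tensor *view, bool update_backend) {
    assert(view->view_src != nullptr && view->view_src->data != nullptr);
    if (update_backend) {
        view->backend = view->view_src->backend;  // explicit views inherit the source backend
    }
    // In-place reuse keeps whatever backend the node already had.
    view->data = view->view_src->data;
}

int main() {
    char buf[16];
    Tensor src;
    src.backend = 1;
    src.data    = buf;

    Tensor user_view;                 // e.g. a user-created view: inherit the backend
    user_view.view_src = &src;

    Tensor inplace_node;              // e.g. in-place reuse of a parent buffer
    inplace_node.view_src = &src;
    inplace_node.backend  = 2;        // node already has its own backend assignment

    init_view(&user_view, /*update_backend=*/true);
    init_view(&inplace_node, /*update_backend=*/false);

    // Prints "user view backend: 1, in-place node backend: 2".
    std::printf("user view backend: %d, in-place node backend: %d\n",
                user_view.backend, inplace_node.backend);
    return 0;
}
```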