Commit 30368d3
Merge branch 'main' into static-model-info
jhen0409 committed Jan 20, 2025
2 parents 341b2e0 + 2cbf1aa
Showing 13 changed files with 105 additions and 70 deletions.
24 changes: 11 additions & 13 deletions .github/workflows/build-release.yml
@@ -20,7 +20,10 @@ on:
jobs:
build-linux-x86_64:
runs-on: ubuntu-22.04
container: nvidia/cuda:12.6.2-devel-ubuntu22.04
steps:
- name: Upgrade git
run: apt-get update && apt-get install -y git
- uses: actions/checkout@v4
with:
submodules: "true"
@@ -54,8 +57,11 @@ jobs:
retention-days: ${{ inputs.artifacts-retention-days }}

build-linux-arm64:
runs-on: ubuntu-22.04
runs-on: ubuntu-22.04-arm
container: nvidia/cuda:12.6.2-devel-ubuntu22.04
steps:
- name: Upgrade git
run: apt-get update && apt-get install -y git
- uses: actions/checkout@v4
with:
submodules: "true"
@@ -72,22 +78,14 @@
- uses: actions/setup-node@v4
with:
node-version: 20
cache: "yarn"
- name: Install yarn
run: npm install -g yarn
- name: Install dependencies
run: yarn install
- name: Setup QEMU
uses: docker/setup-qemu-action@v3
with:
platforms: linux/arm64
- name: Prepare & build
run: |
docker run --rm \
-e CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
-v $(pwd):/${{ github.workspace }} \
-w /${{ github.workspace }} \
--platform linux/arm64 \
arm64v8/ubuntu:latest \
bash -c "./scripts/prepare-linux.sh && ./scripts/build-linux.sh"
bash ./scripts/prepare-linux.sh
bash ./scripts/build-linux.sh
- name: Upload build artifacts
if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
uses: actions/upload-artifact@v4
19 changes: 17 additions & 2 deletions README.md
@@ -4,9 +4,23 @@
[![NPM Version](https://img.shields.io/npm/v/%40fugood%2Fllama.node)](https://www.npmjs.com/package/@fugood/llama.node)
![NPM Downloads](https://img.shields.io/npm/dw/%40fugood%2Fllama.node)

Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
Another Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp) that keeps its API as close to [llama.rn](https://github.com/mybigday/llama.rn) as possible.

[llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
- [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
- [llama.rn](https://github.com/mybigday/llama.rn): React Native binding of llama.cpp

## Platform Support

- macOS
- arm64: CPU and Metal GPU acceleration
- x86_64: CPU only
- Windows (x86_64 and arm64)
- CPU
- GPU acceleration via Vulkan
- Linux (x86_64 and arm64)
- CPU
- GPU acceleration via Vulkan
- GPU acceleration via CUDA

## Installation

@@ -49,6 +63,7 @@ console.log('Result:', text)

- [x] `default`: General usage; no GPU support except macOS (Metal)
- [x] `vulkan`: GPU support via Vulkan (Windows/Linux), but some scenarios may be unstable
- [x] `cuda`: GPU support via CUDA (Linux), limited to specific compute capabilities (x86_64: 8.9, arm64: 8.7)
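
The variant is selected at load time. Below is a minimal usage sketch; the `loadModel` entry point, the `lib_variant` option, and `completion` are assumed from the package's public API rather than confirmed by this diff, so treat the exact names as assumptions:

```ts
// Hedged sketch: choose a prebuilt backend variant when loading a model.
// `loadModel`, `lib_variant`, and `completion` are assumed API names.
import { loadModel } from '@fugood/llama.node'

const context = await loadModel({
  model: 'path/to/model.gguf', // placeholder path
  n_ctx: 2048,
  n_gpu_layers: 99, // offload layers when the variant provides a GPU backend
  lib_variant: 'cuda', // 'default' | 'vulkan' | 'cuda'
})

const { text } = await context.completion({
  prompt: 'Hello, my name is',
  n_predict: 32,
})
console.log('Result:', text)
```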

## License

2 changes: 1 addition & 1 deletion lib/binding.ts
@@ -117,7 +117,7 @@ export interface Module {
LlamaContext: LlamaContext
}

export type LibVariant = 'default' | 'vulkan'
export type LibVariant = 'default' | 'vulkan' | 'cuda'

const setupEnv = (variant?: string) => {
const postfix = variant ? `-${variant}` : ''
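The new `'cuda'` value flows through `setupEnv`, which derives a `-${variant}` postfix from the variant name and uses it to locate the matching prebuilt binary. A rough illustration of that idea follows; `resolveBindingPackage` and the package-name pattern are invented for illustration and are not the actual `binding.ts` code:

```ts
// Illustrative sketch only: map a LibVariant to a platform-specific package
// name via a `-${variant}` postfix, mirroring the postfix logic in setupEnv.
type LibVariant = 'default' | 'vulkan' | 'cuda'

const resolveBindingPackage = (variant?: LibVariant): string => {
  const postfix = variant ? `-${variant}` : ''
  // e.g. "@fugood/node-llama-linux-x64-cuda" for the CUDA build on Linux x86_64
  return `@fugood/node-llama-${process.platform}-${process.arch}${postfix}`
}

console.log(resolveBindingPackage('cuda'))
```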
2 changes: 1 addition & 1 deletion package.json
@@ -1,7 +1,7 @@
{
"name": "@fugood/llama.node",
"access": "public",
"version": "0.3.6",
"version": "0.3.7",
"description": "Llama.cpp for Node.js",
"main": "lib/index.js",
"scripts": {
25 changes: 23 additions & 2 deletions scripts/build-linux.sh
@@ -9,7 +9,28 @@ ARCH=${ARCH:-${1:-$(uname -m)}}
if [ $ARCH == "x86_64" ]; then
yarn clean && yarn build-native
yarn clean && yarn build-native --CDLLAMA_VULKAN=1 --CDVARIANT=vulkan

# Check if CUDA is available
if [ -f /usr/local/cuda/bin/nvcc ]; then
yarn clean && yarn build-native \
--CDLLAMA_CUDA=1 \
--CDVARIANT=cuda \
--CDCMAKE_CUDA_ARCHITECTURES=89 # compute capability 8.9 (GeForce RTX 40 series)
else
echo "CUDA is not available, skipping CUDA build"
fi
else
yarn clean && yarn build-native
yarn clean && yarn build-native --CDLLAMA_VULKAN=1 --CDVULKAN_SDK="$(realpath 'externals/arm64-Vulkan-SDK')" --CDVARIANT=vulkan
yarn clean && yarn build-native --CDGGML_NATIVE=OFF
yarn clean && yarn build-native --CDGGML_NATIVE=OFF --CDLLAMA_VULKAN=1 --CDVULKAN_SDK="$(realpath 'externals/arm64-Vulkan-SDK')" --CDVARIANT=vulkan

# Check if CUDA is available
if [ -f /usr/local/cuda/bin/nvcc ]; then
yarn clean && yarn build-native \
--CDLLAMA_CUDA=1 \
--CDVARIANT=cuda \
--CDGGML_NATIVE=OFF \
--CDCMAKE_CUDA_ARCHITECTURES=87 # compute capability 8.7 (Jetson Orin series)
else
echo "CUDA is not available, skipping CUDA build"
fi
fi
54 changes: 27 additions & 27 deletions scripts/llama.cpp.patch
@@ -1,5 +1,29 @@
diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
index 451826d5..a85ac028 100644
--- a/src/llama.cpp/common/common.cpp
+++ b/src/llama.cpp/common/common.cpp
@@ -1043,6 +1043,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
+ mparams.vocab_only = params.vocab_only;
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
index 3bcc637c..19ae7dad 100644
--- a/src/llama.cpp/common/common.h
+++ b/src/llama.cpp/common/common.h
@@ -189,6 +189,7 @@ struct common_params_vocoder {
};

struct common_params {
+ bool vocab_only = false;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
index 683b90af..e1bf104c 100644
index 6b3641c4..6d6cb27f 100644
--- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -80,7 +80,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -9,29 +33,5 @@ index 683b90af..e1bf104c 100644
- message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
+ list(APPEND ARCH_FLAGS /arch:armv8.7)
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
index 1d2bd932..b5007c66 100644
--- a/src/llama.cpp/common/common.h
+++ b/src/llama.cpp/common/common.h
@@ -183,6 +183,7 @@ struct common_params_vocoder {
};

struct common_params {
+ bool vocab_only = false;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 4096; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
index 20be9291..1bedc55d 100644
--- a/src/llama.cpp/common/common.cpp
+++ b/src/llama.cpp/common/common.cpp
@@ -1017,6 +1017,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
+ mparams.vocab_only = params.vocab_only;
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
5 changes: 4 additions & 1 deletion scripts/prepare-linux.sh
@@ -14,12 +14,15 @@ export DEBIAN_FRONTEND=noninteractive

ARCH=${ARCH:-${1:-$(uname -m)}}

run_as_root apt-get update
run_as_root apt-get install -qy lsb-release wget

if [ $ARCH == "x86_64" ]; then
DISTRO=$(lsb_release -c -s)
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | run_as_root tee /etc/apt/trusted.gpg.d/lunarg.asc
run_as_root wget -qO /etc/apt/sources.list.d/lunarg-vulkan-1.3.280-$DISTRO.list https://packages.lunarg.com/vulkan/1.3.280/lunarg-vulkan-1.3.280-$DISTRO.list
run_as_root apt-get update
run_as_root apt-get install -qy vulkan-sdk
run_as_root apt-get install -qy vulkan-sdk cmake pkg-config build-essential libx11-xcb-dev libxkbcommon-dev libwayland-dev libxrandr-dev
else
run_as_root apt-get update
run_as_root apt-get install -qy curl gnupg2
7 changes: 4 additions & 3 deletions src/EmbeddingWorker.cpp
@@ -9,10 +9,11 @@ void EmbeddingWorker::Execute() {
llama_kv_cache_clear(_sess->context());
auto tokens = ::common_tokenize(_sess->context(), _text, true);
// add SEP if not present
if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
tokens.push_back(llama_token_sep(_sess->model()));
auto vocab = llama_model_get_vocab(_sess->model());
if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) {
tokens.push_back(llama_vocab_sep(vocab));
}
const int n_embd = llama_n_embd(_sess->model());
const int n_embd = llama_model_n_embd(_sess->model());
do {
auto ctx = _sess->context();
int ret =
6 changes: 4 additions & 2 deletions src/LlamaCompletionWorker.cpp
@@ -59,7 +59,9 @@ void LlamaCompletionWorker::Execute() {
size_t n_cur = 0;
size_t n_input = 0;
const auto model = _sess->model();
const bool add_bos = llama_add_bos_token(model);
auto vocab = llama_model_get_vocab(model);

const bool add_bos = llama_vocab_get_add_bos(vocab);
auto ctx = _sess->context();

auto sparams = llama_sampler_chain_default_params();
@@ -130,7 +132,7 @@ void LlamaCompletionWorker::Execute() {
});
}
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id)) {
if (llama_vocab_is_eog(vocab, new_token_id)) {
break;
}
// check for stop words
10 changes: 5 additions & 5 deletions src/LlamaContext.cpp
@@ -169,14 +169,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
llama_backend_init();
llama_numa_init(params.numa);

auto result = common_init_from_params(params);
auto sess = std::make_shared<LlamaSession>(params);

if (result.model == nullptr || result.context == nullptr) {
if (sess->model() == nullptr || sess->context() == nullptr) {
Napi::TypeError::New(env, "Failed to load model")
.ThrowAsJavaScriptException();
}

_sess = std::make_shared<LlamaSession>(result.model, result.context, params);
_sess = sess;
_info = common_params_get_system_info(params);
}

@@ -191,8 +191,8 @@ bool validateModelChatTemplate(const struct llama_model * model) {
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
if (res >= 0) {
llama_chat_message chat[] = {{"user", "test"}};
std::string tmpl = std::string(model_template.data(), model_template.size());
int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
const char * tmpl = llama_model_chat_template(model);
int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
return res > 0;
17 changes: 6 additions & 11 deletions src/common.hpp
@@ -11,8 +11,6 @@
#include <tuple>
#include <vector>

typedef std::unique_ptr<llama_model, decltype(&llama_free_model)> LlamaCppModel;
typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)>
LlamaCppSampling;
typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;
@@ -47,17 +45,17 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,

class LlamaSession {
public:
LlamaSession(llama_model *model, llama_context *ctx, common_params params)
: model_(LlamaCppModel(model, llama_free_model)),
ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
LlamaSession(common_params params)
: params_(params) {
llama_init_ = common_init_from_params(params);
tokens_.reserve(params.n_ctx);
}

~LlamaSession() { dispose(); }

inline llama_context *context() { return ctx_.get(); }
inline llama_context *context() { return llama_init_.context.get(); }

inline llama_model *model() { return model_.get(); }
inline llama_model *model() { return llama_init_.model.get(); }

inline std::vector<llama_token> *tokens_ptr() { return &tokens_; }

@@ -72,13 +70,10 @@ class LlamaSession {
void dispose() {
std::lock_guard<std::mutex> lock(mutex);
tokens_.clear();
ctx_.reset();
model_.reset();
}

private:
LlamaCppModel model_;
LlamaCppContext ctx_;
common_init_result llama_init_;
const common_params params_;
std::vector<llama_token> tokens_{};
std::mutex mutex;
2 changes: 1 addition & 1 deletion src/llama.cpp
Submodule llama.cpp updated 291 files
2 changes: 1 addition & 1 deletion test/__snapshots__/index.test.ts.snap
@@ -444,7 +444,7 @@ exports[`works fine with vocab_only: empty result 1`] = `

exports[`works fine with vocab_only: model info 1`] = `
{
"desc": "llama ?B all F32",
"desc": "",
"isChatTemplateSupported": false,
"metadata": {
"general.architecture": "llama",
