diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index 2aebe93..44eecd6 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -20,7 +20,10 @@ on: jobs: build-linux-x86_64: runs-on: ubuntu-22.04 + container: nvidia/cuda:12.6.2-devel-ubuntu22.04 steps: + - name: Upgrade git + run: apt-get update && apt-get install -y git - uses: actions/checkout@v4 with: submodules: "true" @@ -54,8 +57,11 @@ jobs: retention-days: ${{ inputs.artifacts-retention-days }} build-linux-arm64: - runs-on: ubuntu-22.04 + runs-on: ubuntu-22.04-arm + container: nvidia/cuda:12.6.2-devel-ubuntu22.04 steps: + - name: Upgrade git + run: apt-get update && apt-get install -y git - uses: actions/checkout@v4 with: submodules: "true" @@ -72,22 +78,14 @@ jobs: - uses: actions/setup-node@v4.0.2 with: node-version: 20 - cache: "yarn" + - name: Install yarn + run: npm install -g yarn - name: Install dependencies run: yarn install - - name: Setup QEMU - uses: docker/setup-qemu-action@v3 - with: - platforms: linux/arm64 - name: Prepare & build run: | - docker run --rm \ - -e CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \ - -v $(pwd):/${{ github.workspace }} \ - -w /${{ github.workspace }} \ - --platform linux/arm64 \ - arm64v8/ubuntu:latest \ - bash -c "./scripts/prepare-linux.sh && ./scripts/build-linux.sh" + bash ./scripts/prepare-linux.sh + bash ./scripts/build-linux.sh - name: Upload build artifacts if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES' uses: actions/upload-artifact@v4 diff --git a/README.md b/README.md index 0b492a9..5743442 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,23 @@ [![NPM Version](https://img.shields.io/npm/v/%40fugood%2Fllama.node)](https://www.npmjs.com/package/@fugood/llama.node) ![NPM Downloads](https://img.shields.io/npm/dw/%40fugood%2Fllama.node) -Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). 
+Another Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp) that keeps its API as close as possible to [llama.rn](https://github.com/mybigday/llama.rn). -[llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ +- [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ +- [llama.rn](https://github.com/mybigday/llama.rn): React Native binding of llama.cpp + +## Platform Support + +- macOS + - arm64: CPU and Metal GPU acceleration + - x86_64: CPU only +- Windows (x86_64 and arm64) + - CPU + - GPU acceleration via Vulkan +- Linux (x86_64 and arm64) + - CPU + - GPU acceleration via Vulkan + - GPU acceleration via CUDA ## Installation @@ -49,6 +63,7 @@ console.log('Result:', text) - [x] `default`: General usage, not support GPU except macOS (Metal) - [x] `vulkan`: Support GPU Vulkan (Windows/Linux), but some scenario might unstable +- [x] `cuda`: Support GPU CUDA (Linux), but only for limited compute capabilities (x86_64: 8.9, arm64: 8.7) ## License diff --git a/lib/binding.ts b/lib/binding.ts index 83e60fc..def14bf 100644 --- a/lib/binding.ts +++ b/lib/binding.ts @@ -117,7 +117,7 @@ export interface Module { LlamaContext: LlamaContext } -export type LibVariant = 'default' | 'vulkan' +export type LibVariant = 'default' | 'vulkan' | 'cuda' const setupEnv = (variant?: string) => { const postfix = variant ? 
`-${variant}` : '' diff --git a/package.json b/package.json index a27173b..c5cffd9 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@fugood/llama.node", "access": "public", - "version": "0.3.6", + "version": "0.3.7", "description": "Llama.cpp for Node.js", "main": "lib/index.js", "scripts": { diff --git a/scripts/build-linux.sh b/scripts/build-linux.sh index 943f519..ec8c3c3 100755 --- a/scripts/build-linux.sh +++ b/scripts/build-linux.sh @@ -9,7 +9,28 @@ ARCH=${ARCH:-${1:-$(uname -m)}} if [ $ARCH == "x86_64" ]; then yarn clean && yarn build-native yarn clean && yarn build-native --CDLLAMA_VULKAN=1 --CDVARIANT=vulkan + + # Check CUDA is available + if [ -f /usr/local/cuda/bin/nvcc ]; then + yarn clean && yarn build-native \ + --CDLLAMA_CUDA=1 \ + --CDVARIANT=cuda \ + --CDCMAKE_CUDA_ARCHITECTURES=89 # > GeForce RTX 40 series + else + echo "CUDA is not available, skipping CUDA build" + fi else - yarn clean && yarn build-native - yarn clean && yarn build-native --CDLLAMA_VULKAN=1 --CDVULKAN_SDK="$(realpath 'externals/arm64-Vulkan-SDK')" --CDVARIANT=vulkan + yarn clean && yarn build-native --CDGGML_NATIVE=OFF + yarn clean && yarn build-native --CDGGML_NATIVE=OFF --CDLLAMA_VULKAN=1 --CDVULKAN_SDK="$(realpath 'externals/arm64-Vulkan-SDK')" --CDVARIANT=vulkan + + # Check CUDA is available + if [ -f /usr/local/cuda/bin/nvcc ]; then + yarn clean && yarn build-native \ + --CDLLAMA_CUDA=1 \ + --CDVARIANT=cuda \ + --CDGGML_NATIVE=OFF \ + --CDCMAKE_CUDA_ARCHITECTURES=87 # > Jetson Orin series + else + echo "CUDA is not available, skipping CUDA build" + fi fi diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 0667a20..fc292df 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,5 +1,29 @@ +diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp +index 451826d5..a85ac028 100644 +--- a/src/llama.cpp/common/common.cpp ++++ b/src/llama.cpp/common/common.cpp +@@ -1043,6 +1043,7 @@ struct 
llama_model_params common_model_params_to_llama(common_params & params) { + if (params.n_gpu_layers != -1) { + mparams.n_gpu_layers = params.n_gpu_layers; + } ++ mparams.vocab_only = params.vocab_only; + mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; + mparams.tensor_split = params.tensor_split; +diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h +index 3bcc637c..19ae7dad 100644 +--- a/src/llama.cpp/common/common.h ++++ b/src/llama.cpp/common/common.h +@@ -189,6 +189,7 @@ struct common_params_vocoder { + }; + + struct common_params { ++ bool vocab_only = false; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 4096; // context size + int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt -index 683b90af..e1bf104c 100644 +index 6b3641c4..6d6cb27f 100644 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt @@ -80,7 +80,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) @@ -9,29 +33,5 @@ index 683b90af..e1bf104c 100644 - message(FATAL_ERROR "MSVC is not supported for ARM, use clang") + list(APPEND ARCH_FLAGS /arch:armv8.7) else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") -diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h -index 1d2bd932..b5007c66 100644 ---- a/src/llama.cpp/common/common.h -+++ b/src/llama.cpp/common/common.h -@@ -183,6 +183,7 @@ struct common_params_vocoder { - }; - - struct common_params { -+ bool vocab_only = false; - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 4096; // context size - int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) -diff --git a/src/llama.cpp/common/common.cpp 
b/src/llama.cpp/common/common.cpp -index 20be9291..1bedc55d 100644 ---- a/src/llama.cpp/common/common.cpp -+++ b/src/llama.cpp/common/common.cpp -@@ -1017,6 +1017,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { - if (params.n_gpu_layers != -1) { - mparams.n_gpu_layers = params.n_gpu_layers; - } -+ mparams.vocab_only = params.vocab_only; - mparams.rpc_servers = params.rpc_servers.c_str(); - mparams.main_gpu = params.main_gpu; - mparams.split_mode = params.split_mode; + check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") diff --git a/scripts/prepare-linux.sh b/scripts/prepare-linux.sh index 4702306..24f8e7d 100755 --- a/scripts/prepare-linux.sh +++ b/scripts/prepare-linux.sh @@ -14,12 +14,15 @@ export DEBIAN_FRONTEND=noninteractive ARCH=${ARCH:-${1:-$(uname -m)}} +run_as_root apt-get update +run_as_root apt-get install -qy lsb-release wget + if [ $ARCH == "x86_64" ]; then DISTRO=$(lsb_release -c -s) wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | run_as_root tee /etc/apt/trusted.gpg.d/lunarg.asc run_as_root wget -qO /etc/apt/sources.list.d/lunarg-vulkan-1.3.280-$DISTRO.list https://packages.lunarg.com/vulkan/1.3.280/lunarg-vulkan-1.3.280-$DISTRO.list run_as_root apt-get update - run_as_root apt-get install -qy vulkan-sdk + run_as_root apt-get install -qy vulkan-sdk cmake pkg-config build-essential libx11-xcb-dev libxkbcommon-dev libwayland-dev libxrandr-dev else run_as_root apt-get update run_as_root apt-get install -qy curl gnupg2 diff --git a/src/EmbeddingWorker.cpp b/src/EmbeddingWorker.cpp index 0ad8d35..86da8d2 100644 --- a/src/EmbeddingWorker.cpp +++ b/src/EmbeddingWorker.cpp @@ -9,10 +9,11 @@ void EmbeddingWorker::Execute() { llama_kv_cache_clear(_sess->context()); auto tokens = ::common_tokenize(_sess->context(), _text, true); // add SEP if not present - if (tokens.empty() || tokens.back() != 
llama_token_sep(_sess->model())) { - tokens.push_back(llama_token_sep(_sess->model())); + auto vocab = llama_model_get_vocab(_sess->model()); + if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) { + tokens.push_back(llama_vocab_sep(vocab)); } - const int n_embd = llama_n_embd(_sess->model()); + const int n_embd = llama_model_n_embd(_sess->model()); do { auto ctx = _sess->context(); int ret = diff --git a/src/LlamaCompletionWorker.cpp b/src/LlamaCompletionWorker.cpp index 2ff96d3..e21f310 100644 --- a/src/LlamaCompletionWorker.cpp +++ b/src/LlamaCompletionWorker.cpp @@ -59,7 +59,9 @@ void LlamaCompletionWorker::Execute() { size_t n_cur = 0; size_t n_input = 0; const auto model = _sess->model(); - const bool add_bos = llama_add_bos_token(model); + auto vocab = llama_model_get_vocab(model); + + const bool add_bos = llama_vocab_get_add_bos(vocab); auto ctx = _sess->context(); auto sparams = llama_sampler_chain_default_params(); @@ -130,7 +132,7 @@ void LlamaCompletionWorker::Execute() { }); } // is it an end of generation? 
- if (llama_token_is_eog(model, new_token_id)) { + if (llama_vocab_is_eog(vocab, new_token_id)) { break; } // check for stop words diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp index 546b35c..35e77f6 100644 --- a/src/LlamaContext.cpp +++ b/src/LlamaContext.cpp @@ -169,14 +169,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info) llama_backend_init(); llama_numa_init(params.numa); - auto result = common_init_from_params(params); + auto sess = std::make_shared(params); - if (result.model == nullptr || result.context == nullptr) { + if (sess->model() == nullptr || sess->context() == nullptr) { Napi::TypeError::New(env, "Failed to load model") .ThrowAsJavaScriptException(); } - _sess = std::make_shared(result.model, result.context, params); + _sess = sess; _info = common_params_get_system_info(params); } @@ -191,8 +191,8 @@ bool validateModelChatTemplate(const struct llama_model * model) { int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); if (res >= 0) { llama_chat_message chat[] = {{"user", "test"}}; - std::string tmpl = std::string(model_template.data(), model_template.size()); - int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0); + const char * tmpl = llama_model_chat_template(model); + int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); return chat_res > 0; } return res > 0; diff --git a/src/common.hpp b/src/common.hpp index ff2230d..f5cd71f 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -11,8 +11,6 @@ #include #include -typedef std::unique_ptr LlamaCppModel; -typedef std::unique_ptr LlamaCppContext; typedef std::unique_ptr LlamaCppSampling; typedef std::unique_ptr LlamaCppBatch; @@ -47,17 +45,17 @@ constexpr T get_option(const Napi::Object &options, const std::string &name, class LlamaSession { public: - LlamaSession(llama_model *model, llama_context *ctx, common_params params) - : 
model_(LlamaCppModel(model, llama_free_model)), - ctx_(LlamaCppContext(ctx, llama_free)), params_(params) { + LlamaSession(common_params params) + : params_(params) { + llama_init_ = common_init_from_params(params); tokens_.reserve(params.n_ctx); } ~LlamaSession() { dispose(); } - inline llama_context *context() { return ctx_.get(); } + inline llama_context *context() { return llama_init_.context.get(); } - inline llama_model *model() { return model_.get(); } + inline llama_model *model() { return llama_init_.model.get(); } inline std::vector *tokens_ptr() { return &tokens_; } @@ -72,13 +70,10 @@ class LlamaSession { void dispose() { std::lock_guard lock(mutex); tokens_.clear(); - ctx_.reset(); - model_.reset(); } private: - LlamaCppModel model_; - LlamaCppContext ctx_; + common_init_result llama_init_; const common_params params_; std::vector tokens_{}; std::mutex mutex; diff --git a/src/llama.cpp b/src/llama.cpp index 0a11f8b..92bc493 160000 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1 +1 @@ -Subproject commit 0a11f8b7b5c39fdf6e91ef9674bc68ff08681af7 +Subproject commit 92bc493917d43b83e592349e138b54c90b1c3ea7 diff --git a/test/__snapshots__/index.test.ts.snap b/test/__snapshots__/index.test.ts.snap index e9329ff..902f77e 100644 --- a/test/__snapshots__/index.test.ts.snap +++ b/test/__snapshots__/index.test.ts.snap @@ -444,7 +444,7 @@ exports[`works fine with vocab_only: empty result 1`] = ` exports[`works fine with vocab_only: model info 1`] = ` { - "desc": "llama ?B all F32", + "desc": "", "isChatTemplateSupported": false, "metadata": { "general.architecture": "llama",