From 9c6199acf1d06cafdfb621a76ee817033f97f1c2 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <developer@jhen.me>
Date: Sun, 19 Jan 2025 18:23:54 +0800
Subject: [PATCH 1/6] feat: use new linux arm runner for build
 build-linux-arm64 & off GGML_NATIVE (#75)

* ci: use new linux arm runner for build build-linux-arm64

* ci: install yarn by self

* fix: set GGML_NATIVE=OFF
---
 .github/workflows/build-release.yml | 18 +++++-------------
 scripts/build-linux.sh              |  4 ++--
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index 2aebe93..2d73546 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -54,7 +54,7 @@ jobs:
           retention-days: ${{ inputs.artifacts-retention-days }}
 
   build-linux-arm64:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-22.04-arm
     steps:
       - uses: actions/checkout@v4
         with:
@@ -72,22 +72,14 @@ jobs:
       - uses: actions/setup-node@v4.0.2
         with:
           node-version: 20
-          cache: "yarn"
+      - name: Install yarn
+        run: npm install -g yarn
       - name: Install dependencies
         run: yarn install
-      - name: Setup QEMU
-        uses: docker/setup-qemu-action@v3
-        with:
-          platforms: linux/arm64
       - name: Prepare & build
         run: |
-          docker run --rm \
-            -e CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
-            -v $(pwd):/${{ github.workspace }} \
-            -w /${{ github.workspace }} \
-            --platform linux/arm64 \
-            arm64v8/ubuntu:latest \
-            bash -c "./scripts/prepare-linux.sh && ./scripts/build-linux.sh"
+          bash ./scripts/prepare-linux.sh
+          bash ./scripts/build-linux.sh
       - name: Upload build artifacts
         if: github.event.inputs.upload-artifacts == 'YES' || inputs.upload-artifacts == 'YES'
         uses: actions/upload-artifact@v4
diff --git a/scripts/build-linux.sh b/scripts/build-linux.sh
index 943f519..ccf4818 100755
--- a/scripts/build-linux.sh
+++ b/scripts/build-linux.sh
@@ -10,6 +10,6 @@ if [ $ARCH == "x86_64" ]; then
   yarn clean && yarn build-native
   yarn clean && yarn build-native --CDLLAMA_VULKAN=1 --CDVARIANT=vulkan
 else
-  yarn clean && yarn build-native
-  yarn clean && yarn build-native --CDLLAMA_VULKAN=1 --CDVULKAN_SDK="$(realpath 'externals/arm64-Vulkan-SDK')" --CDVARIANT=vulkan
+  yarn clean && yarn build-native --CDGGML_NATIVE=OFF
+  yarn clean && yarn build-native --CDGGML_NATIVE=OFF --CDLLAMA_VULKAN=1 --CDVULKAN_SDK="$(realpath 'externals/arm64-Vulkan-SDK')" --CDVARIANT=vulkan
 fi

From f084d66314b7ff88bf66d94ffe438f0e22773713 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <developer@jhen.me>
Date: Sun, 19 Jan 2025 18:25:25 +0800
Subject: [PATCH 2/6] feat: build cuda bin for linux (x86/arm) (#77)

* ci: use new linux arm runner for build build-linux-arm64

* ci: install yarn by self

* ci: enable linux cuda build

* feat: add CMAKE_CUDA_ARCHITECTURES

* feat: update scripts/prepare-linux.sh

* feat: set GGML_NATIVE=OFF
---
 .github/workflows/build-release.yml |  6 ++++++
 lib/binding.ts                      |  2 +-
 scripts/build-linux.sh              | 21 +++++++++++++++++++++
 scripts/prepare-linux.sh            |  5 ++++-
 4 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index 2d73546..44eecd6 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -20,7 +20,10 @@ on:
 jobs:
   build-linux-x86_64:
     runs-on: ubuntu-22.04
+    container: nvidia/cuda:12.6.2-devel-ubuntu22.04
     steps:
+      - name: Upgrade git
+        run: apt-get update && apt-get install -y git
       - uses: actions/checkout@v4
         with:
           submodules: "true"
@@ -55,7 +58,10 @@ jobs:
 
   build-linux-arm64:
     runs-on: ubuntu-22.04-arm
+    container: nvidia/cuda:12.6.2-devel-ubuntu22.04
     steps:
+      - name: Upgrade git
+        run: apt-get update && apt-get install -y git
       - uses: actions/checkout@v4
         with:
           submodules: "true"
diff --git a/lib/binding.ts b/lib/binding.ts
index 83e60fc..def14bf 100644
--- a/lib/binding.ts
+++ b/lib/binding.ts
@@ -117,7 +117,7 @@ export interface Module {
   LlamaContext: LlamaContext
 }
 
-export type LibVariant = 'default' | 'vulkan'
+export type LibVariant = 'default' | 'vulkan' | 'cuda'
 
 const setupEnv = (variant?: string) => {
   const postfix = variant ? `-${variant}` : ''
diff --git a/scripts/build-linux.sh b/scripts/build-linux.sh
index ccf4818..ec8c3c3 100755
--- a/scripts/build-linux.sh
+++ b/scripts/build-linux.sh
@@ -9,7 +9,28 @@ ARCH=${ARCH:-${1:-$(uname -m)}}
 if [ $ARCH == "x86_64" ]; then
   yarn clean && yarn build-native
   yarn clean && yarn build-native --CDLLAMA_VULKAN=1 --CDVARIANT=vulkan
+
+  # Check CUDA is available
+  if [ -f /usr/local/cuda/bin/nvcc ]; then
+    yarn clean && yarn build-native \
+      --CDLLAMA_CUDA=1 \
+      --CDVARIANT=cuda \
+      --CDCMAKE_CUDA_ARCHITECTURES=89 # > GeForce RTX 40 series
+  else
+    echo "CUDA is not available, skipping CUDA build"
+  fi
 else
   yarn clean && yarn build-native --CDGGML_NATIVE=OFF
   yarn clean && yarn build-native --CDGGML_NATIVE=OFF --CDLLAMA_VULKAN=1 --CDVULKAN_SDK="$(realpath 'externals/arm64-Vulkan-SDK')" --CDVARIANT=vulkan
+
+  # Check CUDA is available
+  if [ -f /usr/local/cuda/bin/nvcc ]; then
+    yarn clean && yarn build-native \
+      --CDLLAMA_CUDA=1 \
+      --CDVARIANT=cuda \
+      --CDGGML_NATIVE=OFF \
+      --CDCMAKE_CUDA_ARCHITECTURES=87 # > Jetson Orin series
+  else
+    echo "CUDA is not available, skipping CUDA build"
+  fi
 fi
diff --git a/scripts/prepare-linux.sh b/scripts/prepare-linux.sh
index 4702306..24f8e7d 100755
--- a/scripts/prepare-linux.sh
+++ b/scripts/prepare-linux.sh
@@ -14,12 +14,15 @@ export DEBIAN_FRONTEND=noninteractive
 
 ARCH=${ARCH:-${1:-$(uname -m)}}
 
+run_as_root apt-get update
+run_as_root apt-get install -qy lsb-release wget
+
 if [ $ARCH == "x86_64" ]; then
   DISTRO=$(lsb_release -c -s)
   wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | run_as_root tee /etc/apt/trusted.gpg.d/lunarg.asc
   run_as_root wget -qO /etc/apt/sources.list.d/lunarg-vulkan-1.3.280-$DISTRO.list https://packages.lunarg.com/vulkan/1.3.280/lunarg-vulkan-1.3.280-$DISTRO.list
   run_as_root apt-get update
-  run_as_root apt-get install -qy vulkan-sdk
+  run_as_root apt-get install -qy vulkan-sdk cmake pkg-config build-essential  libx11-xcb-dev libxkbcommon-dev libwayland-dev libxrandr-dev
 else
   run_as_root apt-get update
   run_as_root apt-get install -qy curl gnupg2

From 6fec952b08111c51f82f12e5be247d3a79703f42 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <developer@jhen.me>
Date: Sun, 19 Jan 2025 18:29:28 +0800
Subject: [PATCH 3/6] chore: release v0.3.7

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index a27173b..c5cffd9 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.3.6",
+  "version": "0.3.7",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {

From d58de1f193bea4f089ee35c0da25f53567d9e495 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <developer@jhen.me>
Date: Mon, 20 Jan 2025 12:07:05 +0800
Subject: [PATCH 4/6] docs(readme): update

---
 README.md | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0b492a9..a365c3b 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,23 @@
 [![NPM Version](https://img.shields.io/npm/v/%40fugood%2Fllama.node)](https://www.npmjs.com/package/@fugood/llama.node)
 ![NPM Downloads](https://img.shields.io/npm/dw/%40fugood%2Fllama.node)
 
-Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
+Another Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp) that keeps its API as close as possible to that of [llama.rn](https://github.com/mybigday/llama.rn).
 
-[llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
+- [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
+- [llama.rn](https://github.com/mybigday/llama.rn): React Native binding of llama.cpp
+
+## Platform Support
+
+- macOS
+  - arm64: CPU and Metal GPU acceleration
+  - x86_64: CPU only
+- Windows (x86_64 and arm64)
+  - CPU
+  - GPU acceleration via Vulkan
+- Linux (x86_64 and arm64)
+  - CPU
+  - GPU acceleration via Vulkan
+  - GPU acceleration via CUDA
 
 ## Installation
 

From 2dced8026105805ff02e7d1e7ffb36c5c9251da5 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <developer@jhen.me>
Date: Mon, 20 Jan 2025 12:09:07 +0800
Subject: [PATCH 5/6] docs(readme): update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a365c3b..5743442 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ console.log('Result:', text)
 
 - [x] `default`: General usage, not support GPU except macOS (Metal)
 - [x] `vulkan`: Support GPU Vulkan (Windows/Linux), but some scenario might unstable
+- [x] `cuda`: Support GPU CUDA (Linux), but only for limited compute capabilities (x86_64: 8.9, arm64: 8.7)
 
 ## License
 

From 2cbf1aaed3a5d8e181c65efe511a8b67e6832b55 Mon Sep 17 00:00:00 2001
From: Jhen-Jie Hong <developer@jhen.me>
Date: Mon, 20 Jan 2025 12:49:04 +0800
Subject: [PATCH 6/6] feat: sync llama.cpp (#69)

* feat: sync llama.cpp

* fix: patch

* feat: sync llama.cpp
---
 scripts/llama.cpp.patch               | 54 +++++++++++++--------------
 src/EmbeddingWorker.cpp               |  7 ++--
 src/LlamaCompletionWorker.cpp         |  6 ++-
 src/LlamaContext.cpp                  | 10 ++---
 src/common.hpp                        | 17 +++------
 src/llama.cpp                         |  2 +-
 test/__snapshots__/index.test.ts.snap |  2 +-
 7 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch
index 0667a20..fc292df 100644
--- a/scripts/llama.cpp.patch
+++ b/scripts/llama.cpp.patch
@@ -1,5 +1,29 @@
+diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
+index 451826d5..a85ac028 100644
+--- a/src/llama.cpp/common/common.cpp
++++ b/src/llama.cpp/common/common.cpp
+@@ -1043,6 +1043,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+     if (params.n_gpu_layers != -1) {
+         mparams.n_gpu_layers = params.n_gpu_layers;
+     }
++    mparams.vocab_only      = params.vocab_only;
+     mparams.main_gpu        = params.main_gpu;
+     mparams.split_mode      = params.split_mode;
+     mparams.tensor_split    = params.tensor_split;
+diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
+index 3bcc637c..19ae7dad 100644
+--- a/src/llama.cpp/common/common.h
++++ b/src/llama.cpp/common/common.h
+@@ -189,6 +189,7 @@ struct common_params_vocoder {
+ };
+ 
+ struct common_params {
++    bool vocab_only               = false;
+     int32_t n_predict             =    -1; // new tokens to predict
+     int32_t n_ctx                 =  4096; // context size
+     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index 683b90af..e1bf104c 100644
+index 6b3641c4..6d6cb27f 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -80,7 +80,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -9,29 +33,5 @@ index 683b90af..e1bf104c 100644
 -            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
 +            list(APPEND ARCH_FLAGS /arch:armv8.7)
          else()
-             check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-             if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index 1d2bd932..b5007c66 100644
---- a/src/llama.cpp/common/common.h
-+++ b/src/llama.cpp/common/common.h
-@@ -183,6 +183,7 @@ struct common_params_vocoder {
- };
- 
- struct common_params {
-+    bool vocab_only               = false;
-     int32_t n_predict             =    -1; // new tokens to predict
-     int32_t n_ctx                 =  4096; // context size
-     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
-diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index 20be9291..1bedc55d 100644
---- a/src/llama.cpp/common/common.cpp
-+++ b/src/llama.cpp/common/common.cpp
-@@ -1017,6 +1017,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
-     if (params.n_gpu_layers != -1) {
-         mparams.n_gpu_layers = params.n_gpu_layers;
-     }
-+    mparams.vocab_only      = params.vocab_only;
-     mparams.rpc_servers     = params.rpc_servers.c_str();
-     mparams.main_gpu        = params.main_gpu;
-     mparams.split_mode      = params.split_mode;
+             check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+             if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
diff --git a/src/EmbeddingWorker.cpp b/src/EmbeddingWorker.cpp
index 0ad8d35..86da8d2 100644
--- a/src/EmbeddingWorker.cpp
+++ b/src/EmbeddingWorker.cpp
@@ -9,10 +9,11 @@ void EmbeddingWorker::Execute() {
   llama_kv_cache_clear(_sess->context());
   auto tokens = ::common_tokenize(_sess->context(), _text, true);
   // add SEP if not present
-  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
-    tokens.push_back(llama_token_sep(_sess->model()));
+  auto vocab = llama_model_get_vocab(_sess->model());
+  if (tokens.empty() || tokens.back() != llama_vocab_sep(vocab)) {
+    tokens.push_back(llama_vocab_sep(vocab));
   }
-  const int n_embd = llama_n_embd(_sess->model());
+  const int n_embd = llama_model_n_embd(_sess->model());
   do {
     auto ctx = _sess->context();
     int ret =
diff --git a/src/LlamaCompletionWorker.cpp b/src/LlamaCompletionWorker.cpp
index 2ff96d3..e21f310 100644
--- a/src/LlamaCompletionWorker.cpp
+++ b/src/LlamaCompletionWorker.cpp
@@ -59,7 +59,9 @@ void LlamaCompletionWorker::Execute() {
   size_t n_cur = 0;
   size_t n_input = 0;
   const auto model = _sess->model();
-  const bool add_bos = llama_add_bos_token(model);
+  auto vocab = llama_model_get_vocab(model);
+
+  const bool add_bos = llama_vocab_get_add_bos(vocab);
   auto ctx = _sess->context();
 
   auto sparams = llama_sampler_chain_default_params();
@@ -130,7 +132,7 @@ void LlamaCompletionWorker::Execute() {
       });
     }
     // is it an end of generation?
-    if (llama_token_is_eog(model, new_token_id)) {
+    if (llama_vocab_is_eog(vocab, new_token_id)) {
       break;
     }
     // check for stop words
diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp
index eb9ee8d..cd73027 100644
--- a/src/LlamaContext.cpp
+++ b/src/LlamaContext.cpp
@@ -140,14 +140,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   llama_backend_init();
   llama_numa_init(params.numa);
 
-  auto result = common_init_from_params(params);
+  auto sess = std::make_shared<LlamaSession>(params);
 
-  if (result.model == nullptr || result.context == nullptr) {
+  if (sess->model() == nullptr || sess->context() == nullptr) {
     Napi::TypeError::New(env, "Failed to load model")
         .ThrowAsJavaScriptException();
   }
 
-  _sess = std::make_shared<LlamaSession>(result.model, result.context, params);
+  _sess = sess;
   _info = common_params_get_system_info(params);
 }
 
@@ -162,8 +162,8 @@ bool validateModelChatTemplate(const struct llama_model * model) {
     int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
     if (res >= 0) {
         llama_chat_message chat[] = {{"user", "test"}};
-        std::string tmpl = std::string(model_template.data(), model_template.size());
-        int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+        const char * tmpl = llama_model_chat_template(model);
+        int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
         return chat_res > 0;
     }
     return res > 0;
diff --git a/src/common.hpp b/src/common.hpp
index ff2230d..f5cd71f 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -11,8 +11,6 @@
 #include <tuple>
 #include <vector>
 
-typedef std::unique_ptr<llama_model, decltype(&llama_free_model)> LlamaCppModel;
-typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
 typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)>
     LlamaCppSampling;
 typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;
@@ -47,17 +45,17 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
 
 class LlamaSession {
 public:
-  LlamaSession(llama_model *model, llama_context *ctx, common_params params)
-      : model_(LlamaCppModel(model, llama_free_model)),
-        ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
+  LlamaSession(common_params params)
+      : params_(params) {
+    llama_init_ = common_init_from_params(params);
     tokens_.reserve(params.n_ctx);
   }
 
   ~LlamaSession() { dispose(); }
 
-  inline llama_context *context() { return ctx_.get(); }
+  inline llama_context *context() { return llama_init_.context.get(); }
 
-  inline llama_model *model() { return model_.get(); }
+  inline llama_model *model() { return llama_init_.model.get(); }
 
   inline std::vector<llama_token> *tokens_ptr() { return &tokens_; }
 
@@ -72,13 +70,10 @@ class LlamaSession {
   void dispose() {
     std::lock_guard<std::mutex> lock(mutex);
     tokens_.clear();
-    ctx_.reset();
-    model_.reset();
   }
 
 private:
-  LlamaCppModel model_;
-  LlamaCppContext ctx_;
+  common_init_result llama_init_;
   const common_params params_;
   std::vector<llama_token> tokens_{};
   std::mutex mutex;
diff --git a/src/llama.cpp b/src/llama.cpp
index 0a11f8b..92bc493 160000
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1 +1 @@
-Subproject commit 0a11f8b7b5c39fdf6e91ef9674bc68ff08681af7
+Subproject commit 92bc493917d43b83e592349e138b54c90b1c3ea7
diff --git a/test/__snapshots__/index.test.ts.snap b/test/__snapshots__/index.test.ts.snap
index e9329ff..902f77e 100644
--- a/test/__snapshots__/index.test.ts.snap
+++ b/test/__snapshots__/index.test.ts.snap
@@ -444,7 +444,7 @@ exports[`works fine with vocab_only: empty result 1`] = `
 
 exports[`works fine with vocab_only: model info 1`] = `
 {
-  "desc": "llama ?B all F32",
+  "desc": "",
   "isChatTemplateSupported": false,
   "metadata": {
     "general.architecture": "llama",