From 0ba5f34689b8d4bcab9b0d0f91896d1dd0fc973d Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Thu, 29 Jun 2023 18:21:26 -0400
Subject: [PATCH 1/6] (Experimental) Add support to NTK RoPE scaling

---
 model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/model.py b/model.py
index 316bf1c6..cddd2f72 100644
--- a/model.py
+++ b/model.py
@@ -72,9 +72,13 @@ def __init__(self, model_config_path):
         self.max_input_len = 2048            # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
         self.max_attention_size = 2048**2    # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0          # Increase to compress positional embeddings applied to sequence
+        self.alpha_value = 1.0               # Similar to RoPE, higher is more perplex but more ctx
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
 
+        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
+
+
         # Tuning
 
         self.matmul_recons_thd = 8

From 555ae298ecb3367e67c0ecf9ec7a64466f2cb7c6 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Thu, 29 Jun 2023 18:22:43 -0400
Subject: [PATCH 2/6] (Experimental) Add support to NTK RoPE scaling

---
 model_init.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/model_init.py b/model_init.py
index d61dbac9..96732286 100644
--- a/model_init.py
+++ b/model_init.py
@@ -13,6 +13,7 @@ def add_args(parser):
     parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. -gs 20,7,7")
     parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048)
     parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0)
+    parser.add_argument("-a", "--alpha", type = float, help = "alpha for context size extension via embedding extension", default = 1.0)
 
     parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs")
 
@@ -79,6 +80,9 @@ def print_options(args, extra_options = None):
 
     if args.compress_pos_emb != 1.0: print(f" -- RoPE compression factor: {args.compress_pos_emb}")
 
+    if args.alpha != 1.0:
+        print(f" -- RoPE alpha factor: {args.alpha}")
+
     print(f" -- Tuning:")
     print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else ""))
     print(f" -- --fused_mlp_thd: {args.fused_mlp_thd}" + (" (disabled)" if args.fused_mlp_thd == 0 else ""))
@@ -105,6 +109,7 @@ def make_config(args):
     config.compress_pos_emb = args.compress_pos_emb
     config.set_auto_map(args.gpu_split)
     config.gpu_peer_fix = args.gpu_peer_fix
+    config.alpha_value = args.alpha
 
     config.matmul_recons_thd = args.matmul_recons_thd
     config.fused_mlp_thd = args.fused_mlp_thd

From 5db2f3a6b1c3ec8f988e91d1a2bd5c7cbb724fa2 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:38:38 -0400
Subject: [PATCH 3/6] Revert alpha value to be set manually

---
 model.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/model.py b/model.py
index cddd2f72..d191b4ba 100644
--- a/model.py
+++ b/model.py
@@ -72,13 +72,10 @@ def __init__(self, model_config_path):
         self.max_input_len = 2048            # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
         self.max_attention_size = 2048**2    # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0          # Increase to compress positional embeddings applied to sequence
-        self.alpha_value = 1.0               # Similar to RoPE, higher is more perplex but more ctx
+        self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. For now, the value has to be set manually, since from the parameters, the setting doesn't seem to apply.
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
 
-        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
-
-
         # Tuning
 
         self.matmul_recons_thd = 8

From 720dc6dc98303e9433fbab5ecf02d8e217ce47e4 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:45:53 -0400
Subject: [PATCH 4/6] Apply correctly the value when using -a

---
 model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/model.py b/model.py
index d191b4ba..bc8a9841 100644
--- a/model.py
+++ b/model.py
@@ -72,7 +72,7 @@ def __init__(self, model_config_path):
         self.max_input_len = 2048            # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
         self.max_attention_size = 2048**2    # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0          # Increase to compress positional embeddings applied to sequence
-        self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. For now, the value has to be set manually, since from the parameters, the setting doesn't seem to apply.
+        self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. Similar to compress_pos_emb, higher values increaste ctx but add Perplexity.
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
         self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
@@ -110,6 +110,8 @@ def set_auto_map(self, map_string):
 
         if map_string is None: self.auto_map = None
         else: self.auto_map = [float(alloc) for alloc in map_string.split(",")]
 
+    def calculate_rotary_embedding_base(self):
+        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
 
 # 4-bit linear layer implementation

From 60c2abe85f85b5716b78fe6966b90448c9e56da5 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:46:19 -0400
Subject: [PATCH 5/6] Add calculate rotary embedding base correctly on model_init.py

---
 model_init.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model_init.py b/model_init.py
index 96732286..68901992 100644
--- a/model_init.py
+++ b/model_init.py
@@ -110,6 +110,7 @@ def make_config(args):
     config.set_auto_map(args.gpu_split)
     config.gpu_peer_fix = args.gpu_peer_fix
     config.alpha_value = args.alpha
+    config.calculate_rotary_embedding_base()
 
     config.matmul_recons_thd = args.matmul_recons_thd
     config.fused_mlp_thd = args.fused_mlp_thd

From ea0deebd59d7a4f9800ea6895ee89be66343782c Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:48:31 -0400
Subject: [PATCH 6/6] Remove obsolete code

---
 model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/model.py b/model.py
index bc8a9841..05c46402 100644
--- a/model.py
+++ b/model.py
@@ -75,7 +75,6 @@ def __init__(self, model_config_path):
         self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. Similar to compress_pos_emb, higher values increaste ctx but add Perplexity.
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
-        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
 
         # Tuning
         self.matmul_recons_thd = 8
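
Note on what the series computes: after patch 6, the alpha factor stored in alpha_value (set on the config in model.py, or via the -a/--alpha flag through model_init.make_config) only takes effect when calculate_rotary_embedding_base() is called, which rescales rotary_embedding_base by alpha_value ** (head_dim / (head_dim - 2)). The short standalone Python sketch below reproduces that arithmetic outside the library to show what the flag does to the RoPE base; the base of 10000 and head_dim of 128 are typical LLaMA values assumed here for illustration, not values taken from these patches.

# Minimal sketch of the NTK RoPE scaling applied by calculate_rotary_embedding_base().
# Assumptions (not from the patches): base rotary_embedding_base = 10000, head_dim = 128.

def ntk_scaled_base(base: float, head_dim: int, alpha: float) -> float:
    # Same formula as the patch: scale the rotary base by alpha ** (head_dim / (head_dim - 2))
    return base * alpha ** (head_dim / (head_dim - 2))

if __name__ == "__main__":
    base, head_dim = 10000.0, 128
    for alpha in (1.0, 2.0, 4.0):
        print(f"alpha={alpha}: rotary base {base:.0f} -> {ntk_scaled_base(base, head_dim, alpha):.0f}")
    # alpha=1.0 leaves the base at 10000; alpha=2.0 gives roughly 20221 and alpha=4.0 roughly 40890,
    # stretching the RoPE frequencies so the model can attend over a longer context.

With the patches applied, a script that builds its config through model_init.add_args() and make_config() would typically pair the flag with a longer sequence length, e.g. "-l 4096 -a 2"; the exact invocation depends on the calling script.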