(Experimental) Add support to NTK RoPE scaling (#118)
* (Experimental) Add support to NTK RoPE scaling

* (Experimental) Add support to NTK RoPE scaling

* Revert alpha value to be set manually

* Apply correctly the value when using -a

* Add calculate rotary embedding base correctly on model_init.py

* Remove obsolete code
Panchovix authored Jul 1, 2023
1 parent 93d50d1 commit 8229d55
Showing 2 changed files with 9 additions and 1 deletion.
4 changes: 3 additions & 1 deletion model.py
@@ -72,9 +72,9 @@ def __init__(self, model_config_path):
self.max_input_len = 2048 # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
self.max_attention_size = 2048**2 # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
self.compress_pos_emb = 1.0 # Increase to compress positional embeddings applied to sequence
self.alpha_value = 1.0 # Alpha value for NTK RoPE scaling. Similar to compress_pos_emb; higher values increase context length but add perplexity.
self.gpu_peer_fix = False # Apparently Torch can sometimes have problems transferring tensors directly from one GPU to another. Enable this to explicitly move tensors via system RAM instead, where needed
self.auto_map = None # List of floats with memory allocation in GB, per CUDA device, overrides device_map

# Tuning

self.matmul_recons_thd = 8
@@ -109,6 +109,8 @@ def set_auto_map(self, map_string):
if map_string is None: self.auto_map = None
else: self.auto_map = [float(alloc) for alloc in map_string.split(",")]

def calculate_rotary_embedding_base(self):
self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))

# 4-bit linear layer implementation

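A minimal sketch (not part of the commit) of the NTK-aware base adjustment that calculate_rotary_embedding_base() applies above; head_dim = 128 and a rotary base of 10000 are assumed LLaMA-style defaults for illustration:

def ntk_scaled_base(base: float, alpha: float, head_dim: int) -> float:
    # Raising the rotary base by alpha ** (d / (d - 2)) stretches the RoPE
    # wavelengths, trading a small perplexity increase for a longer usable context.
    return base * alpha ** (head_dim / (head_dim - 2))

# Assumed defaults: base 10000, head_dim 128.
for alpha in (1.0, 2.0, 4.0):
    print(f"alpha={alpha}: base={ntk_scaled_base(10000.0, alpha, 128):.0f}")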
6 changes: 6 additions & 0 deletions model_init.py
@@ -13,6 +13,7 @@ def add_args(parser):
parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. -gs 20,7,7")
parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048)
parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0)
parser.add_argument("-a", "--alpha", type = float, help = "alpha for context size extension via embedding extension", default = 1.0)

parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs")

@@ -79,6 +80,9 @@ def print_options(args, extra_options = None):
if args.compress_pos_emb != 1.0:
print(f" -- RoPE compression factor: {args.compress_pos_emb}")

if args.alpha != 1.0:
print(f" -- RoPE alpha factor: {args.alpha}")

print(f" -- Tuning:")
print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else ""))
print(f" -- --fused_mlp_thd: {args.fused_mlp_thd}" + (" (disabled)" if args.fused_mlp_thd == 0 else ""))
@@ -105,6 +109,8 @@ def make_config(args):
config.compress_pos_emb = args.compress_pos_emb
config.set_auto_map(args.gpu_split)
config.gpu_peer_fix = args.gpu_peer_fix
config.alpha_value = args.alpha
config.calculate_rotary_embedding_base()

config.matmul_recons_thd = args.matmul_recons_thd
config.fused_mlp_thd = args.fused_mlp_thd
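For reference, a hypothetical end-to-end sketch of how the pieces added in this commit fit together: the -a/--alpha value is stored on the config and the rotary base is rescaled before the model is loaded. Method and attribute names mirror the diff; the stand-in config class, its defaults, and the sample invocation are assumptions for illustration:

import argparse

class DemoConfig:
    # Stand-in for the config class in model.py, reduced to the fields this commit touches.
    def __init__(self):
        self.head_dim = 128                   # assumed LLaMA head dimension
        self.rotary_embedding_base = 10000.0  # assumed default rotary base
        self.alpha_value = 1.0

    def calculate_rotary_embedding_base(self):
        self.rotary_embedding_base *= self.alpha_value ** (self.head_dim / (self.head_dim - 2))

parser = argparse.ArgumentParser()
parser.add_argument("-a", "--alpha", type=float, default=1.0)
args = parser.parse_args(["-a", "2.0"])       # as if invoked with -a 2.0 on the command line

config = DemoConfig()
config.alpha_value = args.alpha
config.calculate_rotary_embedding_base()
print(config.rotary_embedding_base)           # roughly double the original base for alpha = 2.0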
