From 0ba5f34689b8d4bcab9b0d0f91896d1dd0fc973d Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Thu, 29 Jun 2023 18:21:26 -0400
Subject: [PATCH 1/6] (Experimental) Add support to NTK RoPE scaling

---
 model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/model.py b/model.py
index 316bf1c6..cddd2f72 100644
--- a/model.py
+++ b/model.py
@@ -72,9 +72,13 @@ def __init__(self, model_config_path):
         self.max_input_len = 2048            # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
         self.max_attention_size = 2048**2    # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0          # Increase to compress positional embeddings applied to sequence
+        self.alpha_value = 1.0               # Similar to RoPE, higher is more perplex but more ctx
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
 
+        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
+
+
         # Tuning
 
         self.matmul_recons_thd = 8

From 555ae298ecb3367e67c0ecf9ec7a64466f2cb7c6 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Thu, 29 Jun 2023 18:22:43 -0400
Subject: [PATCH 2/6] (Experimental) Add support to NTK RoPE scaling

---
 model_init.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/model_init.py b/model_init.py
index d61dbac9..96732286 100644
--- a/model_init.py
+++ b/model_init.py
@@ -13,6 +13,7 @@ def add_args(parser):
     parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. -gs 20,7,7")
     parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048)
     parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0)
+    parser.add_argument("-a", "--alpha", type = float, help = "alpha for context size extension via embedding extension", default = 1.0)
 
     parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs")
 
@@ -79,6 +80,9 @@ def print_options(args, extra_options = None):
 
     if args.compress_pos_emb != 1.0: print(f" -- RoPE compression factor: {args.compress_pos_emb}")
 
+    if args.alpha != 1.0:
+        print(f" -- RoPE alpha factor: {args.alpha}")
+
     print(f" -- Tuning:")
     print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else ""))
     print(f" -- --fused_mlp_thd: {args.fused_mlp_thd}" + (" (disabled)" if args.fused_mlp_thd == 0 else ""))
@@ -105,6 +109,7 @@ def make_config(args):
     config.compress_pos_emb = args.compress_pos_emb
     config.set_auto_map(args.gpu_split)
     config.gpu_peer_fix = args.gpu_peer_fix
+    config.alpha_value = args.alpha
 
     config.matmul_recons_thd = args.matmul_recons_thd
     config.fused_mlp_thd = args.fused_mlp_thd

From 5db2f3a6b1c3ec8f988e91d1a2bd5c7cbb724fa2 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:38:38 -0400
Subject: [PATCH 3/6] Revert alpha value to be set manually

---
 model.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/model.py b/model.py
index cddd2f72..d191b4ba 100644
--- a/model.py
+++ b/model.py
@@ -72,13 +72,10 @@ def __init__(self, model_config_path):
         self.max_input_len = 2048            # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
         self.max_attention_size = 2048**2    # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0          # Increase to compress positional embeddings applied to sequence
-        self.alpha_value = 1.0               # Similar to RoPE, higher is more perplex but more ctx
+        self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. For now, the value has to be set manually, since from the parameters, the setting doesn't seem to apply.
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
 
-        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
-
-
         # Tuning
 
         self.matmul_recons_thd = 8

From 720dc6dc98303e9433fbab5ecf02d8e217ce47e4 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:45:53 -0400
Subject: [PATCH 4/6] Apply correctly the value when using -a

---
 model.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/model.py b/model.py
index d191b4ba..bc8a9841 100644
--- a/model.py
+++ b/model.py
@@ -72,7 +72,7 @@ def __init__(self, model_config_path):
         self.max_input_len = 2048            # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
         self.max_attention_size = 2048**2    # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0          # Increase to compress positional embeddings applied to sequence
-        self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. For now, the value has to be set manually, since from the parameters, the setting doesn't seem to apply.
+        self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. Similar to compress_pos_emb, higher values increaste ctx but add Perplexity.
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
         self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
@@ -110,6 +110,8 @@ def set_auto_map(self, map_string):
 
         if map_string is None: self.auto_map = None
         else: self.auto_map = [float(alloc) for alloc in map_string.split(",")]
 
+    def calculate_rotary_embedding_base(self):
+        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
 
 # 4-bit linear layer implementation

From 60c2abe85f85b5716b78fe6966b90448c9e56da5 Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:46:19 -0400
Subject: [PATCH 5/6] Add calculate rotary embedding base correctly on model_init.py

---
 model_init.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model_init.py b/model_init.py
index 96732286..68901992 100644
--- a/model_init.py
+++ b/model_init.py
@@ -110,6 +110,7 @@ def make_config(args):
     config.set_auto_map(args.gpu_split)
     config.gpu_peer_fix = args.gpu_peer_fix
     config.alpha_value = args.alpha
+    config.calculate_rotary_embedding_base()
 
     config.matmul_recons_thd = args.matmul_recons_thd
     config.fused_mlp_thd = args.fused_mlp_thd

From ea0deebd59d7a4f9800ea6895ee89be66343782c Mon Sep 17 00:00:00 2001
From: Panchovix
Date: Fri, 30 Jun 2023 16:48:31 -0400
Subject: [PATCH 6/6] Remove obsolete code

---
 model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/model.py b/model.py
index bc8a9841..05c46402 100644
--- a/model.py
+++ b/model.py
@@ -75,7 +75,6 @@ def __init__(self, model_config_path):
         self.alpha_value = 1.0               # Alpha value for NTK RoPE scaling. Similar to compress_pos_emb, higher values increaste ctx but add Perplexity.
         self.gpu_peer_fix = False            # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None                 # List of floats with memory allocation in GB, per CUDA device, overrides device_map
-        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
 
         # Tuning
         self.matmul_recons_thd = 8
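
Note on what the series computes: after patch 6, the alpha factor stored in alpha_value (set on the config in model.py, or via the -a/--alpha flag through model_init.make_config) only takes effect when calculate_rotary_embedding_base() is called, which rescales rotary_embedding_base by alpha_value ** (head_dim / (head_dim - 2)). The short standalone Python sketch below reproduces that arithmetic outside the library to show what the flag does to the RoPE base; the base of 10000 and head_dim of 128 are typical LLaMA values assumed here for illustration, not values taken from these patches.

# Minimal sketch of the NTK RoPE scaling applied by calculate_rotary_embedding_base().
# Assumptions (not from the patches): base rotary_embedding_base = 10000, head_dim = 128.

def ntk_scaled_base(base: float, head_dim: int, alpha: float) -> float:
    # Same formula as the patch: scale the rotary base by alpha ** (head_dim / (head_dim - 2))
    return base * alpha ** (head_dim / (head_dim - 2))

if __name__ == "__main__":
    base, head_dim = 10000.0, 128
    for alpha in (1.0, 2.0, 4.0):
        print(f"alpha={alpha}: rotary base {base:.0f} -> {ntk_scaled_base(base, head_dim, alpha):.0f}")
    # alpha=1.0 leaves the base at 10000; alpha=2.0 gives roughly 20221 and alpha=4.0 roughly 40890,
    # stretching the RoPE frequencies so the model can attend over a longer context.

With the patches applied, a script that builds its config through model_init.add_args() and make_config() would typically pair the flag with a longer sequence length, e.g. "-l 4096 -a 2"; the exact invocation depends on the calling script.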