diff --git a/model.py b/model.py
index 316bf1c6..05c46402 100644
--- a/model.py
+++ b/model.py
@@ -72,9 +72,9 @@ def __init__(self, model_config_path):
         self.max_input_len = 2048  # Maximum length of input IDs in a single forward pass. Sequences longer than this will be processed in multiple steps
         self.max_attention_size = 2048**2  # Sequences will be processed in chunks to keep the size of the attention weights matrix <= this
         self.compress_pos_emb = 1.0  # Increase to compress positional embeddings applied to sequence
+        self.alpha_value = 1.0  # Alpha value for NTK RoPE scaling. Similar to compress_pos_emb, higher values increase context length but add perplexity
         self.gpu_peer_fix = False  # Apparently Torch can have problems transferring tensors directly one GPU to another sometimes. Enable this to expliticly move tensors via system RAM instead, where needed
         self.auto_map = None  # List of floats with memory allocation in GB, per CUDA device, overrides device_map
-
         # Tuning
 
         self.matmul_recons_thd = 8
@@ -109,6 +109,8 @@ def set_auto_map(self, map_string):
 
         if map_string is None: self.auto_map = None
         else: self.auto_map = [float(alloc) for alloc in map_string.split(",")]
 
+    def calculate_rotary_embedding_base(self):
+        self.rotary_embedding_base = self.rotary_embedding_base * self.alpha_value ** (self.head_dim / (self.head_dim-2))
 
 # 4-bit linear layer implementation
diff --git a/model_init.py b/model_init.py
index d61dbac9..68901992 100644
--- a/model_init.py
+++ b/model_init.py
@@ -13,6 +13,7 @@ def add_args(parser):
     parser.add_argument("-gs", "--gpu_split", type = str, help = "Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. -gs 20,7,7")
     parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length", default = 2048)
     parser.add_argument("-cpe", "--compress_pos_emb", type = float, help = "Compression factor for positional embeddings", default = 1.0)
+    parser.add_argument("-a", "--alpha", type = float, help = "Alpha for context size extension via NTK RoPE scaling", default = 1.0)
 
     parser.add_argument("-gpfix", "--gpu_peer_fix", action = "store_true", help = "Prevent direct copies of data between GPUs")
 
@@ -79,6 +80,9 @@ def print_options(args, extra_options = None):
     if args.compress_pos_emb != 1.0: print(f" -- RoPE compression factor: {args.compress_pos_emb}")
 
+    if args.alpha != 1.0: print(f" -- RoPE alpha factor: {args.alpha}")
+
     print(f" -- Tuning:")
     print(f" -- --matmul_recons_thd: {args.matmul_recons_thd}" + (" (disabled)" if args.matmul_recons_thd == 0 else ""))
     print(f" -- --fused_mlp_thd: {args.fused_mlp_thd}" + (" (disabled)" if args.fused_mlp_thd == 0 else ""))
@@ -105,6 +109,8 @@ def make_config(args):
     config.compress_pos_emb = args.compress_pos_emb
     config.set_auto_map(args.gpu_split)
     config.gpu_peer_fix = args.gpu_peer_fix
+    config.alpha_value = args.alpha
+    config.calculate_rotary_embedding_base()
 
     config.matmul_recons_thd = args.matmul_recons_thd
     config.fused_mlp_thd = args.fused_mlp_thd
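For context, the new `calculate_rotary_embedding_base()` implements the NTK-aware RoPE trick: the rotary base is rescaled by `alpha ** (head_dim / (head_dim - 2))`, which stretches the low-frequency rotary components so positions beyond the trained context remain distinguishable. The snippet below is a minimal standalone sketch of that effect; `head_dim = 128` and a base of `10000` are assumed Llama defaults used only for illustration, not values read from the patch.

```python
# Standalone illustration (not part of the patch) of the NTK alpha scaling.
# Assumed values: head_dim = 128 and base = 10000, i.e. typical Llama defaults.

head_dim = 128
base = 10000.0
alpha = 2.0

# Same formula as calculate_rotary_embedding_base() in the diff above.
scaled_base = base * alpha ** (head_dim / (head_dim - 2))
print(f"scaled rotary base: {scaled_base:.1f}")  # ~20221.3 for alpha = 2.0

# RoPE inverse frequencies before and after scaling. Raising the base slows the
# lowest-frequency components the most, which is what extends usable context.
inv_freq = [base ** (-i / head_dim) for i in range(0, head_dim, 2)]
inv_freq_scaled = [scaled_base ** (-i / head_dim) for i in range(0, head_dim, 2)]
print(f"lowest frequency: {inv_freq[-1]:.3e} -> {inv_freq_scaled[-1]:.3e}")
```

Note the ordering in `make_config()`: `config.alpha_value` is assigned from `args.alpha` before `config.calculate_rotary_embedding_base()` is called, since the method reads `alpha_value` to rescale `rotary_embedding_base` in place.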