precompute rope for some number of positions
francoishernandez committed Aug 28, 2024
1 parent 3df5ef9 commit 3091a4e
Showing 2 changed files with 14 additions and 2 deletions.
3 changes: 3 additions & 0 deletions eole/modules/multi_headed_attn.py
@@ -379,6 +379,9 @@ def _prepare_inputs(
         query = shape(query, self.dim_per_head)

         if self.position_encoding_type == PositionEncodingType.Rotary:
+            start_pos = 0
+            seqlen = query.size(2)
+            position_embeddings = position_embeddings[start_pos : start_pos + seqlen].to(query.device)
             query, key = apply_rotary_emb(
                 query, key, position_embeddings, interleave=self.rotary_interleave
             )
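For illustration, a minimal standalone sketch of the slicing pattern added above: the attention layer receives a table with one row per absolute position and keeps only the rows covering the query's positions. Shapes, dtype, and the start_pos value are placeholders, not eole's actual tensors.

import torch

# Placeholder for a precomputed RoPE table with one row per absolute position
# (the real table is produced by RotaryPosition.forward in eole/modules/rope.py).
position_embeddings = torch.randn(2048, 64)

query = torch.randn(2, 8, 10, 64)  # [batch, heads, seqlen, dim_per_head]
start_pos = 0                      # first position covered by this forward pass
seqlen = query.size(2)

# Keep only the rows needed for the current positions, on the query's device,
# mirroring the slice added in _prepare_inputs above.
rope_slice = position_embeddings[start_pos : start_pos + seqlen].to(query.device)
assert rope_slice.shape == (seqlen, 64)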
13 changes: 11 additions & 2 deletions eole/modules/rope.py
@@ -62,6 +62,8 @@ def __init__(self, model_config):
         # TODO: extend with other scaling types
         if getattr(self.model_config.rope_config, "scaling_type", None) == "llama3":
             self.llama3_scaling()
+        # cache rope tensor to limit unnecessary computations
+        self.rope = None

     def llama3_scaling(self):
         """
@@ -106,7 +108,7 @@ def llama3_scaling(self):
         inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
         self.inv_freq = inv_freq_llama

-    def forward(self, emb, step=0, device=None, offset=0):
+    def forward(self, emb, step=0, device=None, offset=0, prefetch=1024):
         """
         Computes the rotary position embeddings for a given input.
@@ -128,9 +130,15 @@ def forward(self, emb, step=0, device=None, offset=0):
            - The output tensor's dimensions are `[maxseqlen, dim]`, where `dim` is
              twice the size of the original inverse frequency tensor (`inv_freq`).
         """
-        maxseqlen = emb.size(1)
         if step is None:
             step = 0
+        maxseqlen = emb.size(1)
+        # This could probably be a bit cleaner/homogenized with the offset case
+        if self.rope is not None:
+            if self.rope.size(0) >= max(offset + step, 0) + maxseqlen:
+                return self.rope
+            else:
+                maxseqlen = maxseqlen + prefetch
         tmax = torch.arange(
             max(offset + step, 0) + maxseqlen, device=self.inv_freq.device
         )
@@ -143,4 +151,5 @@
         # cos = rope[:, : rope.size(1) // 2].real.contiguous().half()
         # sin = rope[:, : rope.size(1) // 2].imag.contiguous().half()
         # return rope, cos, sin
+        self.rope = rope
         return rope
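
Putting the two hunks together, the caching behaviour amounts to the following standalone sketch. The class name, dimensions, and the final outer-product/concatenation step are illustrative assumptions; the real forward also builds the cos/sin representation and handles device placement and the offset case. The point of prefetch is that only the first call past the cached range pays the recompute; subsequent single-token decoding steps reuse the same tensor.

import torch

class CachedRope:
    """Illustrative sketch of the prefetch-style RoPE cache introduced above."""

    def __init__(self, dim=64, base=10000.0):
        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.rope = None  # cached [n_positions, dim] table

    def forward(self, emb, step=0, offset=0, prefetch=1024):
        if step is None:
            step = 0
        maxseqlen = emb.size(1)
        if self.rope is not None:
            if self.rope.size(0) >= max(offset + step, 0) + maxseqlen:
                # cached table already covers every position we need
                return self.rope
            # cache too small: extend by `prefetch` extra positions so the
            # next decoding steps are also served from the cache
            maxseqlen = maxseqlen + prefetch
        tmax = torch.arange(max(offset + step, 0) + maxseqlen).float()
        rope = torch.outer(tmax, self.inv_freq)   # [n_positions, dim // 2]
        rope = torch.cat((rope, rope), dim=-1)    # [n_positions, dim]
        self.rope = rope
        return rope

cache = CachedRope(dim=64)
emb = torch.randn(1, 1, 64)            # [batch, seqlen, dim]
t0 = cache.forward(emb, step=0)        # computes 1 row
t1 = cache.forward(emb, step=1)        # too small: recomputes 2 + 1024 rows
t2 = cache.forward(emb, step=2)        # served from the cache, no recompute
assert t1 is t2 and t1.size(0) == 1026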
