From f99784c5bb495b3f53d65aee2aa0731f43ed0e35 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 14 Feb 2024 22:17:44 -0800
Subject: [PATCH] [BugFix] Fix GC bug for `LLM` class (#2882)

---
 tests/test_regression.py |  18 +++
 vllm/lora/punica.py      | 334 +++++++++++++++++++--------------------
 2 files changed, 182 insertions(+), 170 deletions(-)

diff --git a/tests/test_regression.py b/tests/test_regression.py
index c48e474bd889f..cb68e9ecfc06b 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -4,6 +4,10 @@
 will never happen again.
 """

+import gc
+
+import torch
+
 from vllm import LLM, SamplingParams


@@ -35,6 +39,20 @@ def test_max_tokens_none():
     assert len(prompts) == len(outputs)


+def test_gc():
+    llm = LLM("facebook/opt-125m", enforce_eager=True)
+    del llm
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # The memory allocated for model and KV cache should be released.
+    # The memory allocated for PyTorch and others should be less than 50MB.
+    # Usually, it's around 10MB.
+    allocated = torch.cuda.memory_allocated()
+    assert allocated < 50 * 1024 * 1024
+
+
 if __name__ == "__main__":
     import pytest
     pytest.main([__file__])
diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index bcb73ccc19b0e..307a33dcf2820 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -4,173 +4,167 @@
 import torch

-import_exc = None
-
-try:
-    import vllm._punica_C as punica_kernels
-except ImportError as e:
-    import_exc = e
-
-if import_exc is None:
-
-    def bgmv(
-        y: torch.Tensor,
-        x: torch.Tensor,
-        w_t_all: torch.Tensor,
-        indicies: torch.LongTensor,
-        layer_idx: int,
-        scale: float,
-    ):
-        """
-        Semantics:
-          y[i] += (
-              x[i].unsqueeze(0)
-              @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              * scale
-            ).squeeze(0)
-
-        Args:
-            y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
-            x: Shape: `[B, H1]`. Input vectors.
-            w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight
-                matrices.
-            indicies: Shape: `[B]`. Indices of the weight matrices.
-            layer_idx: Layer index of the weight matrices.
-            scale: Scaling factor.
-        """
-        punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale)
-
-    def add_lora(y: torch.Tensor,
-                 x: torch.Tensor,
-                 wa_t_all: torch.Tensor,
-                 wb_t_all: torch.Tensor,
-                 indicies: torch.LongTensor,
-                 layer_idx: int,
-                 scale: float,
-                 *,
-                 buffer: Optional[torch.Tensor] = None):
-        """
-        Semantics:
-          y[i] += (
-              x[i].unsqueeze(0)
-              @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              * scale
-            ).squeeze(0)
-
-        Args:
-            y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
-            x: Shape: `[B, H1]`. Input vectors.
-            wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
-                LoRA A matrices.
-            wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
-                LoRA B matrices.
-            indicies: Shape: `[B]`. Indices of the LoRA weights.
-            layer_idx: Layer index of LoRA weights.
-            scale: Scaling factor.
-            buffer: Optional. Shape: `[B, R]`. Temporary buffer.
-        """
-        r = wb_t_all.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default to avoid
-            # numerical innacuracies that would otherwise happen
-            # due to downcasting.
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-        punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx,
-                                     1.0)
-        punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx,
-                                     scale)
-
-    def add_lora_slice(y: torch.Tensor,
-                       x: torch.Tensor,
-                       wa_t_all: torch.Tensor,
-                       wb_t_all: torch.Tensor,
-                       indicies: torch.LongTensor,
-                       layer_idx: int,
-                       scale: float,
-                       y_offset: int,
-                       y_slice_size: int,
-                       *,
-                       buffer: Optional[torch.Tensor] = None):
-        """
-        Same as `add_lora` but you can operate on slices of y.
-        Pass whole y, define y_offset and y_slice_size.
-
-        Semantics:
-          y[i] += (
-              x[i].unsqueeze(0)
-              @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
-              * scale
-            ).squeeze(0)
-
-        Args:
-            y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
-            x: Shape: `[B, H1]`. Input vectors.
-            wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
-                LoRA A matrices.
-            wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
-                LoRA B matrices.
-            indicies: Shape: `[B]`. Indices of the LoRA weights.
-            layer_idx: Layer index of LoRA weights.
-            scale: Scaling factor.
-            y_offset: Offset to apply to the starting column of y.
-            y_slice_size: Size of the y column slice.
-        """
-        r = wb_t_all.size(-1)
-        if buffer is None:
-            # We set the buffer to be float32 by default to avoid
-            # numerical inaccuracies that would otherwise happen
-            # due to downcasting.
-            buffer = torch.zeros((x.size(0), r),
-                                 dtype=torch.float32,
-                                 device=x.device)
-        punica_kernels.dispatch_bgmv_low_level(
-            buffer,
-            x,
-            wa_t_all,
-            indicies,
-            layer_idx,
-            1.0,
-            x.size(1),
-            buffer.size(1),
-            0,
-        )
-        punica_kernels.dispatch_bgmv_low_level(
-            y,
-            buffer,
-            wb_t_all,
-            indicies,
-            layer_idx,
-            scale,
-            buffer.size(1),
-            y_slice_size,
-            y_offset,
-        )
-
-else:
-
-    def _raise_exc(
-        *args,  # pylint: disable=unused-argument
-        **kwargs  # pylint: disable=unused-argument
-    ):
-        if torch.cuda.get_device_capability() < (8, 0):
-            raise ImportError("punica LoRA kernels require compute "
-                              "capability>=8.0") from import_exc
-        else:
-            raise ImportError(
-                "punica LoRA kernels could not be imported. If you built vLLM "
-                "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
-                "was set.") from import_exc
-
-    bgmv = _raise_exc
-    add_lora = _raise_exc
-    add_lora_slice = _raise_exc
-
-__all__ = [
-    "bgmv",
-    "add_lora",
-    "add_lora_slice",
-]
+
+def _raise_import_error(e):
+    if torch.cuda.get_device_capability() < (8, 0):
+        raise ImportError(
+            "punica LoRA kernels require compute capability >= 8.0") from e
+    else:
+        raise ImportError(
+            "punica LoRA kernels could not be imported. If you built vLLM "
+            "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
+            "was set.") from e
+
+
+def bgmv(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    w_t_all: torch.Tensor,
+    indicies: torch.LongTensor,
+    layer_idx: int,
+    scale: float,
+):
+    """
+    Semantics:
+      y[i] += (
+          x[i].unsqueeze(0)
+          @ w_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          * scale
+        ).squeeze(0)
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        w_t_all: Shape: `[None, L, H2, H1]`. All of the transposed weight
+            matrices.
+        indicies: Shape: `[B]`. Indices of the weight matrices.
+        layer_idx: Layer index of the weight matrices.
+        scale: Scaling factor.
+    """
+    try:
+        import vllm._punica_C as punica_kernels
+    except ImportError as e:
+        _raise_import_error(e)
+
+    punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale)
+
+
+def add_lora(y: torch.Tensor,
+             x: torch.Tensor,
+             wa_t_all: torch.Tensor,
+             wb_t_all: torch.Tensor,
+             indicies: torch.LongTensor,
+             layer_idx: int,
+             scale: float,
+             *,
+             buffer: Optional[torch.Tensor] = None):
+    """
+    Semantics:
+      y[i] += (
+          x[i].unsqueeze(0)
+          @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          * scale
+        ).squeeze(0)
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
+            LoRA A matrices.
+        wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
+            LoRA B matrices.
+        indicies: Shape: `[B]`. Indices of the LoRA weights.
+        layer_idx: Layer index of LoRA weights.
+        scale: Scaling factor.
+        buffer: Optional. Shape: `[B, R]`. Temporary buffer.
+    """
+    try:
+        import vllm._punica_C as punica_kernels
+    except ImportError as e:
+        _raise_import_error(e)
+
+    r = wb_t_all.size(-1)
+    if buffer is None:
+        # We set the buffer to be float32 by default to avoid
+        # numerical inaccuracies that would otherwise happen
+        # due to downcasting.
+        buffer = torch.zeros((x.size(0), r),
+                             dtype=torch.float32,
+                             device=x.device)
+    punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0)
+    punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx,
+                                 scale)
+
+
+def add_lora_slice(y: torch.Tensor,
+                   x: torch.Tensor,
+                   wa_t_all: torch.Tensor,
+                   wb_t_all: torch.Tensor,
+                   indicies: torch.LongTensor,
+                   layer_idx: int,
+                   scale: float,
+                   y_offset: int,
+                   y_slice_size: int,
+                   *,
+                   buffer: Optional[torch.Tensor] = None):
+    """
+    Same as `add_lora` but you can operate on slices of y.
+    Pass whole y, define y_offset and y_slice_size.
+
+    Semantics:
+      y[i] += (
+          x[i].unsqueeze(0)
+          @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2)
+          * scale
+        ).squeeze(0)
+
+    Args:
+        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
+        x: Shape: `[B, H1]`. Input vectors.
+        wa_t_all: Shape: `[None, L, R, H1]`. All of the transposed
+            LoRA A matrices.
+        wb_t_all: Shape: `[None, L, H2, R]`. All of the transposed
+            LoRA B matrices.
+        indicies: Shape: `[B]`. Indices of the LoRA weights.
+        layer_idx: Layer index of LoRA weights.
+        scale: Scaling factor.
+        y_offset: Offset to apply to the starting column of y.
+        y_slice_size: Size of the y column slice.
+    """
+    try:
+        import vllm._punica_C as punica_kernels
+    except ImportError as e:
+        _raise_import_error(e)
+
+    r = wb_t_all.size(-1)
+    if buffer is None:
+        # We set the buffer to be float32 by default to avoid
+        # numerical inaccuracies that would otherwise happen
+        # due to downcasting.
+        buffer = torch.zeros((x.size(0), r),
+                             dtype=torch.float32,
+                             device=x.device)
+    punica_kernels.dispatch_bgmv_low_level(
+        buffer,
+        x,
+        wa_t_all,
+        indicies,
+        layer_idx,
+        1.0,
+        x.size(1),
+        buffer.size(1),
+        0,
+    )
+    punica_kernels.dispatch_bgmv_low_level(
+        y,
+        buffer,
+        wb_t_all,
+        indicies,
+        layer_idx,
+        scale,
+        buffer.size(1),
+        y_slice_size,
+        y_offset,
+    )
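
Note on the kernel semantics: the docstrings above specify `add_lora` precisely, so a slow pure-PyTorch reference is easy to derive from them. The sketch below is not part of the patch and does not use the Punica CUDA kernels; the helper name `add_lora_ref` is hypothetical. It simply restates the documented semantics with a per-example loop and float32 accumulation (mirroring the temporary buffer in `add_lora`), which can be useful for checking shapes or validating outputs on hardware without the compiled kernels.

import torch


def add_lora_ref(y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor,
                 wb_t_all: torch.Tensor, indicies: torch.LongTensor,
                 layer_idx: int, scale: float) -> None:
    # Naive per-example loop following the documented semantics:
    #   y[i] += (x[i] @ A_i^T @ B_i^T) * scale
    # where A_i / B_i are the transposed LoRA matrices selected by
    # indicies[i] for the given layer.
    for i in range(x.size(0)):
        wa_t = wa_t_all[indicies[i], layer_idx]  # [R, H1], transposed LoRA A
        wb_t = wb_t_all[indicies[i], layer_idx]  # [H2, R], transposed LoRA B
        buf = x[i].unsqueeze(0).float() @ wa_t.t().float()  # [1, R]
        out = (buf @ wb_t.t().float()) * scale               # [1, H2]
        y[i] += out.squeeze(0).to(y.dtype)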
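Note on the punica.py change itself: the old module ran the `vllm._punica_C` import once at module import time and, on failure, stored the ImportError in a module-level `import_exc` for the lifetime of the process (a cached exception also pins its `__traceback__` and the frames it references). The new version drops that cached state, imports the extension lazily inside each function, and raises a descriptive error via `_raise_import_error` only when a kernel is actually called; the regression test added above then asserts that deleting an `LLM` and running `gc.collect()` releases the model and KV-cache memory. Below is a minimal sketch of the same lazy-import pattern in isolation; the `my_project._fast_kernels` module, `BUILD_FAST_KERNELS` flag, and `dispatch` entry point are hypothetical, not part of vLLM.

import importlib
from types import ModuleType


def _load_fast_kernels() -> ModuleType:
    # Import the optional extension only when it is needed, and do not
    # keep the ImportError (or anything derived from it) in module state.
    try:
        return importlib.import_module("my_project._fast_kernels")
    except ImportError as e:
        raise ImportError(
            "fast kernels are unavailable; rebuild with BUILD_FAST_KERNELS=1"
        ) from e


def fused_op(*args):
    # Cheap after the first call: sys.modules caches the imported module.
    kernels = _load_fast_kernels()
    return kernels.dispatch(*args)  # hypothetical kernel entry point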