Add parameter precompute_grad to CgInfluence init, adapt documentation #498

Merged · 4 commits · Feb 19, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -5,6 +5,7 @@
### Fixed

- Bug in `LissaInfluence`, when not using CPU device [PR #495](https://github.com/aai-institute/pyDVL/pull/495)
- Memory issue with `CgInfluence` and `ArnoldiInfluence` [PR #498](https://github.com/aai-institute/pyDVL/pull/498)

## 0.8.1 - 🆕 🏗 New method and notebook, Games with exact Shapley values, bug fixes and cleanup

13 changes: 10 additions & 3 deletions src/pydvl/influence/torch/functional.py
@@ -265,8 +265,8 @@ def create_hvp_function(
is the model's input and the second element is the target output.
precompute_grad: If True, the full data gradient is precomputed and kept
in memory, which can speed up the hessian vector product computation.
- Set this to False, if you can't afford to keep an additional
- parameter-sized vector in memory.
+ Set this to False if you can't afford to keep the full computation graph
+ in memory.
use_average: If True, the returned function uses batch-wise computation via
[batch_loss_function][pydvl.influence.torch.functional.batch_loss_function]
and averages the results.
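For context (not part of this diff): a minimal sketch of what the two settings look like from user code. The toy model, loss, and dataloader are placeholders, and the flat-vector calling convention of the returned function is inferred from its use in `_solve_hvp` further down; only the keyword arguments of `create_hvp_function` are taken from this PR.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from pydvl.influence.torch.functional import create_hvp_function

# Toy placeholders; any model, loss and dataloader of (input, target) batches work.
model = torch.nn.Linear(10, 1)
loss = torch.nn.functional.mse_loss
data = DataLoader(TensorDataset(torch.randn(32, 10), torch.randn(32, 1)), batch_size=8)

# Faster HVPs, at the cost of keeping the full data gradient (and its graph) in memory:
hvp_fast = create_hvp_function(model, loss, data, precompute_grad=True)

# Smaller memory footprint, recomputing the gradient on every call:
hvp_lean = create_hvp_function(model, loss, data, precompute_grad=False)

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
v = torch.randn(n_params)  # any parameter-sized vector
print(hvp_lean(v).shape)   # expect a parameter-sized vector back
```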
@@ -772,6 +772,7 @@ def model_hessian_low_rank(
tol: float = 1e-6,
max_iter: Optional[int] = None,
eigen_computation_on_gpu: bool = False,
precompute_grad: bool = False,
) -> LowRankProductRepresentation:
r"""
Calculates a low-rank approximation of the Hessian matrix of the model's
@@ -807,14 +808,20 @@ def model_hessian_low_rank(
small rank_estimate to fit your device's memory.
If False, the eigen pair approximation is executed on the CPU by the
scipy wrapper to ARPACK.
precompute_grad: If True, the full data gradient is precomputed and kept
in memory, which can speed up the hessian vector product computation.
Set this to False if you can't afford to keep the full computation graph
in memory.

Returns:
[LowRankProductRepresentation]
[pydvl.influence.torch.functional.LowRankProductRepresentation]
instance that contains the top (up until rank_estimate) eigenvalues
and corresponding eigenvectors of the Hessian.
"""
- raw_hvp = create_hvp_function(model, loss, training_data, use_average=True)
+ raw_hvp = create_hvp_function(
+     model, loss, training_data, use_average=True, precompute_grad=precompute_grad
+ )
n_params = sum([p.numel() for p in model.parameters() if p.requires_grad])
device = next(model.parameters()).device
return lanzcos_low_rank_hessian_approx(
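The flag simply threads through to `model_hessian_low_rank`. A hedged sketch, reusing the placeholders above; passing the dataloader positionally and the `rank_estimate` keyword are assumptions based on the docstring, not shown in this diff.

```python
from pydvl.influence.torch.functional import model_hessian_low_rank

# Low-rank Hessian approximation with the memory-lean HVP variant:
low_rank = model_hessian_low_rank(
    model,
    loss,
    data,
    rank_estimate=5,        # assumed keyword, mentioned in the docstring
    precompute_grad=False,  # avoid holding the full computation graph
)
```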
20 changes: 19 additions & 1 deletion src/pydvl/influence/torch/influence_function_model.py
@@ -439,6 +439,10 @@ class CgInfluence(TorchInfluenceFunctionModel):
atol: Absolute tolerance of result.
maxiter: Maximum number of iterations. If None, defaults to 10*len(b).
progress: If True, display progress bars.
precompute_grad: If True, the full data gradient is precomputed and kept
in memory, which can speed up the hessian vector product computation.
Set this to False if you can't afford to keep the full computation graph
in memory.

"""

@@ -452,8 +456,10 @@ def __init__(
atol: float = 1e-7,
maxiter: Optional[int] = None,
progress: bool = False,
precompute_grad: bool = False,
):
super().__init__(model, loss)
self.precompute_grad = precompute_grad
self.progress = progress
self.maxiter = maxiter
self.atol = atol
@@ -525,7 +531,12 @@ def _solve_hvp(self, rhs: torch.Tensor) -> torch.Tensor:
if len(self.train_dataloader) == 0:
raise ValueError("Training dataloader must not be empty.")

- hvp = create_hvp_function(self.model, self.loss, self.train_dataloader)
+ hvp = create_hvp_function(
+     self.model,
+     self.loss,
+     self.train_dataloader,
+     precompute_grad=self.precompute_grad,
+ )

def reg_hvp(v: torch.Tensor):
return hvp(v) + self.hessian_regularization * v.type(rhs.dtype)
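End to end, the new `CgInfluence` flag would be used roughly as below. This is a sketch, not part of the PR: `hessian_regularization` is inferred from `reg_hvp` above, the `fit`/`influences` workflow is assumed from pyDVL's influence API (only `fit` appears elsewhere in this diff), and all data names are placeholders.

```python
from pydvl.influence.torch import CgInfluence

if_model = CgInfluence(
    model,
    loss,
    hessian_regularization=1e-3,
    precompute_grad=True,  # speed up each CG iteration, at the cost of memory
)
if_model = if_model.fit(train_dataloader)

# Influence of training points on test points (assumed API):
scores = if_model.influences(x_test, y_test, x_train, y_train)
```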
@@ -749,6 +760,10 @@ class ArnoldiInfluence(TorchInfluenceFunctionModel):
is appropriate for device memory.
If False, the eigen pair approximation is executed on the CPU by the scipy
wrapper to ARPACK.
precompute_grad: If True, the full data gradient is precomputed and kept
in memory, which can speed up the hessian vector product computation.
Set this to False if you can't afford to keep the full computation graph
in memory.
"""
low_rank_representation: LowRankProductRepresentation

@@ -762,6 +777,7 @@ def __init__(
tol: float = 1e-6,
max_iter: Optional[int] = None,
eigen_computation_on_gpu: bool = False,
precompute_grad: bool = False,
):

super().__init__(model, loss)
@@ -771,6 +787,7 @@ def __init__(
self.max_iter = max_iter
self.krylov_dimension = krylov_dimension
self.eigen_computation_on_gpu = eigen_computation_on_gpu
self.precompute_grad = precompute_grad

@property
def is_fitted(self):
@@ -804,6 +821,7 @@ def fit(self, data: DataLoader) -> ArnoldiInfluence:
tol=self.tol,
max_iter=self.max_iter,
eigen_computation_on_gpu=self.eigen_computation_on_gpu,
precompute_grad=self.precompute_grad,
)
self.low_rank_representation = low_rank_representation.to(self.model_device)
return self
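The analogous sketch for `ArnoldiInfluence`, with the same caveats as above: `rank_estimate` and the `fit`/`influences` workflow are assumptions, and the data names are placeholders.

```python
from pydvl.influence.torch import ArnoldiInfluence

if_model = ArnoldiInfluence(
    model,
    loss,
    rank_estimate=5,
    precompute_grad=False,  # keep memory low while fitting the low-rank Hessian
)
if_model = if_model.fit(train_dataloader)
scores = if_model.influences(x_test, y_test, x_train, y_train)
```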