Avoid bader_caller from altering compressed file in place (#3660)

* add type hint and fix mypy error * fix calling of class * format cleanups * add type hint and format tweak * remove unnecessary var * simplify Bader executable fetch * clean up __init__ * revert change of BaderAnalysis * avoid changing compressed file in place * revert removal of arg bader_path * revise docstring * fix chgcar_path possibly unbound pyright error * format tweaks --------- Co-authored-by: Janosh Riebesell <[email protected]>
materialsproject · Mar 1, 2024 · 83806df · 83806df
1 parent d07164f
commit 83806df
Show file tree

Hide file tree

Showing 2 changed files with 125 additions and 77 deletions.
diff --git a/pymatgen/command_line/bader_caller.py b/pymatgen/command_line/bader_caller.py
@@ -14,10 +14,12 @@
 from __future__ import annotations
 
 import os
+import shutil
 import subprocess
 import warnings
 from datetime import datetime
 from glob import glob
+from pathlib import Path
 from shutil import which
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any
@@ -42,11 +44,8 @@
 __date__ = "4/5/13"
 
 
-BADER_EXE = which("bader") or which("bader.exe")
-
-
 class BaderAnalysis:
-    """Performs Bader analysis for Cube files and VASP outputs.
+    """Performs Bader charge analysis for Cube files or VASP outputs.
 
     Attributes:
         data (list[dict]): Atomic data parsed from bader analysis.
@@ -57,21 +56,20 @@ class BaderAnalysis:
         nelectrons (int): Number of electrons of the Bader analysis.
         chgcar (Chgcar): Chgcar object associated with input CHGCAR file.
         atomic_densities (list[dict]): List of charge densities for each
-            atom centered on the atom. Excess 0's are removed from the array
-            to reduce its size. Each dictionary has the keys:
+            atom centered on the atom. Each dictionary has the keys:
             "data", "shift", "dim", where "data" is the charge density array,
             "shift" is the shift used to center the atomic charge density, and
-            "dim" is the dimension of the original charge density map.
+            "dim" is the dimension of the original charge density.
     """
 
     def __init__(
         self,
         chgcar_filename: str = "",
         potcar_filename: str = "",
         chgref_filename: str = "",
-        parse_atomic_densities: bool = False,
         cube_filename: str = "",
-        bader_exe_path: str | None = BADER_EXE,
+        bader_path: str | None = None,
+        parse_atomic_densities: bool = False,
     ) -> None:
         """Initializes the Bader caller.
 
@@ -80,44 +78,66 @@ def __init__(
             potcar_filename (str): The filename of the POTCAR.
             chgref_filename (str): The filename of the
                 reference charge density.
-            parse_atomic_densities (bool, optional): Enable atomic partition
-                of the charge density. Charge densities are atom centered.
-                Defaults to False.
             cube_filename (str, optional): The filename of the cube file.
-            bader_exe_path (str, optional): The path to the bader executable.
+            bader_path (str, optional): The path to the bader executable.
+            parse_atomic_densities (bool, optional): Enable atomic partition of the
+                charge density. Charge densities are atom centered. Defaults to False.
         """
-        bader_exe = which(bader_exe_path or "")
-        if bader_exe is None:
+
+        def temp_decompress(file: str | Path, target_dir: str = ".") -> str:
+            """Utility function to copy a compressed file to a target directory (ScratchDir)
+            and decompress it, to avoid modifying files in place.
+
+            Parameters:
+                file (str | Path): The path to the compressed file to be decompressed.
+                target_dir (str, optional): The target directory where the decompressed file will be stored.
+                    Defaults to "." (current directory).
+
+            Returns:
+                str: The path to the decompressed file if successful.
+            """
+            file = Path(file)
+
+            if file.suffix.lower() in {".bz2", ".gz", ".z"}:
+                shutil.copy(file, f"{target_dir}/{file.name}")
+                out_file = decompress_file(f"{target_dir}/{file.name}")
+
+                if file:  # to avoid mypy error
+                    return str(out_file)
+
+                raise FileNotFoundError(f"File {out_file} not found.")
+
+            return str(file)
+
+        # Get and check Bader executable path
+        if bader_path is None:
+            bader_path = which("bader") or which("bader.exe")
+
+        if bader_path is None:
             raise RuntimeError(
-                "BaderAnalysis requires bader or bader.exe to be in the PATH "
-                "the absolute path to the binary to be specified "
-                f"via {bader_exe_path=}. Download the binary at "
-                "https://theory.cm.utexas.edu/henkelman/code/bader."
+                "Requires bader or bader.exe to be in the PATH or the absolute path to "
+                f"the binary to be specified via {bader_path=}.\n"
+                "Download from https://theory.cm.utexas.edu/henkelman/code/bader."
             )
 
+        # Check input cube/CHGCAR files
         if not (cube_filename or chgcar_filename):
-            raise ValueError("You must provide either a cube file or a CHGCAR")
+            raise ValueError("You must provide either a Cube file or a CHGCAR.")
         if cube_filename and chgcar_filename:
-            raise ValueError(
-                f"You cannot parse a cube and a CHGCAR at the same time. \n{cube_filename=}\n{chgcar_filename=}"
-            )
+            raise ValueError("Cannot parse cube and CHGCAR at the same time.")
+
         self.parse_atomic_densities = parse_atomic_densities
 
         with ScratchDir("."):
             if chgcar_filename:
-                self.is_vasp = True
+                filepath = chgcar_fpath = temp_decompress(chgcar_filename)
 
-                # Decompress the file if compressed
-                fpath = chgcar_fpath = decompress_file(filepath=chgcar_filename) or chgcar_filename
                 self.chgcar = Chgcar.from_file(chgcar_fpath)
                 self.structure = self.chgcar.structure
 
                 self.potcar = Potcar.from_file(potcar_filename) if potcar_filename else None
                 self.natoms = self.chgcar.poscar.natoms
 
-                chgref_fpath = decompress_file(filepath=chgref_filename) or chgref_filename
-                self.reference_used = bool(chgref_filename)
-
                 # List of nelects for each atom from potcar
                 potcar_indices = []
                 for idx, val in enumerate(self.natoms):
@@ -128,80 +148,101 @@ def __init__(
                     else []
                 )
 
+            # Parse from Cube file
             else:
-                self.is_vasp = False
-                fpath = cube_fpath = decompress_file(filepath=cube_filename) or cube_filename
+                filepath = cube_fpath = temp_decompress(cube_filename)
+
                 self.cube = VolumetricData.from_cube(cube_fpath)
                 self.structure = self.cube.structure
                 self.nelects = []
-                chgref_fpath = decompress_file(filepath=chgref_filename) or chgref_filename
-                self.reference_used = bool(chgref_filename)
 
-            args = [bader_exe, fpath]
+            # Prepare CHGCAR reference file
+            chgref_fpath = temp_decompress(chgref_filename)
+            self.reference_used = bool(chgref_filename)
+
+            bader_args = [bader_path, filepath]
 
-            if chgref_fpath:
-                args += ["-ref", chgref_fpath]
+            if self.reference_used:
+                bader_args += ["-ref", chgref_fpath]
 
             if parse_atomic_densities:
-                args += ["-p", "all_atom"]
+                bader_args += ["-p", "all_atom"]
 
             with subprocess.Popen(
-                args,
+                bader_args,
                 stdout=subprocess.PIPE,
                 stdin=subprocess.PIPE,
                 close_fds=True,
             ) as proc:
                 stdout, stderr = proc.communicate()
                 if proc.returncode != 0:
                     raise RuntimeError(
-                        f"{bader_exe} exit code: {proc.returncode}, "
-                        f"error message: {stderr!s}.\nstdout: {stdout!s}"
-                        "Please check your bader installation."
+                        f"{bader_path} exit code: {proc.returncode}, error: {stderr!s}."
+                        f"\nstdout: {stdout!s}. Please check your bader installation."
                     )
 
+            # Determine Bader version
             try:
                 self.version = float(stdout.split()[5])
             except ValueError:
                 self.version = -1  # Unknown
+
             if self.version < 1.0:
                 warnings.warn(
-                    "Your installed version of Bader is outdated, calculation of vacuum charge may be incorrect.",
-                    UserWarning,
+                    "Your installed version of Bader is outdated, calculation of vacuum charge may be incorrect."
                 )
 
-            data = []
-            with open("ACF.dat") as file:
-                lines = file.readlines()
-                headers = ("x", "y", "z", "charge", "min_dist", "atomic_vol")
-                lines.pop(0)
-                lines.pop(0)
-                while True:
-                    line = lines.pop(0).strip()
-                    if line.startswith("-"):
-                        break
-                    vals = map(float, line.split()[1:])
-                    data.append(dict(zip(headers, vals)))
-                for line in lines:
-                    tokens = line.strip().split(":")
-                    if tokens[0] == "VACUUM CHARGE":
-                        self.vacuum_charge = float(tokens[1])
-                    elif tokens[0] == "VACUUM VOLUME":
-                        self.vacuum_volume = float(tokens[1])
-                    elif tokens[0] == "NUMBER OF ELECTRONS":
-                        self.nelectrons = float(tokens[1])
-            self.data = data
+            # Parse ACF.dat file
+            self.data = self._parse_acf()
 
+            # Parse atomic densities
             if self.parse_atomic_densities:
-                self._parse_atomic_densities()
+                self.atomic_densities = self._parse_atomic_densities()
+
+    def _parse_acf(self) -> list[dict]:
+        """Parse Bader output file ACF.dat."""
+        with open("ACF.dat", encoding="us-ascii") as file:
+            lines = file.readlines()
+
+        # Skip header lines
+        headers = ("x", "y", "z", "charge", "min_dist", "atomic_vol")
+        lines.pop(0)
+        lines.pop(0)
+
+        data = []
+        while True:
+            line = lines.pop(0).strip()
+            if line.startswith("-"):
+                break
+            vals = map(float, line.split()[1:])
+            data.append(dict(zip(headers, vals)))
+
+        for line in lines:
+            tokens = line.strip().split(":")
+            if tokens[0] == "VACUUM CHARGE":
+                self.vacuum_charge = float(tokens[1])
+            elif tokens[0] == "VACUUM VOLUME":
+                self.vacuum_volume = float(tokens[1])
+            elif tokens[0] == "NUMBER OF ELECTRONS":
+                self.nelectrons = float(tokens[1])
+
+        return data
 
     @deprecated(
         message=(
             "parse_atomic_densities was deprecated on 2024-02-26 "
-            "and will be removed on 2025-02-26. See https://"
+            "and will be removed on 2025-02-26.\nSee https://"
             "github.com/materialsproject/pymatgen/issues/3652 for details."
         )
     )
-    def _parse_atomic_densities(self):
+    def _parse_atomic_densities(self) -> list[dict]:
+        """Parse atom-centered charge densities with excess zeros removed.
+
+        Each dictionary has the keys:
+            "data", "shift", "dim", where "data" is the charge density array,
+            "shift" is the shift used to center the atomic charge density, and
+            "dim" is the dimension of the original charge density.
+        """
         # Deprecation tracker
         if (
             datetime(2025, 2, 26) < datetime.now()
@@ -210,7 +251,7 @@ def _parse_atomic_densities(self):
         ):
             raise RuntimeError("This method should have been removed, see #3656.")
 
-        def slice_from_center(data, x_width, y_width, z_width):
+        def slice_from_center(data: np.ndarray, x_width: int, y_width: int, z_width: int) -> np.ndarray:
             """Slices a central window from the data array."""
             x, y, z = data.shape
             start_x = x // 2 - (x_width // 2)
@@ -222,7 +263,7 @@ def slice_from_center(data, x_width, y_width, z_width):
                 start_z : start_z + z_width,
             ]
 
-        def find_encompassing_vol(data: np.ndarray):
+        def find_encompassing_vol(data: np.ndarray) -> np.ndarray | None:
             """Find the central encompassing volume which
             holds all the data within a precision.
             """
@@ -260,7 +301,8 @@ def find_encompassing_vol(data: np.ndarray):
                 "dim": self.chgcar.dim,
             }
             atomic_densities.append(dct)
-        self.atomic_densities = atomic_densities
+
+        return atomic_densities
 
     def get_charge(self, atom_index: int) -> float:
         """Convenience method to get the charge on a particular atom. This is the "raw"
@@ -416,7 +458,7 @@ def from_path(cls, path: str, suffix: str = "") -> BaderAnalysis:
         def _get_filepath(filename):
             name_pattern = f"{filename}{suffix}*" if filename != "POTCAR" else f"{filename}*"
             paths = glob(f"{path}/{name_pattern}")
-            fpath = ""
+            filepath = ""
             if len(paths) >= 1:
                 # using reverse=True because, if multiple files are present, they likely
                 # have suffixes 'static', 'relax', 'relax2', etc. and this would give
@@ -425,15 +467,15 @@ def _get_filepath(filename):
                 paths.sort(reverse=True)
                 if len(paths) > 1:
                     warnings.warn(f"Multiple files detected, using {paths[0]}")
-                fpath = paths[0]
+                filepath = paths[0]
             else:
                 msg = f"Could not find {filename!r}"
                 if filename in ("AECCAR0", "AECCAR2"):
                     msg += ", interpret Bader results with severe caution."
                 elif filename == "POTCAR":
                     msg += ", cannot calculate charge transfer."
                 warnings.warn(msg)
-            return fpath
+            return filepath
 
         chgcar_filename = _get_filepath("CHGCAR")
         if chgcar_filename is None:
@@ -455,7 +497,7 @@ def _get_filepath(filename):
         )
 
 
-def bader_analysis_from_path(path, suffix=""):
+def bader_analysis_from_path(path: str, suffix: str = ""):
     """Convenience method to run Bader analysis on a folder containing
     typical VASP output files.
 
@@ -489,7 +531,8 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:
         return paths[0]
 
     chgcar_path = _get_filepath("CHGCAR", "Could not find CHGCAR!")
-    chgcar = Chgcar.from_file(chgcar_path)
+    if chgcar_path is not None:
+        chgcar = Chgcar.from_file(chgcar_path)
 
     aeccar0_path = _get_filepath("AECCAR0")
     if not aeccar0_path:
@@ -509,7 +552,12 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:
     return bader_analysis_from_objects(chgcar, potcar, aeccar0, aeccar2)
 
 
-def bader_analysis_from_objects(chgcar, potcar=None, aeccar0=None, aeccar2=None):
+def bader_analysis_from_objects(
+    chgcar: Chgcar,
+    potcar: Potcar | None = None,
+    aeccar0: Chgcar | None = None,
+    aeccar2: Chgcar | None = None,
+):
     """Convenience method to run Bader analysis from a set
     of pymatgen Chgcar and Potcar objects.
 

diff --git a/tests/command_line/test_bader_caller.py b/tests/command_line/test_bader_caller.py
@@ -139,4 +139,4 @@ def test_missing_file_bader_exe_path(self):
                 RuntimeError, match="BaderAnalysis requires the executable bader be in the PATH or the full path "
             ),
         ):
-            BaderAnalysis(chgcar_filename=f"{TEST_FILES_DIR}/CHGCAR.Fe3O4", bader_exe_path="")
+            BaderAnalysis(chgcar_filename=f"{TEST_FILES_DIR}/CHGCAR.Fe3O4", bader_path="")