materialsproject · janosh · Mar 1, 2024 · Feb 28, 2024 · Feb 28, 2024 · Feb 28, 2024
@@ -14,10 +14,12 @@
 from __future__ import annotations
 
 import os
+import shutil
 import subprocess
 import warnings
 from datetime import datetime
 from glob import glob
+from pathlib import Path
 from shutil import which
 from tempfile import TemporaryDirectory
 from typing import TYPE_CHECKING, Any
@@ -42,11 +44,8 @@
 __date__ = "4/5/13"
 
 
-BADER_EXE = which("bader") or which("bader.exe")
-
-
 class BaderAnalysis:
-    """Performs Bader analysis for Cube files and VASP outputs.
+    """Performs Bader charge analysis for Cube files or VASP outputs.
 
     Attributes:
         data (list[dict]): Atomic data parsed from bader analysis.
@@ -57,21 +56,20 @@ class BaderAnalysis:
         nelectrons (int): Number of electrons of the Bader analysis.
         chgcar (Chgcar): Chgcar object associated with input CHGCAR file.
         atomic_densities (list[dict]): List of charge densities for each
-            atom centered on the atom. Excess 0's are removed from the array
-            to reduce its size. Each dictionary has the keys:
+            atom centered on the atom. Each dictionary has the keys:
             "data", "shift", "dim", where "data" is the charge density array,
             "shift" is the shift used to center the atomic charge density, and
-            "dim" is the dimension of the original charge density map.
+            "dim" is the dimension of the original charge density.
     """
 
     def __init__(
         self,
         chgcar_filename: str = "",
         potcar_filename: str = "",
         chgref_filename: str = "",
-        parse_atomic_densities: bool = False,
         cube_filename: str = "",
-        bader_exe_path: str | None = BADER_EXE,
+        bader_path: str | None = None,
+        parse_atomic_densities: bool = False,
     ) -> None:
         """Initializes the Bader caller.
 
@@ -80,44 +78,66 @@ def __init__(
             potcar_filename (str): The filename of the POTCAR.
             chgref_filename (str): The filename of the
                 reference charge density.
-            parse_atomic_densities (bool, optional): Enable atomic partition
-                of the charge density. Charge densities are atom centered.
-                Defaults to False.
             cube_filename (str, optional): The filename of the cube file.
-            bader_exe_path (str, optional): The path to the bader executable.
+            bader_path (str, optional): The path to the bader executable.
+            parse_atomic_densities (bool, optional): Enable atomic partition of the
+                charge density. Charge densities are atom centered. Defaults to False.
         """
-        bader_exe = which(bader_exe_path or "")
-        if bader_exe is None:
+
+        def temp_decompress(file: str | Path, target_dir: str = ".") -> str:
+            """Utility function to copy a compressed file to a target directory (ScratchDir)
+            and decompress it, to avoid modifying files in place.
+
+            Parameters:
+                file (str | Path): The path to the compressed file to be decompressed.
+                target_dir (str, optional): The target directory where the decompressed file will be stored.
+                    Defaults to "." (current directory).
+
+            Returns:
+                str: The path to the decompressed file if successful.
+            """
+            file = Path(file)
+
+            if file.suffix.lower() in (".bz2", ".gz", ".z"):
+                shutil.copy(file, f"{target_dir}/{file.name}")
+                out_file = decompress_file(f"{target_dir}/{file.name}")
+
+                if file:  # to avoid mypy error
+                    return str(out_file)
+
+                raise FileNotFoundError(f"File {out_file} not found.")
+
+            return str(file)
+
+        # Get and check Bader executable path
+        if bader_path is None:
+            bader_path = which("bader") or which("bader.exe")
+
+        if bader_path is None:
             raise RuntimeError(
-                "BaderAnalysis requires bader or bader.exe to be in the PATH "
-                "the absolute path to the binary to be specified "
-                f"via {bader_exe_path=}. Download the binary at "
-                "https://theory.cm.utexas.edu/henkelman/code/bader."
+                "Requires bader or bader.exe to be in the PATH or the absolute path to "
+                f"the binary to be specified via {bader_path=}.\n"
+                "Download from https://theory.cm.utexas.edu/henkelman/code/bader."
             )
 
+        # Check input cube/CHGCAR files
         if not (cube_filename or chgcar_filename):
-            raise ValueError("You must provide either a cube file or a CHGCAR")
+            raise ValueError("You must provide either a Cube file or a CHGCAR.")
         if cube_filename and chgcar_filename:
-            raise ValueError(
-                f"You cannot parse a cube and a CHGCAR at the same time. \n{cube_filename=}\n{chgcar_filename=}"
-            )
+            raise ValueError("Cannot parse cube and CHGCAR at the same time.")
+
         self.parse_atomic_densities = parse_atomic_densities
 
         with ScratchDir("."):
             if chgcar_filename:
-                self.is_vasp = True
+                filepath = chgcar_fpath = temp_decompress(chgcar_filename)
 
-                # Decompress the file if compressed
-                fpath = chgcar_fpath = decompress_file(filepath=chgcar_filename) or chgcar_filename
                 self.chgcar = Chgcar.from_file(chgcar_fpath)
                 self.structure = self.chgcar.structure
 
                 self.potcar = Potcar.from_file(potcar_filename) if potcar_filename else None
                 self.natoms = self.chgcar.poscar.natoms
 
-                chgref_fpath = decompress_file(filepath=chgref_filename) or chgref_filename
-                self.reference_used = bool(chgref_filename)
-
                 # List of nelects for each atom from potcar
                 potcar_indices = []
                 for idx, val in enumerate(self.natoms):
@@ -128,85 +148,106 @@ def __init__(
                     else []
                 )
 
+            # Parse from Cube file
             else:
-                self.is_vasp = False
-                fpath = cube_fpath = decompress_file(filepath=cube_filename) or cube_filename
+                filepath = cube_fpath = temp_decompress(cube_filename)
+
                 self.cube = VolumetricData.from_cube(cube_fpath)
                 self.structure = self.cube.structure
                 self.nelects = []
-                chgref_fpath = decompress_file(filepath=chgref_filename) or chgref_filename
-                self.reference_used = bool(chgref_filename)
 
-            args = [bader_exe, fpath]
+            # prepare CHGCAR file
+            chgref_fpath = temp_decompress(chgref_filename)
+            self.reference_used = bool(chgref_filename)
+
+            bader_args = [bader_path, filepath]
 
-            if chgref_fpath:
-                args += ["-ref", chgref_fpath]
+            if self.reference_used:
+                bader_args += ["-ref", chgref_fpath]
 
             if parse_atomic_densities:
-                args += ["-p", "all_atom"]
+                bader_args += ["-p", "all_atom"]
 
             with subprocess.Popen(
-                args,
+                bader_args,
                 stdout=subprocess.PIPE,
                 stdin=subprocess.PIPE,
                 close_fds=True,
             ) as proc:
                 stdout, stderr = proc.communicate()
                 if proc.returncode != 0:
                     raise RuntimeError(
-                        f"{bader_exe} exit code: {proc.returncode}, "
-                        f"error message: {stderr!s}.\nstdout: {stdout!s}"
-                        "Please check your bader installation."
+                        f"{bader_path} exit code: {proc.returncode}, error: {stderr!s}."
+                        f"\nstdout: {stdout!s}. Please check your bader installation."
                     )
 
+            # Determine Bader version
             try:
                 self.version = float(stdout.split()[5])
             except ValueError:
                 self.version = -1  # Unknown
+
             if self.version < 1.0:
                 warnings.warn(
-                    "Your installed version of Bader is outdated, calculation of vacuum charge may be incorrect.",
-                    UserWarning,
+                    "Your installed version of Bader is outdated, calculation of vacuum charge may be incorrect."
                 )
 
-            data = []
-            with open("ACF.dat") as file:
-                lines = file.readlines()
-                headers = ("x", "y", "z", "charge", "min_dist", "atomic_vol")
-                lines.pop(0)
-                lines.pop(0)
-                while True:
-                    line = lines.pop(0).strip()
-                    if line.startswith("-"):
-                        break
-                    vals = map(float, line.split()[1:])
-                    data.append(dict(zip(headers, vals)))
-                for line in lines:
-                    tokens = line.strip().split(":")
-                    if tokens[0] == "VACUUM CHARGE":
-                        self.vacuum_charge = float(tokens[1])
-                    elif tokens[0] == "VACUUM VOLUME":
-                        self.vacuum_volume = float(tokens[1])
-                    elif tokens[0] == "NUMBER OF ELECTRONS":
-                        self.nelectrons = float(tokens[1])
-            self.data = data
+            # Parse ACF.dat file
+            self.data = self._parse_acf()
 
+            # Parse atomic densities
             if self.parse_atomic_densities:
-                self._parse_atomic_densities()
+                self.atomic_densities = self._parse_atomic_densities()
+
+    def _parse_acf(self) -> list[dict]:
+        """Parse Bader output file ACF.dat."""
+        with open("ACF.dat", encoding="us-ascii") as file:
+            lines = file.readlines()
+
+        # Skip header lines
+        headers = ("x", "y", "z", "charge", "min_dist", "atomic_vol")
+        lines.pop(0)
+        lines.pop(0)
+
+        data = []
+        while True:
+            line = lines.pop(0).strip()
+            if line.startswith("-"):
+                break
+            vals = map(float, line.split()[1:])
+            data.append(dict(zip(headers, vals)))
+
+        for line in lines:
+            tokens = line.strip().split(":")
+            if tokens[0] == "VACUUM CHARGE":
+                self.vacuum_charge = float(tokens[1])
+            elif tokens[0] == "VACUUM VOLUME":
+                self.vacuum_volume = float(tokens[1])
+            elif tokens[0] == "NUMBER OF ELECTRONS":
+                self.nelectrons = float(tokens[1])
+
+        return data
 
     @deprecated(
         message=(
             "parse_atomic_densities was deprecated on 2024-02-26 "
-            "and will be removed on 2025-02-26. See https://"
+            "and will be removed on 2025-02-26.\nSee https://"
             "github.com/materialsproject/pymatgen/issues/3652 for details."
         )
     )
-    def _parse_atomic_densities(self):
+    def _parse_atomic_densities(self) -> list[dict]:
+        """Parse atom-centered charge densities with excess zeros removed.
+
+        Each dictionary has the keys:
+            "data", "shift", "dim", where "data" is the charge density array,
+            "shift" is the shift used to center the atomic charge density, and
+            "dim" is the dimension of the original charge density.
+        """
         # Deprecation tracker
         if datetime(2025, 2, 26) < datetime.now() and "CI" in os.environ:
             raise RuntimeError("This method should have been removed, see #3656.")
 
-        def slice_from_center(data, x_width, y_width, z_width):
+        def slice_from_center(data: np.ndarray, x_width: int, y_width: int, z_width: int) -> np.ndarray:
             """Slices a central window from the data array."""
             x, y, z = data.shape
             start_x = x // 2 - (x_width // 2)
@@ -218,7 +259,7 @@ def slice_from_center(data, x_width, y_width, z_width):
                 start_z : start_z + z_width,
             ]
 
-        def find_encompassing_vol(data: np.ndarray):
+        def find_encompassing_vol(data: np.ndarray) -> np.ndarray | None:
             """Find the central encompassing volume which
             holds all the data within a precision.
             """
@@ -256,7 +297,8 @@ def find_encompassing_vol(data: np.ndarray):
                 "dim": self.chgcar.dim,
             }
             atomic_densities.append(dct)
-        self.atomic_densities = atomic_densities
+
+        return atomic_densities
 
     def get_charge(self, atom_index: int) -> float:
         """Convenience method to get the charge on a particular atom. This is the "raw"
@@ -412,7 +454,7 @@ def from_path(cls, path: str, suffix: str = "") -> BaderAnalysis:
         def _get_filepath(filename):
             name_pattern = f"{filename}{suffix}*" if filename != "POTCAR" else f"{filename}*"
             paths = glob(f"{path}/{name_pattern}")
-            fpath = ""
+            filepath = ""
             if len(paths) >= 1:
                 # using reverse=True because, if multiple files are present, they likely
                 # have suffixes 'static', 'relax', 'relax2', etc. and this would give
@@ -421,15 +463,15 @@ def _get_filepath(filename):
                 paths.sort(reverse=True)
                 if len(paths) > 1:
                     warnings.warn(f"Multiple files detected, using {paths[0]}")
-                fpath = paths[0]
+                filepath = paths[0]
             else:
                 msg = f"Could not find {filename!r}"
                 if filename in ("AECCAR0", "AECCAR2"):
                     msg += ", interpret Bader results with severe caution."
                 elif filename == "POTCAR":
                     msg += ", cannot calculate charge transfer."
                 warnings.warn(msg)
-            return fpath
+            return filepath
 
         chgcar_filename = _get_filepath("CHGCAR")
         if chgcar_filename is None:
@@ -451,7 +493,7 @@ def _get_filepath(filename):
         )
 
 
-def bader_analysis_from_path(path, suffix=""):
+def bader_analysis_from_path(path: str, suffix: str = ""):
     """Convenience method to run Bader analysis on a folder containing
     typical VASP output files.
 
@@ -485,6 +527,8 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:
         return paths[0]
 
     chgcar_path = _get_filepath("CHGCAR", "Could not find CHGCAR!")
+    if chgcar_path is None:
+        raise FileNotFoundError("Could not find CHGCAR!")
     @classmethod 
     def from_file(cls, filename: str): 
         """Read a CHGCAR file. 
         Args: 
             filename (str): Path to CHGCAR file. 
         Returns: 
             Chgcar 
         """ 
         poscar, data, data_aug = VolumetricData.parse_file(filename) 
         return cls(poscar, data, data_aug=data_aug) 
     @staticmethod 
     def parse_file(filename: str) -> tuple[Poscar, dict, dict]: 
         """ 
         Convenience method to parse a generic volumetric data file in the vasp 
         like format. Used by subclasses for parsing file. 
         Args: 
             filename (str): Path of file to parse 
         Returns: 
             tuple[Poscar, dict, dict]: Poscar object, data dict, data_aug dict 
         """ 
         poscar_read = False 
         poscar_string: list[str] = [] 
         dataset: np.ndarray = np.zeros((1, 1, 1)) 
         all_dataset: list[np.ndarray] = [] 
         # for holding any strings in input that are not Poscar 
         # or VolumetricData (typically augmentation charges) 
         all_dataset_aug: dict[int, list[str]] = {} 
         dim: list[int] = [] 
         dimline = "" 
         read_dataset = False 
         ngrid_pts = 0 
         data_count = 0 
         poscar = None 
         with zopen(filename, mode="rt") as file: 
             for line in file: 
                 original_line = line 
                 line = line.strip() 
                 if read_dataset: 
                     for tok in line.split(): 
                         if data_count < ngrid_pts: 
                             # This complicated procedure is necessary because 
                             # vasp outputs x as the fastest index, followed by y 
                             # then z. 
                             no_x = data_count // dim[0] 
                             dataset[data_count % dim[0], no_x % dim[1], no_x // dim[1]] = float(tok) 
                             data_count += 1 
                     if data_count >= ngrid_pts: 
                         read_dataset = False 
                         data_count = 0 
                         all_dataset.append(dataset) 
                 elif not poscar_read: 
                     if line != "" or len(poscar_string) == 0: 
                         poscar_string.append(line) 
                     elif line == "": 
                         poscar = Poscar.from_str("\n".join(poscar_string)) 
                         poscar_read = True 
                 elif not dim: 
                     dim = [int(i) for i in line.split()] 
                     ngrid_pts = dim[0] * dim[1] * dim[2] 
                     dimline = line 
                     read_dataset = True 
                     dataset = np.zeros(dim) 
                 elif line == dimline: 
                     # when line == dimline, expect volumetric data to follow 
                     # so set read_dataset to True 
                     read_dataset = True 
                     dataset = np.zeros(dim) 
                 else: 
                     # store any extra lines that were not part of the 
                     # volumetric data so we know which set of data the extra 
                     # lines are associated with 
                     key = len(all_dataset) - 1 
                     if key not in all_dataset_aug: 
                         all_dataset_aug[key] = [] 
                     all_dataset_aug[key].append(original_line) 
             if len(all_dataset) == 4: 
                 data = { 
                     "total": all_dataset[0], 
                     "diff_x": all_dataset[1], 
                     "diff_y": all_dataset[2], 
                     "diff_z": all_dataset[3], 
                 } 
                 data_aug = { 
                     "total": all_dataset_aug.get(0), 
                     "diff_x": all_dataset_aug.get(1), 
                     "diff_y": all_dataset_aug.get(2), 
                     "diff_z": all_dataset_aug.get(3), 
                 } 
                 # construct a "diff" dict for scalar-like magnetization density, 
                 # referenced to an arbitrary direction (using same method as 
                 # pymatgen.electronic_structure.core.Magmom, see 
                 # Magmom documentation for justification for this) 
                 # TODO: re-examine this, and also similar behavior in 
                 # Magmom - @mkhorton 
                 # TODO: does CHGCAR change with different SAXIS? 
                 diff_xyz = np.array([data["diff_x"], data["diff_y"], data["diff_z"]]) 
                 diff_xyz = diff_xyz.reshape((3, dim[0] * dim[1] * dim[2])) 
                 ref_direction = np.array([1.01, 1.02, 1.03]) 
                 ref_sign = np.sign(np.dot(ref_direction, diff_xyz)) 
                 diff = np.multiply(np.linalg.norm(diff_xyz, axis=0), ref_sign) 
                 data["diff"] = diff.reshape((dim[0], dim[1], dim[2])) 
             elif len(all_dataset) == 2: 
                 data = {"total": all_dataset[0], "diff": all_dataset[1]} 
                 data_aug = { 
                     "total": all_dataset_aug.get(0), 
                     "diff": all_dataset_aug.get(1), 
                 } 
             else: 
                 data = {"total": all_dataset[0]} 
                 data_aug = {"total": all_dataset_aug.get(0)} 
             return poscar, data, data_aug  # type: ignore[return-value] 
     @classmethod 
     def from_file(cls, filename: str): 
         """Read a CHGCAR file. 
  
         Args: 
             filename (str): Path to CHGCAR file. 
  
         Returns: 
             Chgcar 
         """ 
         poscar, data, data_aug = VolumetricData.parse_file(filename) 
         return cls(poscar, data, data_aug=data_aug) 
     @staticmethod 
     def parse_file(filename: str) -> tuple[Poscar, dict, dict]: 
         """ 
         Convenience method to parse a generic volumetric data file in the vasp 
         like format. Used by subclasses for parsing file. 
  
         Args: 
             filename (str): Path of file to parse 
  
         Returns: 
             tuple[Poscar, dict, dict]: Poscar object, data dict, data_aug dict 
         """ 
         poscar_read = False 
         poscar_string: list[str] = [] 
         dataset: np.ndarray = np.zeros((1, 1, 1)) 
         all_dataset: list[np.ndarray] = [] 
         # for holding any strings in input that are not Poscar 
         # or VolumetricData (typically augmentation charges) 
         all_dataset_aug: dict[int, list[str]] = {} 
         dim: list[int] = [] 
         dimline = "" 
         read_dataset = False 
         ngrid_pts = 0 
         data_count = 0 
         poscar = None 
         with zopen(filename, mode="rt") as file: 
             for line in file: 
                 original_line = line 
                 line = line.strip() 
                 if read_dataset: 
                     for tok in line.split(): 
                         if data_count < ngrid_pts: 
                             # This complicated procedure is necessary because 
                             # vasp outputs x as the fastest index, followed by y 
                             # then z. 
                             no_x = data_count // dim[0] 
                             dataset[data_count % dim[0], no_x % dim[1], no_x // dim[1]] = float(tok) 
                             data_count += 1 
                     if data_count >= ngrid_pts: 
                         read_dataset = False 
                         data_count = 0 
                         all_dataset.append(dataset) 
                 elif not poscar_read: 
                     if line != "" or len(poscar_string) == 0: 
                         poscar_string.append(line) 
                     elif line == "": 
                         poscar = Poscar.from_str("\n".join(poscar_string)) 
                         poscar_read = True 
                 elif not dim: 
                     dim = [int(i) for i in line.split()] 
                     ngrid_pts = dim[0] * dim[1] * dim[2] 
                     dimline = line 
                     read_dataset = True 
                     dataset = np.zeros(dim) 
                 elif line == dimline: 
                     # when line == dimline, expect volumetric data to follow 
                     # so set read_dataset to True 
                     read_dataset = True 
                     dataset = np.zeros(dim) 
                 else: 
                     # store any extra lines that were not part of the 
                     # volumetric data so we know which set of data the extra 
                     # lines are associated with 
                     key = len(all_dataset) - 1 
                     if key not in all_dataset_aug: 
                         all_dataset_aug[key] = [] 
                     all_dataset_aug[key].append(original_line) 
             if len(all_dataset) == 4: 
                 data = { 
                     "total": all_dataset[0], 
                     "diff_x": all_dataset[1], 
                     "diff_y": all_dataset[2], 
                     "diff_z": all_dataset[3], 
                 } 
                 data_aug = { 
                     "total": all_dataset_aug.get(0), 
                     "diff_x": all_dataset_aug.get(1), 
                     "diff_y": all_dataset_aug.get(2), 
                     "diff_z": all_dataset_aug.get(3), 
                 } 
  
                 # construct a "diff" dict for scalar-like magnetization density, 
                 # referenced to an arbitrary direction (using same method as 
                 # pymatgen.electronic_structure.core.Magmom, see 
                 # Magmom documentation for justification for this) 
                 # TODO: re-examine this, and also similar behavior in 
                 # Magmom - @mkhorton 
                 # TODO: does CHGCAR change with different SAXIS? 
                 diff_xyz = np.array([data["diff_x"], data["diff_y"], data["diff_z"]]) 
                 diff_xyz = diff_xyz.reshape((3, dim[0] * dim[1] * dim[2])) 
                 ref_direction = np.array([1.01, 1.02, 1.03]) 
                 ref_sign = np.sign(np.dot(ref_direction, diff_xyz)) 
                 diff = np.multiply(np.linalg.norm(diff_xyz, axis=0), ref_sign) 
                 data["diff"] = diff.reshape((dim[0], dim[1], dim[2])) 
  
             elif len(all_dataset) == 2: 
                 data = {"total": all_dataset[0], "diff": all_dataset[1]} 
                 data_aug = { 
                     "total": all_dataset_aug.get(0), 
                     "diff": all_dataset_aug.get(1), 
                 } 
             else: 
                 data = {"total": all_dataset[0]} 
                 data_aug = {"total": all_dataset_aug.get(0)} 
             return poscar, data, data_aug  # type: ignore[return-value] 
     chgcar = Chgcar.from_file(chgcar_path)
 
     aeccar0_path = _get_filepath("AECCAR0")
@@ -505,7 +549,12 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:
     return bader_analysis_from_objects(chgcar, potcar, aeccar0, aeccar2)
 
 
-def bader_analysis_from_objects(chgcar, potcar=None, aeccar0=None, aeccar2=None):
+def bader_analysis_from_objects(
+    chgcar: Chgcar,
+    potcar: Potcar | None = None,
+    aeccar0: Chgcar | None = None,
+    aeccar2: Chgcar | None = None,
+):
     """Convenience method to run Bader analysis from a set
     of pymatgen Chgcar and Potcar objects.
 

@@ -136,4 +136,4 @@ def test_missing_file_bader_exe_path(self):
         with patch("shutil.which", return_value=None), pytest.raises(
             RuntimeError, match="BaderAnalysis requires the executable bader be in the PATH or the full path "
         ):
-            BaderAnalysis(chgcar_filename=f"{TEST_FILES_DIR}/CHGCAR.Fe3O4", bader_exe_path="")
+            BaderAnalysis(chgcar_filename=f"{TEST_FILES_DIR}/CHGCAR.Fe3O4", bader_path="")