Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid bader_caller from altering compressed file in place #3660

Merged
merged 15 commits into from
Mar 1, 2024
Merged
199 changes: 124 additions & 75 deletions pymatgen/command_line/bader_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
from __future__ import annotations

import os
import shutil
import subprocess
import warnings
from datetime import datetime
from glob import glob
from pathlib import Path
from shutil import which
from tempfile import TemporaryDirectory
from typing import TYPE_CHECKING, Any
Expand All @@ -42,11 +44,8 @@
__date__ = "4/5/13"


BADER_EXE = which("bader") or which("bader.exe")
DanielYang59 marked this conversation as resolved.
Show resolved Hide resolved


class BaderAnalysis:
"""Performs Bader analysis for Cube files and VASP outputs.
"""Performs Bader charge analysis for Cube files or VASP outputs.

Attributes:
data (list[dict]): Atomic data parsed from bader analysis.
Expand All @@ -57,21 +56,20 @@ class BaderAnalysis:
nelectrons (int): Number of electrons of the Bader analysis.
chgcar (Chgcar): Chgcar object associated with input CHGCAR file.
atomic_densities (list[dict]): List of charge densities for each
atom centered on the atom. Excess 0's are removed from the array
to reduce its size. Each dictionary has the keys:
atom centered on the atom. Each dictionary has the keys:
"data", "shift", "dim", where "data" is the charge density array,
"shift" is the shift used to center the atomic charge density, and
"dim" is the dimension of the original charge density map.
"dim" is the dimension of the original charge density.
"""

def __init__(
self,
chgcar_filename: str = "",
potcar_filename: str = "",
chgref_filename: str = "",
parse_atomic_densities: bool = False,
cube_filename: str = "",
bader_exe_path: str | None = BADER_EXE,
DanielYang59 marked this conversation as resolved.
Show resolved Hide resolved
DanielYang59 marked this conversation as resolved.
Show resolved Hide resolved
bader_path: str | None = None,
parse_atomic_densities: bool = False,
) -> None:
"""Initializes the Bader caller.

Expand All @@ -80,44 +78,66 @@ def __init__(
potcar_filename (str): The filename of the POTCAR.
chgref_filename (str): The filename of the
reference charge density.
parse_atomic_densities (bool, optional): Enable atomic partition
of the charge density. Charge densities are atom centered.
Defaults to False.
cube_filename (str, optional): The filename of the cube file.
bader_exe_path (str, optional): The path to the bader executable.
bader_path (str, optional): The path to the bader executable.
parse_atomic_densities (bool, optional): Enable atomic partition of the
charge density. Charge densities are atom centered. Defaults to False.
"""
bader_exe = which(bader_exe_path or "")
if bader_exe is None:

def temp_decompress(file: str | Path, target_dir: str = ".") -> str:
DanielYang59 marked this conversation as resolved.
Show resolved Hide resolved
"""Utility function to copy a compressed file to a target directory (ScratchDir)
and decompress it, to avoid modifying files in place.

Parameters:
file (str | Path): The path to the compressed file to be decompressed.
target_dir (str, optional): The target directory where the decompressed file will be stored.
Defaults to "." (current directory).

Returns:
str: The path to the decompressed file if successful.
"""
file = Path(file)

if file.suffix.lower() in (".bz2", ".gz", ".z"):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My oversight, should use set instead {".bz2", ".gz", ".z"}

Copy link
Contributor Author

@DanielYang59 DanielYang59 Mar 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a quick note for myself, Sourcery would recommend using set over list or tuple for membership check, see here.

The advantage of set/tuple over list seems quite obvious, for example discussed here.

However I still don't quite know the difference between tuple and set in this scenario, some claims it's related to performance.

shutil.copy(file, f"{target_dir}/{file.name}")
out_file = decompress_file(f"{target_dir}/{file.name}")

if file: # to avoid mypy error
return str(out_file)

raise FileNotFoundError(f"File {out_file} not found.")

return str(file)

# Get and check Bader executable path
if bader_path is None:
bader_path = which("bader") or which("bader.exe")

if bader_path is None:
raise RuntimeError(
"BaderAnalysis requires bader or bader.exe to be in the PATH "
"the absolute path to the binary to be specified "
f"via {bader_exe_path=}. Download the binary at "
"https://theory.cm.utexas.edu/henkelman/code/bader."
"Requires bader or bader.exe to be in the PATH or the absolute path to "
f"the binary to be specified via {bader_path=}.\n"
"Download from https://theory.cm.utexas.edu/henkelman/code/bader."
)

# Check input cube/CHGCAR files
if not (cube_filename or chgcar_filename):
raise ValueError("You must provide either a cube file or a CHGCAR")
raise ValueError("You must provide either a Cube file or a CHGCAR.")
if cube_filename and chgcar_filename:
raise ValueError(
f"You cannot parse a cube and a CHGCAR at the same time. \n{cube_filename=}\n{chgcar_filename=}"
)
raise ValueError("Cannot parse cube and CHGCAR at the same time.")

self.parse_atomic_densities = parse_atomic_densities

with ScratchDir("."):
if chgcar_filename:
self.is_vasp = True
Copy link
Contributor Author

@DanielYang59 DanielYang59 Feb 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't see self.is_vasp being used.

filepath = chgcar_fpath = temp_decompress(chgcar_filename)

# Decompress the file if compressed
fpath = chgcar_fpath = decompress_file(filepath=chgcar_filename) or chgcar_filename
self.chgcar = Chgcar.from_file(chgcar_fpath)
self.structure = self.chgcar.structure

self.potcar = Potcar.from_file(potcar_filename) if potcar_filename else None
self.natoms = self.chgcar.poscar.natoms

chgref_fpath = decompress_file(filepath=chgref_filename) or chgref_filename
self.reference_used = bool(chgref_filename)

# List of nelects for each atom from potcar
potcar_indices = []
for idx, val in enumerate(self.natoms):
Expand All @@ -128,85 +148,106 @@ def __init__(
else []
)

# Parse from Cube file
else:
self.is_vasp = False
fpath = cube_fpath = decompress_file(filepath=cube_filename) or cube_filename
filepath = cube_fpath = temp_decompress(cube_filename)

self.cube = VolumetricData.from_cube(cube_fpath)
self.structure = self.cube.structure
self.nelects = []
chgref_fpath = decompress_file(filepath=chgref_filename) or chgref_filename
self.reference_used = bool(chgref_filename)

args = [bader_exe, fpath]
# prepare CHGCAR file
DanielYang59 marked this conversation as resolved.
Show resolved Hide resolved
chgref_fpath = temp_decompress(chgref_filename)
self.reference_used = bool(chgref_filename)

bader_args = [bader_path, filepath]

if chgref_fpath:
args += ["-ref", chgref_fpath]
if self.reference_used:
bader_args += ["-ref", chgref_fpath]

if parse_atomic_densities:
args += ["-p", "all_atom"]
bader_args += ["-p", "all_atom"]

with subprocess.Popen(
args,
bader_args,
stdout=subprocess.PIPE,
stdin=subprocess.PIPE,
close_fds=True,
) as proc:
stdout, stderr = proc.communicate()
if proc.returncode != 0:
raise RuntimeError(
f"{bader_exe} exit code: {proc.returncode}, "
f"error message: {stderr!s}.\nstdout: {stdout!s}"
"Please check your bader installation."
f"{bader_path} exit code: {proc.returncode}, error: {stderr!s}."
f"\nstdout: {stdout!s}. Please check your bader installation."
)

# Determine Bader version
try:
self.version = float(stdout.split()[5])
except ValueError:
self.version = -1 # Unknown

if self.version < 1.0:
warnings.warn(
"Your installed version of Bader is outdated, calculation of vacuum charge may be incorrect.",
UserWarning,
"Your installed version of Bader is outdated, calculation of vacuum charge may be incorrect."
)

data = []
with open("ACF.dat") as file:
lines = file.readlines()
headers = ("x", "y", "z", "charge", "min_dist", "atomic_vol")
lines.pop(0)
lines.pop(0)
while True:
line = lines.pop(0).strip()
if line.startswith("-"):
break
vals = map(float, line.split()[1:])
data.append(dict(zip(headers, vals)))
for line in lines:
tokens = line.strip().split(":")
if tokens[0] == "VACUUM CHARGE":
self.vacuum_charge = float(tokens[1])
elif tokens[0] == "VACUUM VOLUME":
self.vacuum_volume = float(tokens[1])
elif tokens[0] == "NUMBER OF ELECTRONS":
self.nelectrons = float(tokens[1])
self.data = data
# Parse ACF.dat file
self.data = self._parse_acf()

# Parse atomic densities
if self.parse_atomic_densities:
self._parse_atomic_densities()
self.atomic_densities = self._parse_atomic_densities()

def _parse_acf(self) -> list[dict]:
"""Parse Bader output file ACF.dat."""
with open("ACF.dat", encoding="us-ascii") as file:
lines = file.readlines()

# Skip header lines
headers = ("x", "y", "z", "charge", "min_dist", "atomic_vol")
lines.pop(0)
lines.pop(0)

data = []
while True:
line = lines.pop(0).strip()
if line.startswith("-"):
break
vals = map(float, line.split()[1:])
data.append(dict(zip(headers, vals)))

for line in lines:
tokens = line.strip().split(":")
if tokens[0] == "VACUUM CHARGE":
self.vacuum_charge = float(tokens[1])
elif tokens[0] == "VACUUM VOLUME":
self.vacuum_volume = float(tokens[1])
elif tokens[0] == "NUMBER OF ELECTRONS":
self.nelectrons = float(tokens[1])

return data

@deprecated(
message=(
"parse_atomic_densities was deprecated on 2024-02-26 "
"and will be removed on 2025-02-26. See https://"
"and will be removed on 2025-02-26.\nSee https://"
"github.com/materialsproject/pymatgen/issues/3652 for details."
)
)
def _parse_atomic_densities(self):
def _parse_atomic_densities(self) -> list[dict]:
"""Parse atom-centered charge densities with excess zeros removed.

Each dictionary has the keys:
"data", "shift", "dim", where "data" is the charge density array,
"shift" is the shift used to center the atomic charge density, and
"dim" is the dimension of the original charge density.
"""
# Deprecation tracker
if datetime(2025, 2, 26) < datetime.now() and "CI" in os.environ:
raise RuntimeError("This method should have been removed, see #3656.")

def slice_from_center(data, x_width, y_width, z_width):
def slice_from_center(data: np.ndarray, x_width: int, y_width: int, z_width: int) -> np.ndarray:
"""Slices a central window from the data array."""
x, y, z = data.shape
start_x = x // 2 - (x_width // 2)
Expand All @@ -218,7 +259,7 @@ def slice_from_center(data, x_width, y_width, z_width):
start_z : start_z + z_width,
]

def find_encompassing_vol(data: np.ndarray):
def find_encompassing_vol(data: np.ndarray) -> np.ndarray | None:
"""Find the central encompassing volume which
holds all the data within a precision.
"""
Expand Down Expand Up @@ -256,7 +297,8 @@ def find_encompassing_vol(data: np.ndarray):
"dim": self.chgcar.dim,
}
atomic_densities.append(dct)
self.atomic_densities = atomic_densities

return atomic_densities

def get_charge(self, atom_index: int) -> float:
"""Convenience method to get the charge on a particular atom. This is the "raw"
Expand Down Expand Up @@ -412,7 +454,7 @@ def from_path(cls, path: str, suffix: str = "") -> BaderAnalysis:
def _get_filepath(filename):
name_pattern = f"{filename}{suffix}*" if filename != "POTCAR" else f"{filename}*"
paths = glob(f"{path}/{name_pattern}")
fpath = ""
filepath = ""
if len(paths) >= 1:
# using reverse=True because, if multiple files are present, they likely
# have suffixes 'static', 'relax', 'relax2', etc. and this would give
Expand All @@ -421,15 +463,15 @@ def _get_filepath(filename):
paths.sort(reverse=True)
if len(paths) > 1:
warnings.warn(f"Multiple files detected, using {paths[0]}")
fpath = paths[0]
filepath = paths[0]
else:
msg = f"Could not find {filename!r}"
if filename in ("AECCAR0", "AECCAR2"):
msg += ", interpret Bader results with severe caution."
elif filename == "POTCAR":
msg += ", cannot calculate charge transfer."
warnings.warn(msg)
return fpath
return filepath

chgcar_filename = _get_filepath("CHGCAR")
if chgcar_filename is None:
Expand All @@ -451,7 +493,7 @@ def _get_filepath(filename):
)


def bader_analysis_from_path(path, suffix=""):
def bader_analysis_from_path(path: str, suffix: str = ""):
"""Convenience method to run Bader analysis on a folder containing
typical VASP output files.

Expand Down Expand Up @@ -485,6 +527,8 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:
return paths[0]

chgcar_path = _get_filepath("CHGCAR", "Could not find CHGCAR!")
if chgcar_path is None:
raise FileNotFoundError("Could not find CHGCAR!")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _get_filepath method already issued a warning in line 529, maybe this is not necessary?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i guess Chgcar.from_file(chgcar_path) will already raise if the file is missing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe no... By Chgcar.from_file

@classmethod
def from_file(cls, filename: str):
"""Read a CHGCAR file.
Args:
filename (str): Path to CHGCAR file.
Returns:
Chgcar
"""
poscar, data, data_aug = VolumetricData.parse_file(filename)
return cls(poscar, data, data_aug=data_aug)

And then VolumetricData.parse_file:

@staticmethod
def parse_file(filename: str) -> tuple[Poscar, dict, dict]:
"""
Convenience method to parse a generic volumetric data file in the vasp
like format. Used by subclasses for parsing file.
Args:
filename (str): Path of file to parse
Returns:
tuple[Poscar, dict, dict]: Poscar object, data dict, data_aug dict
"""
poscar_read = False
poscar_string: list[str] = []
dataset: np.ndarray = np.zeros((1, 1, 1))
all_dataset: list[np.ndarray] = []
# for holding any strings in input that are not Poscar
# or VolumetricData (typically augmentation charges)
all_dataset_aug: dict[int, list[str]] = {}
dim: list[int] = []
dimline = ""
read_dataset = False
ngrid_pts = 0
data_count = 0
poscar = None
with zopen(filename, mode="rt") as file:
for line in file:
original_line = line
line = line.strip()
if read_dataset:
for tok in line.split():
if data_count < ngrid_pts:
# This complicated procedure is necessary because
# vasp outputs x as the fastest index, followed by y
# then z.
no_x = data_count // dim[0]
dataset[data_count % dim[0], no_x % dim[1], no_x // dim[1]] = float(tok)
data_count += 1
if data_count >= ngrid_pts:
read_dataset = False
data_count = 0
all_dataset.append(dataset)
elif not poscar_read:
if line != "" or len(poscar_string) == 0:
poscar_string.append(line)
elif line == "":
poscar = Poscar.from_str("\n".join(poscar_string))
poscar_read = True
elif not dim:
dim = [int(i) for i in line.split()]
ngrid_pts = dim[0] * dim[1] * dim[2]
dimline = line
read_dataset = True
dataset = np.zeros(dim)
elif line == dimline:
# when line == dimline, expect volumetric data to follow
# so set read_dataset to True
read_dataset = True
dataset = np.zeros(dim)
else:
# store any extra lines that were not part of the
# volumetric data so we know which set of data the extra
# lines are associated with
key = len(all_dataset) - 1
if key not in all_dataset_aug:
all_dataset_aug[key] = []
all_dataset_aug[key].append(original_line)
if len(all_dataset) == 4:
data = {
"total": all_dataset[0],
"diff_x": all_dataset[1],
"diff_y": all_dataset[2],
"diff_z": all_dataset[3],
}
data_aug = {
"total": all_dataset_aug.get(0),
"diff_x": all_dataset_aug.get(1),
"diff_y": all_dataset_aug.get(2),
"diff_z": all_dataset_aug.get(3),
}
# construct a "diff" dict for scalar-like magnetization density,
# referenced to an arbitrary direction (using same method as
# pymatgen.electronic_structure.core.Magmom, see
# Magmom documentation for justification for this)
# TODO: re-examine this, and also similar behavior in
# Magmom - @mkhorton
# TODO: does CHGCAR change with different SAXIS?
diff_xyz = np.array([data["diff_x"], data["diff_y"], data["diff_z"]])
diff_xyz = diff_xyz.reshape((3, dim[0] * dim[1] * dim[2]))
ref_direction = np.array([1.01, 1.02, 1.03])
ref_sign = np.sign(np.dot(ref_direction, diff_xyz))
diff = np.multiply(np.linalg.norm(diff_xyz, axis=0), ref_sign)
data["diff"] = diff.reshape((dim[0], dim[1], dim[2]))
elif len(all_dataset) == 2:
data = {"total": all_dataset[0], "diff": all_dataset[1]}
data_aug = {
"total": all_dataset_aug.get(0),
"diff": all_dataset_aug.get(1),
}
else:
data = {"total": all_dataset[0]}
data_aug = {"total": all_dataset_aug.get(0)}
return poscar, data, data_aug # type: ignore[return-value]

Thenzopen: https://github.com/materialsvirtuallab/monty/blob/4d60fd4745f840354e7b3f48346eaf3cd68f2b35/monty/io.py#L19-L45

But certainly open would complain (maybe not so descriptive). But _get_filepath would issue a warning anyway, so maybe don't bother issure another here 😄

Copy link
Contributor Author

@DanielYang59 DanielYang59 Mar 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, I added the if chgcar_path is not None check to avoid mypy error, because _get_filepath could return None and trigger mypy incompatible type error:

error: Argument 1 to "from_file" of "Chgcar" has incompatible type "str | None"; expected "str"  [arg-type]

chgcar = Chgcar.from_file(chgcar_path)

aeccar0_path = _get_filepath("AECCAR0")
Expand All @@ -505,7 +549,12 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:
return bader_analysis_from_objects(chgcar, potcar, aeccar0, aeccar2)


def bader_analysis_from_objects(chgcar, potcar=None, aeccar0=None, aeccar2=None):
def bader_analysis_from_objects(
chgcar: Chgcar,
potcar: Potcar | None = None,
aeccar0: Chgcar | None = None,
aeccar2: Chgcar | None = None,
):
"""Convenience method to run Bader analysis from a set
of pymatgen Chgcar and Potcar objects.

Expand Down
2 changes: 1 addition & 1 deletion tests/command_line/test_bader_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,4 +136,4 @@ def test_missing_file_bader_exe_path(self):
with patch("shutil.which", return_value=None), pytest.raises(
RuntimeError, match="BaderAnalysis requires the executable bader be in the PATH or the full path "
):
BaderAnalysis(chgcar_filename=f"{TEST_FILES_DIR}/CHGCAR.Fe3O4", bader_exe_path="")
BaderAnalysis(chgcar_filename=f"{TEST_FILES_DIR}/CHGCAR.Fe3O4", bader_path="")