Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid bader_caller from altering compressed file in place #3660

Merged
merged 15 commits into from
Mar 1, 2024
Merged
57 changes: 27 additions & 30 deletions pymatgen/command_line/bader_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,16 @@ def __init__(
reference charge density.
cube_filename (str, optional): The filename of the cube file.
bader_path (str, optional): The path to the bader executable.
parse_atomic_densities (bool, optional): Enable atomic partition
of the charge density. Charge densities are atom centered.
Defaults to False.
parse_atomic_densities (bool, optional): Enable atomic partition of the
charge density. Charge densities are atom centered. Defaults to False.
"""

def temp_decompress(file: str | Path, target_dir: str = ".") -> str:
DanielYang59 marked this conversation as resolved.
Show resolved Hide resolved
"""
Utility function to copy a compressed file to a target directory (ScratchDir)
"""Utility function to copy a compressed file to a target directory (ScratchDir)
and decompress it, to avoid modifying files in place.

Parameters:
file (str or Path): The path to the compressed file to be decompressed.
file (str | Path): The path to the compressed file to be decompressed.
target_dir (str, optional): The target directory where the decompressed file will be stored.
Defaults to "." (current directory).

Expand All @@ -100,14 +98,14 @@ def temp_decompress(file: str | Path, target_dir: str = ".") -> str:
"""
file = Path(file)

if file.suffix.lower() in [".bz2", ".gz", ".z"]:
if file.suffix.lower() in (".bz2", ".gz", ".z"):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My oversight, should use set instead {".bz2", ".gz", ".z"}

Copy link
Contributor Author

@DanielYang59 DanielYang59 Mar 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a quick note for myself, Sourcery would recommend using set over list or tuple for membership check, see here.

The advantage of set/tuple over list seems quite obvious, for example discussed here.

However I still don't quite know the difference between tuple and set in this scenario, some claims it's related to performance.

shutil.copy(file, f"{target_dir}/{file.name}")
outputfile = decompress_file(f"{target_dir}/{file.name}")
out_file = decompress_file(f"{target_dir}/{file.name}")

if file: # to avoid mypy error
return str(outputfile)
return str(out_file)

raise FileNotFoundError(f"File {outputfile} not found.")
raise FileNotFoundError(f"File {out_file} not found.")

return str(file)

Expand All @@ -117,22 +115,22 @@ def temp_decompress(file: str | Path, target_dir: str = ".") -> str:

if bader_path is None:
raise RuntimeError(
"Requires bader or bader.exe to be in the PATH.\n"
"Requires bader or bader.exe to be in the PATH or the absolute path to "
f"the binary to be specified via {bader_path=}.\n"
"Download from https://theory.cm.utexas.edu/henkelman/code/bader."
)

# Check input cube/CHGCAR files
if not (cube_filename or chgcar_filename):
raise ValueError("You must provide either a cube file or a CHGCAR.")
raise ValueError("You must provide either a Cube file or a CHGCAR.")
if cube_filename and chgcar_filename:
raise ValueError("Cannot parse cube and CHGCAR at the same time.")

# Flag for parse atomic partitioned charge density
self.parse_atomic_densities = parse_atomic_densities

with ScratchDir("."):
if chgcar_filename:
fpath = chgcar_fpath = temp_decompress(chgcar_filename)
filepath = chgcar_fpath = temp_decompress(chgcar_filename)

self.chgcar = Chgcar.from_file(chgcar_fpath)
self.structure = self.chgcar.structure
Expand All @@ -150,20 +148,19 @@ def temp_decompress(file: str | Path, target_dir: str = ".") -> str:
else []
)

# Parse from cube file
# Parse from Cube file
else:
fpath = cube_fpath = temp_decompress(cube_filename)
filepath = cube_fpath = temp_decompress(cube_filename)

self.cube = VolumetricData.from_cube(cube_fpath)
self.structure = self.cube.structure
self.nelects = []

# Compile CHGCAR reference file
# prepare CHGCAR file
DanielYang59 marked this conversation as resolved.
Show resolved Hide resolved
chgref_fpath = temp_decompress(chgref_filename)
self.reference_used = bool(chgref_filename)

# Compile Bader args
bader_args = [bader_path, fpath]
bader_args = [bader_path, filepath]

if self.reference_used:
bader_args += ["-ref", chgref_fpath]
Expand All @@ -180,9 +177,8 @@ def temp_decompress(file: str | Path, target_dir: str = ".") -> str:
stdout, stderr = proc.communicate()
if proc.returncode != 0:
raise RuntimeError(
f"{bader_path} exit code: {proc.returncode}, "
f"error message: {stderr!s}.\nstdout: {stdout!s}"
"Please check your bader installation."
f"{bader_path} exit code: {proc.returncode}, error: {stderr!s}."
f"\nstdout: {stdout!s}. Please check your bader installation."
)

# Determine Bader version
Expand Down Expand Up @@ -458,7 +454,7 @@ def from_path(cls, path: str, suffix: str = "") -> BaderAnalysis:
def _get_filepath(filename):
name_pattern = f"{filename}{suffix}*" if filename != "POTCAR" else f"{filename}*"
paths = glob(f"{path}/{name_pattern}")
fpath = ""
filepath = ""
if len(paths) >= 1:
# using reverse=True because, if multiple files are present, they likely
# have suffixes 'static', 'relax', 'relax2', etc. and this would give
Expand All @@ -467,15 +463,15 @@ def _get_filepath(filename):
paths.sort(reverse=True)
if len(paths) > 1:
warnings.warn(f"Multiple files detected, using {paths[0]}")
fpath = paths[0]
filepath = paths[0]
else:
msg = f"Could not find {filename!r}"
if filename in ("AECCAR0", "AECCAR2"):
msg += ", interpret Bader results with severe caution."
elif filename == "POTCAR":
msg += ", cannot calculate charge transfer."
warnings.warn(msg)
return fpath
return filepath

chgcar_filename = _get_filepath("CHGCAR")
if chgcar_filename is None:
Expand Down Expand Up @@ -531,8 +527,9 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:
return paths[0]

chgcar_path = _get_filepath("CHGCAR", "Could not find CHGCAR!")
if chgcar_path is not None:
chgcar = Chgcar.from_file(chgcar_path)
if chgcar_path is None:
raise FileNotFoundError("Could not find CHGCAR!")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _get_filepath method already issued a warning in line 529, maybe this is not necessary?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i guess Chgcar.from_file(chgcar_path) will already raise if the file is missing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe no... By Chgcar.from_file

@classmethod
def from_file(cls, filename: str):
"""Read a CHGCAR file.
Args:
filename (str): Path to CHGCAR file.
Returns:
Chgcar
"""
poscar, data, data_aug = VolumetricData.parse_file(filename)
return cls(poscar, data, data_aug=data_aug)

And then VolumetricData.parse_file:

@staticmethod
def parse_file(filename: str) -> tuple[Poscar, dict, dict]:
"""
Convenience method to parse a generic volumetric data file in the vasp
like format. Used by subclasses for parsing file.
Args:
filename (str): Path of file to parse
Returns:
tuple[Poscar, dict, dict]: Poscar object, data dict, data_aug dict
"""
poscar_read = False
poscar_string: list[str] = []
dataset: np.ndarray = np.zeros((1, 1, 1))
all_dataset: list[np.ndarray] = []
# for holding any strings in input that are not Poscar
# or VolumetricData (typically augmentation charges)
all_dataset_aug: dict[int, list[str]] = {}
dim: list[int] = []
dimline = ""
read_dataset = False
ngrid_pts = 0
data_count = 0
poscar = None
with zopen(filename, mode="rt") as file:
for line in file:
original_line = line
line = line.strip()
if read_dataset:
for tok in line.split():
if data_count < ngrid_pts:
# This complicated procedure is necessary because
# vasp outputs x as the fastest index, followed by y
# then z.
no_x = data_count // dim[0]
dataset[data_count % dim[0], no_x % dim[1], no_x // dim[1]] = float(tok)
data_count += 1
if data_count >= ngrid_pts:
read_dataset = False
data_count = 0
all_dataset.append(dataset)
elif not poscar_read:
if line != "" or len(poscar_string) == 0:
poscar_string.append(line)
elif line == "":
poscar = Poscar.from_str("\n".join(poscar_string))
poscar_read = True
elif not dim:
dim = [int(i) for i in line.split()]
ngrid_pts = dim[0] * dim[1] * dim[2]
dimline = line
read_dataset = True
dataset = np.zeros(dim)
elif line == dimline:
# when line == dimline, expect volumetric data to follow
# so set read_dataset to True
read_dataset = True
dataset = np.zeros(dim)
else:
# store any extra lines that were not part of the
# volumetric data so we know which set of data the extra
# lines are associated with
key = len(all_dataset) - 1
if key not in all_dataset_aug:
all_dataset_aug[key] = []
all_dataset_aug[key].append(original_line)
if len(all_dataset) == 4:
data = {
"total": all_dataset[0],
"diff_x": all_dataset[1],
"diff_y": all_dataset[2],
"diff_z": all_dataset[3],
}
data_aug = {
"total": all_dataset_aug.get(0),
"diff_x": all_dataset_aug.get(1),
"diff_y": all_dataset_aug.get(2),
"diff_z": all_dataset_aug.get(3),
}
# construct a "diff" dict for scalar-like magnetization density,
# referenced to an arbitrary direction (using same method as
# pymatgen.electronic_structure.core.Magmom, see
# Magmom documentation for justification for this)
# TODO: re-examine this, and also similar behavior in
# Magmom - @mkhorton
# TODO: does CHGCAR change with different SAXIS?
diff_xyz = np.array([data["diff_x"], data["diff_y"], data["diff_z"]])
diff_xyz = diff_xyz.reshape((3, dim[0] * dim[1] * dim[2]))
ref_direction = np.array([1.01, 1.02, 1.03])
ref_sign = np.sign(np.dot(ref_direction, diff_xyz))
diff = np.multiply(np.linalg.norm(diff_xyz, axis=0), ref_sign)
data["diff"] = diff.reshape((dim[0], dim[1], dim[2]))
elif len(all_dataset) == 2:
data = {"total": all_dataset[0], "diff": all_dataset[1]}
data_aug = {
"total": all_dataset_aug.get(0),
"diff": all_dataset_aug.get(1),
}
else:
data = {"total": all_dataset[0]}
data_aug = {"total": all_dataset_aug.get(0)}
return poscar, data, data_aug # type: ignore[return-value]

Thenzopen: https://github.com/materialsvirtuallab/monty/blob/4d60fd4745f840354e7b3f48346eaf3cd68f2b35/monty/io.py#L19-L45

But certainly open would complain (maybe not so descriptive). But _get_filepath would issue a warning anyway, so maybe don't bother issure another here 😄

Copy link
Contributor Author

@DanielYang59 DanielYang59 Mar 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, I added the if chgcar_path is not None check to avoid mypy error, because _get_filepath could return None and trigger mypy incompatible type error:

error: Argument 1 to "from_file" of "Chgcar" has incompatible type "str | None"; expected "str"  [arg-type]

chgcar = Chgcar.from_file(chgcar_path)

aeccar0_path = _get_filepath("AECCAR0")
if not aeccar0_path:
Expand All @@ -554,9 +551,9 @@ def _get_filepath(filename: str, msg: str = "") -> str | None:

def bader_analysis_from_objects(
chgcar: Chgcar,
potcar: Potcar = None,
aeccar0: Chgcar = None,
aeccar2: Chgcar = None,
potcar: Potcar | None = None,
aeccar0: Chgcar | None = None,
aeccar2: Chgcar | None = None,
):
"""Convenience method to run Bader analysis from a set
of pymatgen Chgcar and Potcar objects.
Expand Down