src/pymatgen/io/common.py

"""Module for defining common data used and produced by atomistic simulation packages."""

from __future__ import annotations

import itertools
import json
import warnings
from copy import deepcopy
from typing import TYPE_CHECKING

import numpy as np
from monty.io import zopen
from monty.json import MSONable
from scipy.interpolate import RegularGridInterpolator

from pymatgen.core import Element, Site, Structure
from pymatgen.core.units import ang_to_bohr, bohr_to_angstrom
from pymatgen.electronic_structure.core import Spin

if TYPE_CHECKING:
    from pathlib import Path

    from typing_extensions import Self


class VolumetricData(MSONable):
    """
    Simple volumetric object. Used to read LOCPOT/CHGCAR files produced by
    vasp as well as cube files produced by other codes.

    Attributes:
        structure (Structure): Structure associated with the Volumetric Data object.
        is_spin_polarized (bool): True if run is spin polarized.
        dim (tuple): Tuple of dimensions of volumetric grid in each direction (nx, ny, nz).
        data (dict): Actual data as a dict of {string: np.array}. The keys are "total"
            and "diff", in accordance with the output format of VASP LOCPOT and
            CHGCAR files, where the total spin density is written first, followed
            by the difference spin density.
        ngridpts (int): Total number of grid points in volumetric data.
    """

    def __init__(
        self,
        structure: Structure,
        data: dict[str, np.ndarray],
        distance_matrix: np.ndarray | None = None,
        data_aug: np.ndarray | None = None,
    ) -> None:
        """
        Typically, this constructor is not used directly and the static
        from_file constructor is used. This constructor is designed to allow
        summation and other operations between VolumetricData objects.

        Args:
            structure (Structure): associated with the volumetric data
            data (dict[str, np.array]): Actual volumetric data.
            distance_matrix (np.array): A pre-computed distance matrix if available.
                Useful so pass distance_matrices between sums,
                short-circuiting an otherwise expensive operation.
            data_aug (np.array): Any extra information associated with volumetric data
                (typically augmentation charges)
        """
        self.structure = structure
        self.is_spin_polarized = len(data) >= 2
        self.is_soc = len(data) >= 4
        # convert data to numpy arrays in case they were jsanitized as lists
        self.data = {k: np.array(v) for k, v in data.items()}
        self.dim = self.data["total"].shape
        self.data_aug = data_aug or {}
        self.ngridpts = self.dim[0] * self.dim[1] * self.dim[2]
        # lazy init the spin data since this is not always needed.
        self._spin_data: dict[Spin, float] = {}
        self._distance_matrix = distance_matrix or {}
        self.xpoints = np.linspace(0.0, 1.0, num=self.dim[0])
        self.ypoints = np.linspace(0.0, 1.0, num=self.dim[1])
        self.zpoints = np.linspace(0.0, 1.0, num=self.dim[2])
        self.interpolator = RegularGridInterpolator(
            (self.xpoints, self.ypoints, self.zpoints),
            self.data["total"],
            bounds_error=True,
        )
        self.name = "VolumetricData"

    @property
    def spin_data(self):
        """The volumetric data split into per-spin channels, as {spin: data}.

        Converts the stored "total" and "diff" arrays into Spin.up and
        Spin.down data. When there is no "diff" entry (non-spin-polarized
        run) both channels are equal to half of the total. The result is
        computed on first access and cached afterwards.
        """
        if self._spin_data:
            return self._spin_data

        total = self.data["total"]
        diff = self.data.get("diff", 0)
        self._spin_data = {
            Spin.up: 0.5 * (total + diff),
            Spin.down: 0.5 * (total - diff),
        }
        return self._spin_data

    def get_axis_grid(self, ind):
        """Get the grid for a particular axis.

        Args:
            ind (int): Axis index.
        """
        ng = self.dim
        num_pts = ng[ind]
        lengths = self.structure.lattice.abc
        return [i / num_pts * lengths[ind] for i in range(num_pts)]

    def __add__(self, other):
        return self.linear_add(other, 1.0)

    def __sub__(self, other):
        return self.linear_add(other, -1.0)

    def copy(self) -> Self:
        """Make a copy of VolumetricData object."""
        return VolumetricData(
            self.structure,
            {k: v.copy() for k, v in self.data.items()},
            distance_matrix=self._distance_matrix,
            data_aug=self.data_aug,
        )

    def linear_add(self, other, scale_factor=1.0):
        """
        Method to do a linear sum of volumetric objects. Used by + and -
        operators as well. Returns a VolumetricData object containing the
        linear sum.

        Args:
            other (VolumetricData): Another VolumetricData object
            scale_factor (float): Factor to scale the other data by.

        Returns:
            VolumetricData corresponding to self + scale_factor * other.
        """
        if self.structure != other.structure:
            warnings.warn("Structures are different. Make sure you know what you are doing...")
        if list(self.data) != list(other.data):
            raise ValueError("Data have different keys! Maybe one is spin-polarized and the other is not?")

        # To add checks
        data = {}
        for k in self.data:
            data[k] = self.data[k] + scale_factor * other.data[k]

        new = deepcopy(self)
        new.data = data
        new.data_aug = {}
        return new

    def scale(self, factor):
        """Scale the data in place by a factor."""
        for k in self.data:
            self.data[k] = np.multiply(self.data[k], factor)

    def value_at(self, x, y, z):
        """Get a data value from self.data at a given point (x, y, z) in terms
        of fractional lattice parameters. Will be interpolated using a
        RegularGridInterpolator on self.data if (x, y, z) is not in the original
        set of data points.

        Args:
            x (float): Fraction of lattice vector a.
            y (float): Fraction of lattice vector b.
            z (float): Fraction of lattice vector c.

        Returns:
            Value from self.data (potentially interpolated) corresponding to
            the point (x, y, z).
        """
        return self.interpolator([x, y, z])[0]

    def linear_slice(self, p1, p2, n=100):
        """Get a linear slice of the volumetric data with n data points from
        point p1 to point p2, in the form of a list.

        Args:
            p1 (list): 3-element list containing fractional coordinates of the first point.
            p2 (list): 3-element list containing fractional coordinates of the second point.
            n (int): Number of data points to collect, defaults to 100.

        Returns:
            List of n data points (mostly interpolated) representing a linear slice of the
            data from point p1 to point p2.
        """
        if type(p1) not in {list, np.ndarray}:
            raise TypeError(f"type of p1 should be list or np.ndarray, got {type(p1).__name__}")
        if len(p1) != 3:
            raise ValueError(f"length of p1 should be 3, got {len(p1)}")
        if type(p2) not in {list, np.ndarray}:
            raise TypeError(f"type of p2 should be list or np.ndarray, got {type(p2).__name__}")
        if len(p2) != 3:
            raise ValueError(f"length of p2 should be 3, got {len(p2)}")

        x_pts = np.linspace(p1[0], p2[0], num=n)
        y_pts = np.linspace(p1[1], p2[1], num=n)
        z_pts = np.linspace(p1[2], p2[2], num=n)
        return [self.value_at(x_pts[i], y_pts[i], z_pts[i]) for i in range(n)]

    def get_integrated_diff(self, ind, radius, nbins=1):
        """Get integrated difference of atom index ind up to radius. This can be
        an extremely computationally intensive process, depending on how many
        grid points are in the VolumetricData.

        The "diff" data of all grid points found within ``radius`` of the atom
        is histogrammed by distance, accumulated bin by bin, and divided by the
        total number of grid points. The list of grid points around an atom is
        cached in ``self._distance_matrix`` and reused as long as the cached
        radius is at least as large as the requested one.

        Args:
            ind (int): Index of atom.
            radius (float): Radius of integration.
            nbins (int): Number of bins. Defaults to 1. This allows one to
                obtain the charge integration up to a list of the cumulative
                charge integration values for radii for [radius/nbins,
                2 * radius/nbins, ....].

        Returns:
            Differential integrated charge as a np array of [[radius, value],
            ...]. Format is for ease of plotting. e.g. plt.plot(data[:,0],
            data[:,1])
        """
        # For non-spin-polarized runs, this is zero by definition.
        if not self.is_spin_polarized:
            radii = [radius / nbins * (i + 1) for i in range(nbins)]
            data = np.zeros((nbins, 2))
            data[:, 0] = radii
            return data

        struct = self.structure
        a = self.dim
        # (Re)build the cache entry if this atom has not been processed yet or
        # was processed with a smaller radius than the one requested now.
        if ind not in self._distance_matrix or self._distance_matrix[ind]["max_radius"] < radius:
            # Fractional coordinates of every grid point: (i/nx, j/ny, k/nz).
            coords = []
            for x, y, z in itertools.product(*(list(range(i)) for i in a)):
                coords.append([x / a[0], y / a[1], z / a[2]])
            sites_dist = struct.lattice.get_points_in_sphere(coords, struct[ind].coords, radius)
            # NOTE(review): the code below reads column 0 of each row as the
            # point's fractional coordinates and column 1 as its distance;
            # confirm against get_points_in_sphere's return format.
            self._distance_matrix[ind] = {
                "max_radius": radius,
                "data": np.array(sites_dist, dtype=object),
            }

        data = self._distance_matrix[ind]["data"]

        # Use boolean indexing to find all charges within the desired distance.
        # (The cache may hold points from a larger radius than requested.)
        inds = data[:, 1] <= radius
        dists = data[inds, 1]
        # Wrap the fractional coordinates into the unit cell, scale them by the
        # grid dimensions and round to the nearest integer grid index.
        # NOTE(review): a wrapped coordinate just below 1 could round up to an
        # index equal to the grid dimension — verify this cannot occur.
        data_inds = np.rint(np.mod(list(data[inds, 0]), 1) * np.tile(a, (len(dists), 1))).astype(int)
        vals = [self.data["diff"][x, y, z] for x, y, z in data_inds]

        # Distance histogram weighted by the "diff" value at each grid point.
        hist, edges = np.histogram(dists, bins=nbins, range=[0, radius], weights=vals)
        data = np.zeros((nbins, 2))
        # Column 0: upper edge of each bin; column 1: running sum of the
        # histogram up to and including that bin, divided by ngridpts.
        data[:, 0] = edges[1:]
        data[:, 1] = [sum(hist[0 : i + 1]) / self.ngridpts for i in range(nbins)]
        return data

    def get_average_along_axis(self, ind):
        """Get the averaged total of the volumetric data a certain axis direction.
        For example, useful for visualizing Hartree Potentials from a LOCPOT
        file.

        Args:
            ind (int): Index of axis.

        Returns:
            Average total along axis
        """
        total_spin_dens = self.data["total"]
        ng = self.dim
        if ind == 0:
            total = np.sum(np.sum(total_spin_dens, axis=1), 1)
        elif ind == 1:
            total = np.sum(np.sum(total_spin_dens, axis=0), 1)
        else:
            total = np.sum(np.sum(total_spin_dens, axis=0), 0)
        return total / ng[(ind + 1) % 3] / ng[(ind + 2) % 3]

    def to_hdf5(self, filename):
        """Write the VolumetricData to an HDF5 file.

        The content is laid out in the file ``f`` as follows:

        VolumetricData.data -> f["vdata"] (one dataset per data key)
        VolumetricData.structure ->
            f["lattice"]: Lattice matrix (3x3)
            f["Z"]: Sequence of atomic numbers
            f["fcoords"]: Fractional coords
            f["species"]: String form of each species
            f.attrs["structure_json"]: String of JSON representation
        VolumetricData.name -> f.attrs["name"]

        Args:
            filename (str): Filename to output to.
        """
        import h5py

        with h5py.File(filename, mode="w") as h5_file:
            struct = self.structure
            species = struct.species
            per_site_shape = (len(species),)

            lattice_ds = h5_file.create_dataset("lattice", (3, 3), dtype="float")
            lattice_ds[...] = struct.lattice.matrix

            z_ds = h5_file.create_dataset("Z", per_site_shape, dtype="i")
            z_ds[...] = np.array([specie.Z for specie in species])

            frac_coords = struct.frac_coords
            fcoords_ds = h5_file.create_dataset("fcoords", frac_coords.shape, dtype="float")
            fcoords_ds[...] = frac_coords

            # Variable-length string dtype for the species labels.
            str_dtype = h5py.special_dtype(vlen=str)
            species_ds = h5_file.create_dataset("species", per_site_shape, dtype=str_dtype)
            species_ds[...] = [str(specie) for specie in species]

            vdata_group = h5_file.create_group("vdata")
            for key, values in self.data.items():
                values_ds = vdata_group.create_dataset(key, values.shape, dtype="float")
                values_ds[...] = values

            h5_file.attrs["name"] = self.name
            h5_file.attrs["structure_json"] = json.dumps(struct.as_dict())

    @classmethod
    def from_hdf5(cls, filename: str, **kwargs) -> Self:
        """
        Build an instance from an HDF5 file.

        Reads the "vdata" group (and the "vdata_aug" group when present) into
        dicts of arrays and the structure from the "structure_json" attribute.

        Args:
            filename: Filename
            **kwargs: Passed through to the class constructor.

        Returns:
            VolumetricData
        """
        import h5py

        with h5py.File(filename, mode="r") as h5_file:

            def load_group(group_name):
                # Materialize every dataset of the group as a numpy array.
                return {key: np.array(dataset) for key, dataset in h5_file[group_name].items()}

            volumetric = load_group("vdata")
            augmentation = load_group("vdata_aug") if "vdata_aug" in h5_file else None
            structure_dict = json.loads(h5_file.attrs["structure_json"])
            structure = Structure.from_dict(structure_dict)
            return cls(structure, data=volumetric, data_aug=augmentation, **kwargs)

    def to_cube(self, filename, comment: str = ""):
        """Write the "total" volumetric data as a cube file: two comment lines,
        a header section defining the structure IN BOHR, and the data.

        Args:
            filename (str): Name of the cube file to be written.
            comment (str): If provided, this will be added to the second comment line
        """
        with zopen(filename, mode="wt") as cube:
            struct = self.structure

            # Two comment lines, then the atom count and the origin.
            header = [
                f"# Cube file for {struct.formula} generated by Pymatgen\n",
                f"# {comment}\n",
                f"\t {len(struct)} 0.000000 0.000000 0.000000\n",
            ]
            cube.write("".join(header))

            # One line per axis: grid points along it and the voxel vector.
            for axis in range(3):
                n_points = self.dim[axis]
                step = struct.lattice.matrix[axis] / n_points * ang_to_bohr
                cube.write(f"\t {n_points} {step[0]:.6f} {step[1]:.6f} {step[2]:.6f}\n")

            # One line per atom: atomic number, a fixed 0.000000 and the
            # Cartesian coordinates multiplied by ang_to_bohr.
            for site in struct:
                atomic_number = Element(site.species_string).Z
                position = " ".join(f"{ang_to_bohr * site.coords[axis]}" for axis in range(3))
                cube.write(f"\t {atomic_number} 0.000000 {position} \n")

            def data_tokens():
                # Six values per line; positive values get a leading space.
                for count, value in enumerate(self.data["total"].flatten(), start=1):
                    padding = " " if value > 0 else ""
                    yield f"{padding}{value:.6e} "
                    if count % 6 == 0:
                        yield "\n"

            cube.writelines(data_tokens())

    @classmethod
    def from_cube(cls, filename: str | Path) -> Self:
        """
        Initialize the cube object and store the data as data.

        The header values (voxel vectors and atom positions) are multiplied by
        bohr_to_angstrom while reading. The lattice is built as each voxel
        vector times the number of voxels along that axis, and the volumetric
        values are stored under the "total" key.

        Args:
            filename (str): of the cube to read

        Returns:
            An instance holding the structure and the "total" data grid.
        """
        # Use a context manager so the file handle is closed even if parsing
        # fails part-way through.
        with zopen(filename, mode="rt") as file:
            # skip header lines
            file.readline()
            file.readline()

            # number of atoms followed by the position of the origin of the volumetric data
            n_atoms = int(file.readline().split()[0])

            # The number of voxels along each axis (x, y, z) followed by the axis vector.
            voxel_counts = []
            voxel_vectors = []
            for _ in range(3):
                tokens = file.readline().split()
                voxel_counts.append(int(tokens[0]))
                voxel_vectors.append(np.array([bohr_to_angstrom * float(val) for val in tokens[1:]]))

            # The last section in the header is one line for each atom consisting of 5 numbers,
            # the first is the atom number, second is charge,
            # the last three are the x,y,z coordinates of the atom center.
            sites = []
            for _ in range(n_atoms):
                tokens = file.readline().split()
                sites.append(Site(tokens[0], np.multiply(bohr_to_angstrom, list(map(float, tokens[2:])))))

            # Everything after the header is the volumetric data.
            raw_values = file.read().split()

        structure = Structure(
            lattice=[vector * count for vector, count in zip(voxel_vectors, voxel_counts)],
            species=[s.specie for s in sites],
            coords=[s.coords for s in sites],
            coords_are_cartesian=True,
        )

        # Volumetric data
        data = np.reshape(
            np.array(raw_values).astype(float),
            tuple(voxel_counts),
        )
        return cls(structure=structure, data={"total": data})