Feat: Add DOSnet training in PT (#3486)
This is a follow-up PR on #3452

- [x] Add DOS loss
- [x] Fix stat calculation
- [x] Add UT on training
- [x] Add e2e JIT test
- [x] Fix dp test data shape

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: anyangml <[email protected]>
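For context, a hedged sketch of how the new loss would be selected in a training input; the key names mirror the DOSLoss constructor in the diff below, while the "type" value and exact schema are assumptions, not confirmed by this commit:

# Sketch of a loss section in the training configuration (assumed schema).
loss_config = {
    "type": "dos",
    "start_pref_dos": 1.0,
    "limit_pref_dos": 1.0,
    "start_pref_cdf": 1000.0,
    "limit_pref_cdf": 1.0,
    "start_pref_ados": 0.0,
    "limit_pref_ados": 0.0,
    "start_pref_acdf": 0.0,
    "limit_pref_acdf": 0.0,
}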
1 parent a58dbc6 · commit 48f06fe · 33 changed files with 546 additions and 12 deletions
@@ -0,0 +1,256 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
from typing import (
    List,
)

import torch

from deepmd.pt.loss.loss import (
    TaskLoss,
)
from deepmd.pt.utils import (
    env,
)
from deepmd.utils.data import (
    DataRequirementItem,
)

class DOSLoss(TaskLoss):
    def __init__(
        self,
        starter_learning_rate: float,
        numb_dos: int,
        start_pref_dos: float = 1.00,
        limit_pref_dos: float = 1.00,
        start_pref_cdf: float = 1000,
        limit_pref_cdf: float = 1.00,
        start_pref_ados: float = 0.0,
        limit_pref_ados: float = 0.0,
        start_pref_acdf: float = 0.0,
        limit_pref_acdf: float = 0.0,
        inference=False,
        **kwargs,
    ):
r"""Construct a loss for local and global tensors. | ||
Parameters | ||
---------- | ||
tensor_name : str | ||
The name of the tensor in the model predictions to compute the loss. | ||
tensor_size : int | ||
The size (dimension) of the tensor. | ||
label_name : str | ||
The name of the tensor in the labels to compute the loss. | ||
pref_atomic : float | ||
The prefactor of the weight of atomic loss. It should be larger than or equal to 0. | ||
pref : float | ||
The prefactor of the weight of global loss. It should be larger than or equal to 0. | ||
inference : bool | ||
If true, it will output all losses found in output, ignoring the pre-factors. | ||
**kwargs | ||
Other keyword arguments. | ||
""" | ||
        super().__init__()
        self.starter_learning_rate = starter_learning_rate
        self.numb_dos = numb_dos
        self.inference = inference

        self.start_pref_dos = start_pref_dos
        self.limit_pref_dos = limit_pref_dos
        self.start_pref_cdf = start_pref_cdf
        self.limit_pref_cdf = limit_pref_cdf

        self.start_pref_ados = start_pref_ados
        self.limit_pref_ados = limit_pref_ados
        self.start_pref_acdf = start_pref_acdf
        self.limit_pref_acdf = limit_pref_acdf

        assert (
            self.start_pref_dos >= 0.0
            and self.limit_pref_dos >= 0.0
            and self.start_pref_cdf >= 0.0
            and self.limit_pref_cdf >= 0.0
            and self.start_pref_ados >= 0.0
            and self.limit_pref_ados >= 0.0
            and self.start_pref_acdf >= 0.0
            and self.limit_pref_acdf >= 0.0
        ), "Cannot assign a negative weight to any loss prefactor"

        self.has_dos = (start_pref_dos != 0.0 and limit_pref_dos != 0.0) or inference
        self.has_cdf = (start_pref_cdf != 0.0 and limit_pref_cdf != 0.0) or inference
        self.has_ados = (start_pref_ados != 0.0 and limit_pref_ados != 0.0) or inference
        self.has_acdf = (start_pref_acdf != 0.0 and limit_pref_acdf != 0.0) or inference

        assert (
            self.has_dos or self.has_cdf or self.has_ados or self.has_acdf
        ), "Cannot assign zero weight to every loss prefactor"

    def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False):
        """Return loss on the DOS and its cumulative distribution.

        Parameters
        ----------
        input_dict : dict[str, torch.Tensor]
            Model inputs.
        model : torch.nn.Module
            Model to be used to output the predictions.
        label : dict[str, torch.Tensor]
            Labels.
        natoms : int
            The local atom number.
        learning_rate : float
            The current learning rate, used to interpolate the loss prefactors.

        Returns
        -------
        model_pred: dict[str, torch.Tensor]
            Model predictions.
        loss: torch.Tensor
            Loss for model to minimize.
        more_loss: dict[str, torch.Tensor]
            Other losses for display.
        """
        model_pred = model(**input_dict)

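        # The prefactors are interpolated linearly in the learning rate:
        # at the starter learning rate coef = 1 and the start_pref_* values
        # apply; as the learning rate decays towards zero, the prefactors
        # approach their limit_pref_* values.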
        coef = learning_rate / self.starter_learning_rate
        pref_dos = (
            self.limit_pref_dos + (self.start_pref_dos - self.limit_pref_dos) * coef
        )
        pref_cdf = (
            self.limit_pref_cdf + (self.start_pref_cdf - self.limit_pref_cdf) * coef
        )
        pref_ados = (
            self.limit_pref_ados + (self.start_pref_ados - self.limit_pref_ados) * coef
        )
        pref_acdf = (
            self.limit_pref_acdf + (self.start_pref_acdf - self.limit_pref_acdf) * coef
        )

        loss = torch.zeros(1, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE)[0]
        more_loss = {}
        if self.has_ados and "atom_dos" in model_pred and "atom_dos" in label:
            find_local = label.get("find_atom_dos", 0.0)
            pref_ados = pref_ados * find_local
            local_tensor_pred_dos = model_pred["atom_dos"].reshape(
                [-1, natoms, self.numb_dos]
            )
            local_tensor_label_dos = label["atom_dos"].reshape(
                [-1, natoms, self.numb_dos]
            )
            diff = (local_tensor_pred_dos - local_tensor_label_dos).reshape(
                [-1, self.numb_dos]
            )
            if "mask" in model_pred:
                diff = diff[model_pred["mask"].reshape([-1]).bool()]
            l2_local_loss_dos = torch.mean(torch.square(diff))
            if not self.inference:
                more_loss["l2_local_dos_loss"] = self.display_if_exist(
                    l2_local_loss_dos.detach(), find_local
                )
            loss += pref_ados * l2_local_loss_dos
            rmse_local_dos = l2_local_loss_dos.sqrt()
            more_loss["rmse_local_dos"] = self.display_if_exist(
                rmse_local_dos.detach(), find_local
            )
        if self.has_acdf and "atom_dos" in model_pred and "atom_dos" in label:
            find_local = label.get("find_atom_dos", 0.0)
            pref_acdf = pref_acdf * find_local
            local_tensor_pred_cdf = torch.cumsum(
                model_pred["atom_dos"].reshape([-1, natoms, self.numb_dos]), dim=-1
            )
            local_tensor_label_cdf = torch.cumsum(
                label["atom_dos"].reshape([-1, natoms, self.numb_dos]), dim=-1
            )
            diff = (local_tensor_pred_cdf - local_tensor_label_cdf).reshape(
                [-1, self.numb_dos]
            )
            if "mask" in model_pred:
                diff = diff[model_pred["mask"].reshape([-1]).bool()]
            l2_local_loss_cdf = torch.mean(torch.square(diff))
            if not self.inference:
                more_loss["l2_local_cdf_loss"] = self.display_if_exist(
                    l2_local_loss_cdf.detach(), find_local
                )
            loss += pref_acdf * l2_local_loss_cdf
            rmse_local_cdf = l2_local_loss_cdf.sqrt()
            more_loss["rmse_local_cdf"] = self.display_if_exist(
                rmse_local_cdf.detach(), find_local
            )
        if self.has_dos and "dos" in model_pred and "dos" in label:
            find_global = label.get("find_dos", 0.0)
            pref_dos = pref_dos * find_global
            global_tensor_pred_dos = model_pred["dos"].reshape([-1, self.numb_dos])
            global_tensor_label_dos = label["dos"].reshape([-1, self.numb_dos])
            diff = global_tensor_pred_dos - global_tensor_label_dos
            if "mask" in model_pred:
                # Weight each frame by its number of real atoms so that
                # padded frames do not dilute the global loss.
                atom_num = model_pred["mask"].sum(-1, keepdim=True)
                l2_global_loss_dos = torch.mean(
                    torch.sum(torch.square(diff) * atom_num, dim=0) / atom_num.sum()
                )
                atom_num = torch.mean(atom_num.float())
            else:
                atom_num = natoms
                l2_global_loss_dos = torch.mean(torch.square(diff))
            if not self.inference:
                more_loss["l2_global_dos_loss"] = self.display_if_exist(
                    l2_global_loss_dos.detach(), find_global
                )
            loss += pref_dos * l2_global_loss_dos
            rmse_global_dos = l2_global_loss_dos.sqrt() / atom_num
            more_loss["rmse_global_dos"] = self.display_if_exist(
                rmse_global_dos.detach(), find_global
            )
        if self.has_cdf and "dos" in model_pred and "dos" in label:
            find_global = label.get("find_dos", 0.0)
            pref_cdf = pref_cdf * find_global
            global_tensor_pred_cdf = torch.cumsum(
                model_pred["dos"].reshape([-1, self.numb_dos]), dim=-1
            )
            global_tensor_label_cdf = torch.cumsum(
                label["dos"].reshape([-1, self.numb_dos]), dim=-1
            )
            diff = global_tensor_pred_cdf - global_tensor_label_cdf
            if "mask" in model_pred:
                atom_num = model_pred["mask"].sum(-1, keepdim=True)
                l2_global_loss_cdf = torch.mean(
                    torch.sum(torch.square(diff) * atom_num, dim=0) / atom_num.sum()
                )
                atom_num = torch.mean(atom_num.float())
            else:
                atom_num = natoms
                l2_global_loss_cdf = torch.mean(torch.square(diff))
            if not self.inference:
                more_loss["l2_global_cdf_loss"] = self.display_if_exist(
                    l2_global_loss_cdf.detach(), find_global
                )
            loss += pref_cdf * l2_global_loss_cdf
            rmse_global_cdf = l2_global_loss_cdf.sqrt() / atom_num
            more_loss["rmse_global_cdf"] = self.display_if_exist(
                rmse_global_cdf.detach(), find_global
            )
        return model_pred, loss, more_loss

    @property
    def label_requirement(self) -> List[DataRequirementItem]:
        """Return data label requirements needed for this loss calculation."""
        label_requirement = []
        if self.has_ados or self.has_acdf:
            label_requirement.append(
                DataRequirementItem(
                    "atom_dos",
                    ndof=self.numb_dos,
                    atomic=True,
                    must=False,
                    high_prec=False,
                )
            )
        if self.has_dos or self.has_cdf:
            label_requirement.append(
                DataRequirementItem(
                    "dos",
                    ndof=self.numb_dos,
                    atomic=False,
                    must=False,
                    high_prec=False,
                )
            )
        return label_requirement
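A minimal usage sketch, not part of the commit: it constructs the loss and inspects the labels it requests. The import path and the DataRequirementItem attribute names are assumptions based on the code above.

# Usage sketch (assumed import path; adjust to the package layout).
from deepmd.pt.loss import DOSLoss

loss_fn = DOSLoss(
    starter_learning_rate=1e-3,
    numb_dos=250,  # number of DOS grid points; must match the fitting net
)

# Both requirements carry ndof=numb_dos values, per atom for "atom_dos"
# and per frame for "dos".
for item in loss_fn.label_requirement:
    print(item.key, item.ndof, item.atomic)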