fixed multiple bugs:

- missing pairs symmetrisation when associating to matrices - inverted sigma/eps order - removed overwriting of oxygen pairs - added missing LJ14 from topology - fixed distance estimate including e_0 - added default epsilon in line with older version - contact were removed too early *this needs to be reimplemented*
carlocamilloni · Nov 25, 2024 · 114f36f · 114f36f
1 parent fa8d16d
commit 114f36f
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 47 deletions.
diff --git a/multiego.py b/multiego.py
@@ -90,9 +90,6 @@ def meGO_parsing():
     if args.multi_epsilon_intra or args.multi_epsilon_inter_domain or args.multi_epsilon_inter:
         multi_flag = True
 
-    # mego_topology = pmd.load_file(f"{args.root_dir}/inputs/{args.system}/topol.top")
-    # topol_names = [m for m in mego_topology.molecules]
-
     custom_dict = {}
     if args.custom_dict:
         custom_dict = parse_json(args.custom_dict)
@@ -185,16 +182,16 @@ def main():
 
     if not args.no_header:
         generate_face.print_welcome()
-    print(f"Multi-eGO: {args.egos}\n")
-
+    print(f"Running Multi-eGO: {args.egos}\n")
+    print("- Processing Multi-eGO topology")
+    st = time.time()
+    elapsed_time = st - bt
+    print("- Done in:", elapsed_time, "seconds")
     print("- Checking for input files and folders")
     io.check_files_existence(args)
     if args.egos == "production":
         io.check_matrix_format(args)
 
-    print("- Processing Multi-eGO topology")
-    st = time.time()
-    # meGO_ensembles = ensemble.init_meGO_ensemble(args, custom_dict)
     print("\t- Generating bonded interactions")
     meGO_ensembles = ensemble.generate_bonded_interactions(meGO_ensembles)
     print("\t- Generating 1-4 data")

diff --git a/src/multiego/ensemble.py b/src/multiego/ensemble.py
@@ -365,12 +365,20 @@ def init_meGO_matrices(ensemble, args, custom_dict):
             warnings.simplefilter("ignore")
             topol = parmed.load_file(topology_path)
 
+        # these are the atom type c6_i,c12_j
         lj_data = topology.get_lj_params(topol)
+        # these are the combined cases (c6_ij, c12_ij)
         lj_pairs = topology.get_lj_pairs(topol)
+        # these are the combined cases in the [pairs] section (c6_ij, c12_ij)
+        lj14_pairs = topology.get_lj14_pairs(topol)
         lj_data_dict = {str(key): val for key, val in zip(lj_data["ai"], lj_data[["c6", "c12"]].values)}
         lj_pairs_dict = {
             (ai, aj): (epsilon, sigma) for ai, aj, epsilon, sigma in lj_pairs[["ai", "aj", "epsilon", "sigma"]].to_numpy()
         }
+        lj14_pairs_dict = {
+            (ai, aj): (epsilon, sigma) for ai, aj, epsilon, sigma in lj14_pairs[["ai", "aj", "epsilon", "sigma"]].to_numpy()
+        }
+
         ensemble["topology_dataframe"]["c6"] = lj_data["c6"].to_numpy()
         ensemble["topology_dataframe"]["c12"] = lj_data["c12"].to_numpy()
 
@@ -404,8 +412,8 @@ def init_meGO_matrices(ensemble, args, custom_dict):
             reference_contact_matrices[name]["sigma_prior"] = np.where(
                 reference_contact_matrices[name]["c6"] > 0,
                 (reference_contact_matrices[name]["c12"] / reference_contact_matrices[name]["c6"]) ** (1 / 6),
-                # reference_contact_matrices[name]["rc_cutoff"]/ (2.0 ** (1.0 / 6.0)),
-                reference_contact_matrices[name]["c12"] ** (1 / 12) / (2.0 ** (1.0 / 6.0)),
+                reference_contact_matrices[name]["c12"] ** (1 / 12)
+                / (2.0 ** (1.0 / 6.0)),  # at this point this still needs to be divided by epsilon_0
             )
             reference_contact_matrices[name]["epsilon_prior"] = np.where(
                 reference_contact_matrices[name]["c6"] > 0,
@@ -418,6 +426,19 @@ def init_meGO_matrices(ensemble, args, custom_dict):
                 if (row["rc_ai"], row["rc_aj"]) in lj_pairs_dict.keys():
                     reference_contact_matrices[name].loc[i, "epsilon_prior"] = lj_pairs_dict[(row["rc_ai"], row["rc_aj"])][0]
                     reference_contact_matrices[name].loc[i, "sigma_prior"] = lj_pairs_dict[(row["rc_ai"], row["rc_aj"])][1]
+                if (row["rc_aj"], row["rc_ai"]) in lj_pairs_dict.keys():
+                    reference_contact_matrices[name].loc[i, "epsilon_prior"] = lj_pairs_dict[(row["rc_aj"], row["rc_ai"])][0]
+                    reference_contact_matrices[name].loc[i, "sigma_prior"] = lj_pairs_dict[(row["rc_aj"], row["rc_ai"])][1]
+
+            # apply the LJ14 pairs (only if same_chain = True)
+            for i, row in reference_contact_matrices[name].iterrows():
+                if row["rc_same_chain"] and (row["rc_ai"], row["rc_aj"]) in lj14_pairs_dict.keys():
+                    reference_contact_matrices[name].loc[i, "epsilon_prior"] = lj14_pairs_dict[(row["rc_ai"], row["rc_aj"])][0]
+                    reference_contact_matrices[name].loc[i, "sigma_prior"] = lj14_pairs_dict[(row["rc_ai"], row["rc_aj"])][1]
+                if row["rc_same_chain"] and (row["rc_aj"], row["rc_ai"]) in lj14_pairs_dict.keys():
+                    reference_contact_matrices[name].loc[i, "epsilon_prior"] = lj14_pairs_dict[(row["rc_aj"], row["rc_ai"])][0]
+                    reference_contact_matrices[name].loc[i, "sigma_prior"] = lj14_pairs_dict[(row["rc_aj"], row["rc_ai"])][1]
+
             reference_contact_matrices[name].drop(columns=["c6_i", "c6_j", "c12_i", "c12_j", "c6", "c12"], inplace=True)
 
         et = time.time()
@@ -778,8 +799,8 @@ def init_LJ_datasets(meGO_ensemble, matrices, pairs14, exclusion_bonds14, args):
         ),
     )
     train_dataset["rep"] = train_dataset["rep"].fillna(pd.Series(pairwise_c12))
-    train_dataset.loc[oxygen_mask.flatten(), "epsilon_prior"] = 0
-    train_dataset.loc[oxygen_mask.flatten(), "sigma_prior"] = train_dataset["rep"] ** (1.0 / 12.0) / (2.0 ** (1.0 / 6.0))
+    # train_dataset.loc[oxygen_mask.flatten(), "epsilon_prior"] = 0
+    # train_dataset.loc[oxygen_mask.flatten(), "sigma_prior"] = train_dataset["rep"] ** (1.0 / 12.0) / (2.0 ** (1.0 / 6.0))
 
     return train_dataset
 
@@ -985,37 +1006,25 @@ def set_sig_epsilon(meGO_LJ, needed_fields, parameters):
     consistent with the given probability and distance thresholds, maintaining the accuracy of simulations or calculations.
     """
     # when distance estimates are poor we use the cutoff value
-    if not parameters.regtest:
-        meGO_LJ.loc[(meGO_LJ["probability"] <= meGO_LJ["md_threshold"]), "distance"] = meGO_LJ["sigma_prior"] * 2.0 ** (
-            1.0 / 6.0
-        )
-        meGO_LJ.loc[(meGO_LJ["rc_probability"] <= meGO_LJ["md_threshold"]), "rc_distance"] = meGO_LJ["sigma_prior"] * 2.0 ** (
-            1.0 / 6.0
-        )
-    else:
-        meGO_LJ.loc[(meGO_LJ["probability"] <= meGO_LJ["md_threshold"]), "distance"] = (
-            meGO_LJ["sigma_prior"] * 2.0 ** (1.0 / 6.0) / meGO_LJ["epsilon_0"] ** (1.0 / 12.0)
-        )
-        meGO_LJ.loc[(meGO_LJ["rc_probability"] <= meGO_LJ["md_threshold"]), "rc_distance"] = (
-            meGO_LJ["sigma_prior"] * 2.0 ** (1.0 / 6.0) / meGO_LJ["epsilon_0"] ** (1.0 / 12.0)
-        )
-
-    # by default epsilon is set to the prior epsilon
-    meGO_LJ["epsilon"] = meGO_LJ["epsilon_prior"]
-
-    # Epsilon is initialised to a rescaled C12
-    # This is always correct becasue distance is always well defined by either training data
-    # or using default C12 values
-    # negative epsilon are used to identify non-attractive interactions
+    # Update the "distance" column for rows in the mask
+    mask = meGO_LJ["probability"] <= meGO_LJ["md_threshold"]
+    meGO_LJ.loc[mask, "distance"] = np.where(
+        meGO_LJ.loc[mask, "epsilon_prior"] == 0,
+        meGO_LJ.loc[mask, "sigma_prior"] * 2.0 ** (1.0 / 6.0) / meGO_LJ.loc[mask, "epsilon_0"] ** (1.0 / 12.0),
+        meGO_LJ.loc[mask, "sigma_prior"] * 2.0 ** (1.0 / 6.0),
+    )
+    mask = meGO_LJ["rc_probability"] <= meGO_LJ["md_threshold"]
+    meGO_LJ.loc[mask, "rc_distance"] = np.where(
+        meGO_LJ.loc[mask, "epsilon_prior"] == 0,
+        meGO_LJ.loc[mask, "sigma_prior"] * 2.0 ** (1.0 / 6.0) / meGO_LJ.loc[mask, "epsilon_0"] ** (1.0 / 12.0),
+        meGO_LJ.loc[mask, "sigma_prior"] * 2.0 ** (1.0 / 6.0),
+    )
 
-    # # Correct version
-    if not parameters.regtest:
-        meGO_LJ.loc[meGO_LJ["probability"] > meGO_LJ["md_threshold"], "epsilon"] = (
-            -meGO_LJ["rep"] * (meGO_LJ["distance"] / meGO_LJ["rc_distance"]) ** 12
-        )
-    # # Needed for regtests
-    else:
-        meGO_LJ["epsilon"] = -meGO_LJ["rep"] * (meGO_LJ["distance"] / meGO_LJ["rc_distance"]) ** 12
+    meGO_LJ["epsilon"] = np.where(
+        meGO_LJ["epsilon_prior"] == 0,
+        -meGO_LJ["rep"] * (meGO_LJ["distance"] / meGO_LJ["rc_distance"]) ** 12,  # If epsilon_prior == 0
+        meGO_LJ["epsilon_prior"],  # Otherwise, set to epsilon_prior
+    )
 
     # Attractive interactions
     # These are defined only if the training probability is greater than MD_threshold and
@@ -1103,7 +1112,7 @@ def set_sig_epsilon(meGO_LJ, needed_fields, parameters):
     meGO_LJ.loc[:, "learned"] = 1
 
     # keep only epsilons different from the epsilon prior ones (avoid c6,c12 already defined by atomtypes)
-    meGO_LJ = meGO_LJ.loc[meGO_LJ["epsilon"] != meGO_LJ["epsilon_prior"]]
+    # meGO_LJ=meGO_LJ.loc[meGO_LJ["epsilon"]!=meGO_LJ["epsilon_prior"]]
 
     # keep only needed fields
     meGO_LJ = meGO_LJ[needed_fields]
@@ -1304,6 +1313,9 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
             & (meGO_LJ["1-4"] == "1>4")
         )
     ]
+    # now we can remove all attractive contacts with default (i.e., att) c12 becasue these
+    # are uninformative and predefined. This also allow to replace them with contact learned
+    # by either intra/inter training. We cannot remove 1-4 interactions.
 
     # transfer rule for inter/intra contacts:
     # 1) only attractive contacts can be transferd
@@ -1395,8 +1407,8 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
 
     # Now is time to add masked default interactions for pairs
     # that have not been learned in any other way
-    basic_LJ = basic_LJ[needed_fields]
-    meGO_LJ = pd.concat([meGO_LJ, basic_LJ])
+    # basic_LJ = basic_LJ[needed_fields]
+    # meGO_LJ = pd.concat([meGO_LJ, basic_LJ])
 
     # make meGO_LJ fully symmetric
     # Create inverse DataFrame

diff --git a/src/multiego/topology.py b/src/multiego/topology.py
@@ -350,13 +350,58 @@ def get_lj_pairs(topology):
     for i, (sbtype_i, sbtype_j) in enumerate(topology.parameterset.nbfix_types):
         key = (sbtype_i, sbtype_j)
         c12, c6 = topology.parameterset.nbfix_types[key][0] * 4.184, topology.parameterset.nbfix_types[key][1] * 4.184
-        epsilon = c6**2 / (4 * c12) if c6 > 0 else c12
-        sigma = (c12 / c6) ** (1 / 6) if c6 > 0 else 0
+        epsilon = c6**2 / (4 * c12) if c6 > 0 else 0 
+        sigma = (c12 / c6) ** (1 / 6) if c6 > 0 else c12 ** (1 / 12) / (2.0 ** (1.0 / 6.0))
         lj_pairs.loc[i] = [sbtype_i, sbtype_j, epsilon, sigma]
 
     return lj_pairs
 
 
+def get_lj14_pairs(topology):
+    """
+    Extracts Lennard-Jones pair information from a molecular topology.
+
+    Parameters
+    ----------
+    topology: parmed.topology object
+        Contains the molecular topology information
+
+    Returns
+    -------
+    pairs_dataframe: pd.DataFrame
+        DataFrame containing Lennard-Jones pair information
+    """
+    lj14_pairs = pd.DataFrame()
+    for mol, top in topology.molecules.items():
+        pair14 = [
+            {
+                "ai": pair.atom1.type,
+                "aj": pair.atom2.type,
+                "c6": pair.type.rmin * 4.184,
+                "c12": pair.type.epsilon * 4.184,
+            }
+            for pair in top[0].adjusts
+        ]
+        df = pd.DataFrame(pair14)
+        lj14_pairs = pd.concat([lj14_pairs, df])
+
+    lj14_pairs = lj14_pairs.reset_index()
+
+    # Calculate "epsilon" using a vectorized conditional expression
+    lj14_pairs["epsilon"] = np.where(lj14_pairs["c6"] > 0, lj14_pairs["c6"] ** 2 / (4 * lj14_pairs["c12"]), 0)
+
+    # Calculate "sigma" using a vectorized conditional expression
+    lj14_pairs["sigma"] = np.where(
+        lj14_pairs["c6"] > 0,
+        (lj14_pairs["c12"] / lj14_pairs["c6"]) ** (1 / 6),
+        lj14_pairs["c12"] ** (1 / 12) / (2.0 ** (1.0 / 6.0)),
+    )
+
+    lj14_pairs.drop(columns=["c6", "c12"], inplace=True)
+
+    return lj14_pairs
+
+
 def create_pairs_14_dataframe(atomtype1, atomtype2, c6=0.0, shift=0, prefactor=None, constant=None):
     """
     Used to create additional or modified, multi-eGO-specific 1-4 (like) interactions. Two sets of atomtypes with