Merge pull request multi-ego#486 from brunostega/mglob_prior

added stuff for regtest, fixed topology reading and c6 c12 mapping
carlocamilloni · Nov 17, 2024 · 5ada1f0 · 5ada1f0
2 parents ec45ba1 + 5f99522
commit 5ada1f0
Show file tree

Hide file tree

Showing 6 changed files with 67 additions and 643 deletions.
diff --git a/src/multiego/arguments.py b/src/multiego/arguments.py
@@ -120,6 +120,12 @@
         "type": str,
         "help": "Explicit name for the output directory stored in outputs/system",
     },
+    "--regtest": {
+        "type": bool,
+        "default": False,
+        "action": "store_true",
+        "help": "Use old rules for regtests check.",
+    },
     "--config": {
         "default": "",
         "type": str,

diff --git a/src/multiego/ensemble.py b/src/multiego/ensemble.py
@@ -111,7 +111,8 @@ def initialize_topology(topology, custom_dict, args):
         + "_"
         + ensemble_topology_dataframe["resnum"].astype(str)
     )
-    ensemble_topology_dataframe.rename(columns={"epsilon": "c12"}, inplace=True)
+    # TODO remove, not sure why it was here (it was overwritten immidiately after)
+    ensemble_topology_dataframe.rename(columns={"epsilon": "to_remove_c12"}, inplace=True)
 
     atp_c12_map = {k: v for k, v in zip(type_definitions.gromos_atp["name"], type_definitions.gromos_atp["rc_c12"])}
     atp_mg_c6_map = {k: v for k, v in zip(type_definitions.gromos_atp["name"], type_definitions.gromos_atp["mg_c6"])}
@@ -125,8 +126,8 @@ def initialize_topology(topology, custom_dict, args):
     ensemble_topology_dataframe["charge"] = 0.0
     ensemble_topology_dataframe["rc_c6"] = 0.0
     ensemble_topology_dataframe["rc_c12"] = ensemble_topology_dataframe["type"].map(atp_c12_map)
-    ensemble_topology_dataframe["c6"] = ensemble_topology_dataframe["type"].map(atp_mg_c6_map)
-    ensemble_topology_dataframe["c12"] = ensemble_topology_dataframe["type"].map(atp_mg_c12_map)
+    ensemble_topology_dataframe["mg_c6"] = ensemble_topology_dataframe["type"].map(atp_mg_c6_map)
+    ensemble_topology_dataframe["mg_c12"] = ensemble_topology_dataframe["type"].map(atp_mg_c12_map)
     ensemble_topology_dataframe["molecule_type"] = ensemble_topology_dataframe["molecule_name"].map(molecule_type_dict)
 
     for molecule in ensemble_molecules_idx_sbtype_dictionary.keys():
@@ -349,8 +350,12 @@ def init_meGO_matrices(ensemble, args, custom_dict):
     for reference in args.reference:  # reference_paths:
         print("\t-", f"Initializing {reference} ensemble data")
         reference_path = f"{args.root_dir}/inputs/{args.system}/{reference}"
+        topol_files = [f for f in os.listdir(reference_path) if ".top" in f]
         # path = f"{args.root_dir}/inputs/{args.system}/{reference_path}"
-        topology_path = f"{reference_path}/topol.top"
+        if len(topol_files) > 1:
+            raise RuntimeError(f"More than 1 topology file found in {reference_path}. Only one should be used")
+
+        topology_path = f"{reference_path}/{topol_files[0]}"
         if not os.path.isfile(topology_path):
             raise FileNotFoundError(f"{topology_path} not found.")
 
@@ -362,10 +367,12 @@ def init_meGO_matrices(ensemble, args, custom_dict):
 
         lj_data = topology.get_lj_params(topol)
         lj_pairs = topology.get_lj_pairs(topol)
-        lj_data_dict = {key: val for key, val in zip(lj_data["ai"], lj_data[["c6", "c12"]].values)}
+        lj_data_dict = {str(key): val for key, val in zip(lj_data["ai"], lj_data[["c6", "c12"]].values)}
         lj_pairs_dict = {
             (ai, aj): (epsilon, sigma) for ai, aj, epsilon, sigma in lj_pairs[["ai", "aj", "epsilon", "sigma"]].to_numpy()
         }
+        ensemble["topology_dataframe"]["c6"] = lj_data["c6"].to_numpy()
+        ensemble["topology_dataframe"]["c12"] = lj_data["c12"].to_numpy()
 
         matrix_paths = glob.glob(f"{reference_path}/int??mat_?_?.ndx")
         matrix_paths = matrix_paths + glob.glob(f"{reference_path}/int??mat_?_?.ndx.gz")
@@ -384,28 +391,21 @@ def init_meGO_matrices(ensemble, args, custom_dict):
                 path, ensemble["molecules_idx_sbtype_dictionary"], reference, path.endswith(".h5")
             )
             reference_contact_matrices[name] = reference_contact_matrices[name].add_prefix("rc_")
-            reference_contact_matrices[name]["c6_i"] = [
-                lj_data_dict[x][0] for x in reference_contact_matrices[name]["rc_ai"].str.split(r"_.+_").str[0]
-            ]
-            reference_contact_matrices[name]["c6_j"] = [
-                lj_data_dict[x][0] for x in reference_contact_matrices[name]["rc_aj"].str.split(r"_.+_").str[0]
-            ]
+            reference_contact_matrices[name]["c6_i"] = [lj_data_dict[x][0] for x in reference_contact_matrices[name]["rc_ai"]]
+            reference_contact_matrices[name]["c6_j"] = [lj_data_dict[x][0] for x in reference_contact_matrices[name]["rc_aj"]]
             reference_contact_matrices[name]["c6"] = np.sqrt(
                 reference_contact_matrices[name]["c6_i"] * reference_contact_matrices[name]["c6_j"]
             )
-            reference_contact_matrices[name]["c12_i"] = [
-                lj_data_dict[x][1] for x in reference_contact_matrices[name]["rc_ai"].str.split(r"_.+_").str[0]
-            ]
-            reference_contact_matrices[name]["c12_j"] = [
-                lj_data_dict[x][1] for x in reference_contact_matrices[name]["rc_aj"].str.split(r"_.+_").str[0]
-            ]
+            reference_contact_matrices[name]["c12_i"] = [lj_data_dict[x][1] for x in reference_contact_matrices[name]["rc_ai"]]
+            reference_contact_matrices[name]["c12_j"] = [lj_data_dict[x][1] for x in reference_contact_matrices[name]["rc_aj"]]
             reference_contact_matrices[name]["c12"] = np.sqrt(
                 reference_contact_matrices[name]["c12_i"] * reference_contact_matrices[name]["c12_j"]
             )
             reference_contact_matrices[name]["sigma_prior"] = np.where(
                 reference_contact_matrices[name]["c6"] > 0,
                 (reference_contact_matrices[name]["c12"] / reference_contact_matrices[name]["c6"]) ** (1 / 6),
-                reference_contact_matrices[name]["rc_cutoff"] / (2.0 ** (1.0 / 6.0)),
+                # reference_contact_matrices[name]["rc_cutoff"]/ (2.0 ** (1.0 / 6.0)),
+                reference_contact_matrices[name]["c12"] ** (1 / 12) / (2.0 ** (1.0 / 6.0)),
             )
             reference_contact_matrices[name]["epsilon_prior"] = np.where(
                 reference_contact_matrices[name]["c6"] > 0,
@@ -779,7 +779,7 @@ def init_LJ_datasets(meGO_ensemble, matrices, pairs14, exclusion_bonds14, args):
     )
     train_dataset["rep"] = train_dataset["rep"].fillna(pd.Series(pairwise_c12))
     train_dataset.loc[oxygen_mask.flatten(), "epsilon_prior"] = 0
-    train_dataset.loc[oxygen_mask.flatten(), "sigma_prior"] = train_dataset["cutoff"]
+    train_dataset.loc[oxygen_mask.flatten(), "sigma_prior"] = train_dataset["rep"] ** (1.0 / 12.0) / (2.0 ** (1.0 / 6.0))
 
     return train_dataset
 
@@ -807,7 +807,7 @@ def generate_rc_LJ(meGO_ensemble):
     rc_LJ["c12"] = 11.4 * np.sqrt(
         rc_LJ["ai"].map(meGO_ensemble["sbtype_c12_dict"]) * rc_LJ["aj"].map(meGO_ensemble["sbtype_c12_dict"])
     )
-
+    rc_LJ.sort_values(by=["ai", "aj"], ascending=[True, True], inplace=True)
     return rc_LJ
 
 
@@ -961,7 +961,7 @@ def generate_basic_LJ(meGO_ensemble, args, matrices=None):
     return basic_LJ
 
 
-def set_sig_epsilon(meGO_LJ, needed_fields):
+def set_sig_epsilon(meGO_LJ, needed_fields, parameters):
     """
     Set the epsilon parameter for LJ interactions based on probability and distance.
 
@@ -985,23 +985,37 @@ def set_sig_epsilon(meGO_LJ, needed_fields):
     consistent with the given probability and distance thresholds, maintaining the accuracy of simulations or calculations.
     """
     # when distance estimates are poor we use the cutoff value
-    meGO_LJ.loc[(meGO_LJ["probability"] <= meGO_LJ["md_threshold"]), "distance"] = meGO_LJ["sigma_prior"] * 2.0 ** (1.0 / 6.0)
-    meGO_LJ.loc[(meGO_LJ["rc_probability"] <= meGO_LJ["md_threshold"]), "rc_distance"] = meGO_LJ["sigma_prior"] * 2.0 ** (
-        1.0 / 6.0
-    )
+    if not parameters.regtest:
+        meGO_LJ.loc[(meGO_LJ["probability"] <= meGO_LJ["md_threshold"]), "distance"] = meGO_LJ["sigma_prior"] * 2.0 ** (
+            1.0 / 6.0
+        )
+        meGO_LJ.loc[(meGO_LJ["rc_probability"] <= meGO_LJ["md_threshold"]), "rc_distance"] = meGO_LJ["sigma_prior"] * 2.0 ** (
+            1.0 / 6.0
+        )
+    else:
+        meGO_LJ.loc[(meGO_LJ["probability"] <= meGO_LJ["md_threshold"]), "distance"] = (
+            meGO_LJ["sigma_prior"] * 2.0 ** (1.0 / 6.0) / meGO_LJ["epsilon_0"] ** (1.0 / 12.0)
+        )
+        meGO_LJ.loc[(meGO_LJ["rc_probability"] <= meGO_LJ["md_threshold"]), "rc_distance"] = (
+            meGO_LJ["sigma_prior"] * 2.0 ** (1.0 / 6.0) / meGO_LJ["epsilon_0"] ** (1.0 / 12.0)
+        )
 
     # by default epsilon is set to the prior epsilon
     meGO_LJ["epsilon"] = meGO_LJ["epsilon_prior"]
-    # meGO_LJ.loc[meGO_LJ["epsilon_prior"]>0, "epsilon"] = meGO_LJ["epsilon_prior"]
-    # #meGO_LJ.loc[meGO_LJ["epsilon_prior"]==0, "epsilon"] = meGO_LJ["rep"]
 
     # Epsilon is initialised to a rescaled C12
     # This is always correct becasue distance is always well defined by either training data
     # or using default C12 values
     # negative epsilon are used to identify non-attractive interactions
-    meGO_LJ.loc[meGO_LJ["probability"] > meGO_LJ["md_threshold"], "epsilon"] = (
-        -meGO_LJ["rep"] * (meGO_LJ["distance"] / meGO_LJ["rc_distance"]) ** 12
-    )
+
+    # # Correct version
+    if not parameters.regtest:
+        meGO_LJ.loc[meGO_LJ["probability"] > meGO_LJ["md_threshold"], "epsilon"] = (
+            -meGO_LJ["rep"] * (meGO_LJ["distance"] / meGO_LJ["rc_distance"]) ** 12
+        )
+    # # Needed for regtests
+    else:
+        meGO_LJ["epsilon"] = -meGO_LJ["rep"] * (meGO_LJ["distance"] / meGO_LJ["rc_distance"]) ** 12
 
     # Attractive interactions
     # These are defined only if the training probability is greater than MD_threshold and
@@ -1073,7 +1087,12 @@ def set_sig_epsilon(meGO_LJ, needed_fields):
     ] = -meGO_LJ["rep"]
 
     # Sigma is set from the estimated interaction length
-    meGO_LJ = meGO_LJ.assign(sigma=meGO_LJ["sigma_prior"] * meGO_LJ["distance"] / meGO_LJ["rc_distance"])
+    # # Correct version
+    if not parameters.regtest:
+        meGO_LJ = meGO_LJ.assign(sigma=meGO_LJ["sigma_prior"] * meGO_LJ["distance"] / meGO_LJ["rc_distance"])
+    # # Needed for regtests
+    else:
+        meGO_LJ = meGO_LJ.assign(sigma=meGO_LJ["distance"] / 2 ** (1.0 / 6.0))
 
     # for repulsive interaction we reset sigma to its effective value
     # this because when merging repulsive contacts from different sources what will matters
@@ -1239,7 +1258,7 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
     ]
 
     # generate attractive and repulsive interactions
-    meGO_LJ = set_sig_epsilon(meGO_LJ, needed_fields)
+    meGO_LJ = set_sig_epsilon(meGO_LJ, needed_fields, parameters)
 
     et = time.time()
     elapsed_time = et - st

diff --git a/src/multiego/io.py b/src/multiego/io.py
@@ -363,6 +363,9 @@ def write_nonbonded(topology_dataframe, meGO_LJ, parameters, output_folder):
         if parameters.egos == "rc":
             atomtypes = topology_dataframe[["sb_type", "atomic_number", "mass", "charge", "ptype", "rc_c6", "rc_c12"]].copy()
             atomtypes.rename(columns={"rc_c6": "c6", "rc_c12": "c12"}, inplace=True)
+        elif parameters.egos == "mg":
+            atomtypes = topology_dataframe[["sb_type", "atomic_number", "mass", "charge", "ptype", "rc_c6", "rc_c12"]].copy()
+            atomtypes.rename(columns={"mg_c6": "c6", "mg_c12": "c12"}, inplace=True)
         else:
             atomtypes = topology_dataframe[["sb_type", "atomic_number", "mass", "charge", "ptype", "c6", "c12"]].copy()
 

diff --git a/src/multiego/topology.py b/src/multiego/topology.py
@@ -334,15 +334,14 @@ def get_lj_params(topology):
                 lj_params,
                 pd.DataFrame(
                     {
-                        "ai": atom.name,
+                        "ai": atom.atom_type,
                         "c6": c6,
                         "c12": c12,
                     },
                     index=[0],
                 ),
             ]
         )
-    lj_params.drop_duplicates(inplace=True)
     lj_params = lj_params.reset_index()
     return lj_params
 

diff --git a/test/test_cases.txt b/test/test_cases.txt
@@ -1,6 +1,6 @@
---system gpref --egos rc --no_header --explicit_name case
---system gpref --egos production --epsilon 0.235 --train md_ensemble --single_molecule --no_header --explicit_name case
---config TEST_ROOT/test_inputs/abetaref/config.yml --explicit_name case # --system abetaref --egos production
---config TEST_ROOT/test_inputs/ttrref/config.yml --explicit_name case # --system ttrref --egos production
---config TEST_ROOT/test_inputs/lyso-bnz_ref/config_1.yml --explicit_name case # --system lyso-bnz_ref --egos production
---config TEST_ROOT/test_inputs/lyso-bnz_ref/config_2.yml --explicit_name case # --system lyso-bnz_ref --egos production
+--system gpref --egos rc --no_header --regtest --explicit_name case
+--system gpref --egos production --epsilon 0.235 --train md_ensemble --single_molecule --no_header --regtest --explicit_name case
+--config TEST_ROOT/test_inputs/abetaref/config.yml --regtest --explicit_name case # --system abetaref --egos production
+--config TEST_ROOT/test_inputs/ttrref/config.yml --regtest --explicit_name case # --system ttrref --egos production
+--config TEST_ROOT/test_inputs/lyso-bnz_ref/config_1.yml --regtest --explicit_name case # --system lyso-bnz_ref --egos production
+--config TEST_ROOT/test_inputs/lyso-bnz_ref/config_2.yml --regtest --explicit_name case # --system lyso-bnz_ref --egos production