multi-ego · carlocamilloni · Nov 2, 2024 · Nov 1, 2024 · Nov 1, 2024 · Nov 1, 2024
diff --git a/CITATION.cff b/CITATION.cff
@@ -14,7 +14,7 @@ authors:
 - family-names: "Camilloni"
   given-names: "Carlo"
 title: "Multi-eGO"
-version: beta.1
+version: beta.4
 date-released: 2023-10-24
 url: "https://github.com/multi-ego/multi-ego"
 preferred-citation:

diff --git a/README.md b/README.md
@@ -99,7 +99,7 @@ Create a folder in which you want to run the random coil simulation. Copy the ``
 python multiego.py --system $SYSTEM_NAME --egos rc
 ```
 ```multiego.py``` will then create an output directory in ```multi-eGO/outputs/${SYSTEM_NAME}_rc``` which provides the inputs for the random coil simulation.
-The contents of the output folder are ```ffnonbonded.itp``` and ```topol_GRETA.top```. The former is the non-bonded interaction file and needs to be copied into the ```multi-ego-basic.ff/``` folder. The latter needs to be placed in the simulation root directory. We provide ```mdps``` simulation setup files tested with various multi-*e*GO setups in the ```multi-eGO/mdps``` folder. The order in which the simulations are run is as follows:
+The contents of the output folder are ```ffnonbonded.itp``` and ```topol_mego.top```. The former is the non-bonded interaction file and needs to be copied into the ```multi-ego-basic.ff/``` folder. The latter needs to be placed in the simulation root directory. We provide ```mdps``` simulation setup files tested with various multi-*e*GO setups in the ```multi-eGO/mdps``` folder. The order in which the simulations are run is as follows:
 
 ```
     1. ff_em.mdp
@@ -138,7 +138,7 @@ To setup a multi-*e*GO production simulation, you need to run ```multiego.py```
 ```
 python multiego.py --system $SYSTEM_NAME --egos production --epsilon 0.3 --train md_ensemble
 ```
-Here one sets the energy scale &#949; to 0.3 kJ/mol and trains the model from the ```md_ensemble``` data. The output directory will be ```multi-eGO/outputs/${SYSTEM_NAME}_production_e0.3_0.3``` and will contain the inputs for the production simulation. Again, the contents of the output directory are ```ffnonbonded.itp``` and ```topol_GRETA.top``` and need to be copied to the ```multi-ego-basic.ff/``` folder and the simulation root directory. The ```mdps``` files are the same except for the last step which is now ```ff_aa.mdp```.
+Here one sets the energy scale &#949; to 0.3 kJ/mol and trains the model from the ```md_ensemble``` data. The output directory will be ```multi-eGO/outputs/${SYSTEM_NAME}_production_e0.3_0.3``` and will contain the inputs for the production simulation. Again, the contents of the output directory are ```ffnonbonded.itp``` and ```topol_mego.top``` and need to be copied to the ```multi-ego-basic.ff/``` folder and the simulation root directory. The ```mdps``` files are the same except for the last step which is now ```ff_aa.mdp```.
 
 Happy simulating :)
 

diff --git a/multiego.py b/multiego.py
@@ -54,6 +54,11 @@ def meGO_parsing():
     args.root_dir = os.path.dirname(os.path.abspath(__file__))
     multi_flag = False
 
+    # Check if no arguments are provided
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
     if args.config:
         config_yaml = io.read_config(args.config, args_dict)
         # check if yaml file is empty
@@ -196,15 +201,14 @@ def main():
     st = et
     print("- Done in:", elapsed_time, "seconds")
 
-    print("- Generating Multi-eGO model")
     if args.egos != "rc":
-        print("\t- Processing Multi-eGO contact matrices")
+        print("- Processing Multi-eGO contact matrices")
         meGO_ensembles, matrices = ensemble.init_meGO_matrices(meGO_ensembles, args, custom_dict)
         et = time.time()
         elapsed_time = et - st
         st = et
-        print("\t- Done in:", elapsed_time, "seconds")
-        print("\t- Initializing LJ dataset")
+        print("- Done in:", elapsed_time, "seconds")
+        print("- Initializing LJ dataset")
         train_dataset = ensemble.init_LJ_datasets(meGO_ensembles, matrices, pairs14, exclusion_bonds14, args)
         basic_LJ = ensemble.generate_basic_LJ(meGO_ensembles, args, matrices)
         # force memory cleaning to decrease footprint in case of large dataset
@@ -214,8 +218,8 @@ def main():
         et = time.time()
         elapsed_time = et - st
         st = et
-        print("\t- Done in:", elapsed_time, "seconds")
-        print("\t- Generate LJ dataset")
+        print("- Done in:", elapsed_time, "seconds")
+        print("- Generate LJ dataset")
         meGO_LJ, meGO_LJ_14 = ensemble.generate_LJ(meGO_ensembles, train_dataset, basic_LJ, args)
         del train_dataset
         del basic_LJ
@@ -224,24 +228,24 @@ def main():
         et = time.time()
         elapsed_time = et - st
         st = et
-        print("\t- Done in:", elapsed_time, "seconds")
+        print("- Done in:", elapsed_time, "seconds")
     else:
-        print("\t- Generate LJ dataset")
+        print("- Generate LJ dataset")
         meGO_LJ = ensemble.generate_basic_LJ(meGO_ensembles, args)
         meGO_LJ_14 = pairs14
         meGO_LJ_14["epsilon"] = -meGO_LJ_14["c12"]
         # get the end time
         et = time.time()
         elapsed_time = et - st
         st = et
-        print("\t- Done in:", elapsed_time, "seconds")
+        print("- Done in:", elapsed_time, "seconds")
 
-    print("\t- Finalize pairs and exclusions")
+    print("- Finalize pairs and exclusions")
     meGO_LJ_14 = ensemble.make_pairs_exclusion_topology(meGO_ensembles, meGO_LJ_14)
     et = time.time()
     elapsed_time = et - st
     st = et
-    print("\t- Done in:", elapsed_time, "seconds")
+    print("- Done in:", elapsed_time, "seconds")
 
     print("- Writing Multi-eGO model")
     io.write_model(meGO_ensembles, meGO_LJ, meGO_LJ_14, args)

diff --git a/src/multiego/ensemble.py b/src/multiego/ensemble.py
@@ -344,7 +344,7 @@ def init_meGO_matrices(ensemble, args, custom_dict):
     reference_contact_matrices = {}
     matrices = {}
     for reference in args.reference:  # reference_paths:
-        print("\t\t-", f"Initializing {reference} ensemble data")
+        print("\t-", f"Initializing {reference} ensemble data")
         reference_path = f"{args.root_dir}/inputs/{args.system}/{reference}"
         # path = f"{args.root_dir}/inputs/{args.system}/{reference_path}"
         topology_path = f"{reference_path}/topol.top"
@@ -369,7 +369,7 @@ def init_meGO_matrices(ensemble, args, custom_dict):
         et = time.time()
         elapsed_time = et - st
         st = et
-        print("\t\t- Done in:", elapsed_time, "seconds")
+        print("\t- Done in:", elapsed_time, "seconds")
 
     matrices["reference_matrices"] = reference_contact_matrices
     reference_set = set(ensemble["topology_dataframe"]["name"].to_list())
@@ -378,13 +378,13 @@ def init_meGO_matrices(ensemble, args, custom_dict):
     train_contact_matrices = {}
     train_topology_dataframe = pd.DataFrame()
     for simulation in args.train:
-        print("\t\t-", f"Initializing {simulation} ensemble data")
+        print("\t-", f"Initializing {simulation} ensemble data")
         simulation_path = f"{args.root_dir}/inputs/{args.system}/{simulation}"
         topology_path = f"{simulation_path}/topol.top"
         if not os.path.isfile(topology_path):
             raise FileNotFoundError(f"{topology_path} not found.")
 
-        print("\t\t\t-", f"Reading {topology_path}")
+        print("\t\t-", f"Reading {topology_path}")
         # ignore the dihedral type overriding in parmed
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
@@ -439,7 +439,7 @@ def init_meGO_matrices(ensemble, args, custom_dict):
         et = time.time()
         elapsed_time = et - st
         st = et
-        print("\t\t- Done in:", elapsed_time, "seconds")
+        print("\t- Done in:", elapsed_time, "seconds")
 
     matrices["train_matrices"] = train_contact_matrices
 
@@ -590,20 +590,21 @@ def generate_14_data(meGO_ensemble):
                     print("       user provided 1-4 pairs need to define also the C6/C12\n")
                     exit()
                 nonprotein_c12.append(float(test.epsilon) * 4.184)
-            pairs["c12"] = nonprotein_c12
-            pairs["c6"] = 0.0
             pairs["func"] = 1
-            pairs["rep"] = pairs["c12"]
-            pairs["same_chain"] = True
-            pairs["source"] = pd.Series(["1-4"] * len(pairs), dtype="category")
+            pairs["c6"] = 0.0
+            pairs["c12"] = nonprotein_c12
             pairs["probability"] = 1.0
             pairs["rc_probability"] = 1.0
+            pairs["source"] = pd.Series(["1-4"] * len(pairs), dtype="category")
+            pairs["rep"] = pairs["c12"]
+            pairs["same_chain"] = True
             # copy and symmetrize
             tmp = pairs.copy()
             tmp["ai"], tmp["aj"] = tmp["aj"], tmp["ai"]
             pairs = pd.concat([pairs, tmp], axis=0, sort=False, ignore_index=True)
 
-        pairs14 = pd.concat([pairs14, pairs], axis=0, sort=False, ignore_index=True)
+        if not pairs.empty:
+            pairs14 = pd.concat([pairs14, pairs], axis=0, sort=False, ignore_index=True)
 
     return pairs14, exclusion_bonds14
 
@@ -627,6 +628,38 @@ def init_LJ_datasets(meGO_ensemble, matrices, pairs14, exclusion_bonds14, args):
             how="outer",
         )
         train_dataset = pd.concat([train_dataset, temp_merged], axis=0, sort=False, ignore_index=True)
+
+    # This is a debug check to avoid data inconsistencies
+    if (np.abs(train_dataset["rc_cutoff"] - train_dataset["cutoff"])).max() > 0:
+        print(
+            train_dataset[["source", "file", "rc_source", "rc_file", "cutoff", "rc_cutoff"]]
+            .loc[(np.abs(train_dataset["rc_cutoff"] - train_dataset["cutoff"]) > 0)]
+            .to_string()
+        )
+        exit("HERE SOMETHING BAD HAPPEND: There are inconsistent cutoff values between the MD and corresponding RC input data")
+
+    needed_fields = [
+        "molecule_name_ai",
+        "ai",
+        "molecule_name_aj",
+        "aj",
+        "distance",
+        "probability",
+        "cutoff",
+        "intra_domain",
+        "same_chain",
+        "source",
+        "file",
+        "zf",
+        "epsilon_0",
+        "md_threshold",
+        "rc_threshold",
+        "limit_rc",
+        "rc_distance",
+        "rc_probability",
+    ]
+    train_dataset = train_dataset[needed_fields]
+
     # This is to FLAG 1-1, 1-2, 1-3, 1-4 cases:
     train_dataset = pd.merge(
         train_dataset,
@@ -684,14 +717,6 @@ def init_LJ_datasets(meGO_ensemble, matrices, pairs14, exclusion_bonds14, args):
         ),
     )
     train_dataset["rep"] = train_dataset["rep"].fillna(pd.Series(pairwise_c12))
-    # This is a debug check to avoid data inconsistencies
-    if (np.abs(train_dataset["rc_cutoff"] - train_dataset["cutoff"])).max() > 0:
-        print(
-            train_dataset[["source", "file", "rc_source", "rc_file", "cutoff", "rc_cutoff"]]
-            .loc[(np.abs(train_dataset["rc_cutoff"] - train_dataset["cutoff"]) > 0)]
-            .to_string()
-        )
-        exit("HERE SOMETHING BAD HAPPEND: There are inconsistent cutoff values between the MD and corresponding RC input data")
 
     return train_dataset
 
@@ -811,9 +836,6 @@ def generate_basic_LJ(meGO_ensemble, args, matrices=None):
             temp_basic_LJ["c12"] = 11.4 * np.sqrt(c12_list_i * c12_list_j[:, np.newaxis]).flatten()
             temp_basic_LJ["rep"] = temp_basic_LJ["c12"]
             temp_basic_LJ = temp_basic_LJ[oxygen_mask]
-            # temp_basic_LJ["ai"], temp_basic_LJ["aj"] = temp_basic_LJ[["ai", "aj"]].min(axis=1), temp_basic_LJ[["ai", "aj"]].max(
-            #    axis=1
-            # )
             temp_basic_LJ = temp_basic_LJ.dropna(axis=1, how="all")
             temp_basic_LJ = temp_basic_LJ.drop_duplicates(subset=["ai", "aj", "same_chain"], keep="first")
 
@@ -1082,6 +1104,9 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
     meGO_LJ_14 : pd.DataFrame
         Contains 1-4 atomic contacts associated with LJ parameters and statistics.
     """
+
+    st = time.time()
+    print("\t- Set sigma and epsilon")
     # copy but remove intramolecular excluded interactions
     meGO_LJ = train_dataset.loc[(train_dataset["1-4"] != "1_2_3") & (train_dataset["1-4"] != "0")].copy()
     meGO_LJ.reset_index(inplace=True)
@@ -1134,23 +1159,26 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
     ]
     # keep only needed fields
     meGO_LJ = meGO_LJ[needed_fields]
+    et = time.time()
+    elapsed_time = et - st
+    print("\t- Done in:", elapsed_time, "seconds")
 
     # apply symmetries for equivalent atoms
     if parameters.symmetry:
         st = time.time()
-        print("\t\t- Apply the defined atomic symmetries")
+        print("\t- Apply the defined atomic symmetries")
         meGO_LJ_sym = apply_symmetries(meGO_ensemble, meGO_LJ, parameters.symmetry)
         meGO_LJ = pd.concat([meGO_LJ, meGO_LJ_sym])
         meGO_LJ.reset_index(inplace=True)
         et = time.time()
         elapsed_time = et - st
         st = et
-        print("\t\t- Done in:", elapsed_time, "seconds")
+        print("\t- Done in:", elapsed_time, "seconds")
 
     # meGO consistency checks
     consistency_checks(meGO_LJ)
 
-    print("\t\t- Merging multiple states (training, symmetries, inter/intra)")
+    print("\t- Merging multiple states (training, symmetries, inter/intra)")
 
     # Merging of multiple simulations:
     # Here we sort all the atom pairs based on the distance and the probability.

diff --git a/src/multiego/io.py b/src/multiego/io.py
@@ -265,7 +265,7 @@ def read_molecular_contacts(path, ensemble_molecules_idx_sbtype_dictionary, simu
     """
     Reads intra-/intermat files to determine molecular contact statistics.
     """
-    print("\t\t\t-", f"Reading {path}")
+    print("\t\t-", f"Reading {path}")
     st = time.time()
     # Define column names and data types directly during read
     col_names = ["molecule_name_ai", "ai", "molecule_name_aj", "aj", "distance", "probability", "cutoff", "intra_domain"]
@@ -288,7 +288,7 @@ def read_molecular_contacts(path, ensemble_molecules_idx_sbtype_dictionary, simu
         contact_matrix = pd.read_hdf(path, key="data", dtype=col_types)
 
     t1 = time.time()
-    print("\t\t\t- Read in:", t1 - st)
+    print("\t\t- Read in:", t1 - st)
 
     # Validation checks using `query` for more efficient conditional filtering
     if contact_matrix.query("probability < 0 or probability > 1").shape[0] > 0:
@@ -337,9 +337,10 @@ def read_molecular_contacts(path, ensemble_molecules_idx_sbtype_dictionary, simu
     if len_ai * len_aj != len(contact_matrix):
         raise Exception("The " + simulation + " topology and " + name[0] + " files are inconsistent")
 
-    contact_matrix = contact_matrix.drop(
-        contact_matrix[(contact_matrix["ai"].str.startswith("H")) | (contact_matrix["aj"].str.startswith("H"))].index
-    )
+    mask = np.logical_or(contact_matrix["ai"].str.startswith("H"), contact_matrix["aj"].str.startswith("H"))
+    if mask.any():
+        # Drop rows based on the mask
+        contact_matrix = contact_matrix[~mask]
 
     contact_matrix = contact_matrix.assign(
         same_chain=name[0] == "intramat",
@@ -351,7 +352,7 @@ def read_molecular_contacts(path, ensemble_molecules_idx_sbtype_dictionary, simu
     contact_matrix.set_index(["idx_ai", "idx_aj"], inplace=True)
 
     t2 = time.time()
-    print("\t\t\t- Processesed in:", t2 - t1)
+    print("\t\t- Processesed in:", t2 - t1)
 
     return contact_matrix
 
@@ -380,7 +381,6 @@ def write_nonbonded(topology_dataframe, meGO_LJ, parameters, output_folder):
         # write the defaults section
         file.write("\n[ defaults ]\n")
         file.write("; Include forcefield parameters\n")
-        file.write("#define _FF_MAGROS\n\n")
         file.write("; nbfunc        comb-rule       gen-pairs       fudgeLJ fudgeQQ\n")
         file.write("  1             1               no              1.0     1.0\n\n")
 
@@ -429,7 +429,7 @@ def write_model(meGO_ensemble, meGO_LJ, meGO_LJ_14, parameters):
     )
     write_nonbonded(meGO_ensemble["topology_dataframe"], meGO_LJ_out, parameters, output_dir)
     write_output_readme(meGO_LJ, parameters, output_dir)
-    print(f"Output files written to {output_dir}")
+    print("\t- " f"Output files written to {output_dir}")
 
 
 def write_output_readme(meGO_LJ, parameters, output_dir):
@@ -578,16 +578,16 @@ def print_stats(meGO_LJ):
 
     print(
         f"""
-    - LJ parameterization completed for a total of {len(meGO_LJ)} contacts.
-    - Attractive: intra-domain: {intrad_a_contacts}, inter-domain: {interd_a_contacts}, inter-molecular: {interm_a_contacts}
-    - Repulsive: intra-domain: {intrad_r_contacts}, inter-domain: {interd_r_contacts}, inter-molecular: {interm_r_contacts}
-    - The average epsilon is: {intrad_a_ave_contacts:5.3f} {interd_a_ave_contacts:5.3f} {interm_a_ave_contacts:5.3f} kJ/mol
-    - Epsilon range is: [{intrad_a_min_contacts:5.3f}:{intrad_a_max_contacts:5.3f}] [{interd_a_min_contacts:5.3f}:{interd_a_max_contacts:5.3f}] [{interm_a_min_contacts:5.3f}:{interm_a_max_contacts:5.3f}] kJ/mol
-    - Sigma range is: [{intrad_a_s_min_contacts:5.3f}:{intrad_a_s_max_contacts:5.3f}] [{interd_a_s_min_contacts:5.3f}:{interd_a_s_max_contacts:5.3f}] [{interm_a_s_min_contacts:5.3f}:{interm_a_s_max_contacts:5.3f}] nm
+\t- LJ parameterization completed for a total of {len(meGO_LJ)} contacts.
+\t- Attractive: intra-domain: {intrad_a_contacts}, inter-domain: {interd_a_contacts}, inter-molecular: {interm_a_contacts}
+\t- Repulsive: intra-domain: {intrad_r_contacts}, inter-domain: {interd_r_contacts}, inter-molecular: {interm_r_contacts}
+\t- The average epsilon is: {intrad_a_ave_contacts:5.3f} {interd_a_ave_contacts:5.3f} {interm_a_ave_contacts:5.3f} kJ/mol
+\t- Epsilon range is: [{intrad_a_min_contacts:5.3f}:{intrad_a_max_contacts:5.3f}] [{interd_a_min_contacts:5.3f}:{interd_a_max_contacts:5.3f}] [{interm_a_min_contacts:5.3f}:{interm_a_max_contacts:5.3f}] kJ/mol
+\t- Sigma range is: [{intrad_a_s_min_contacts:5.3f}:{intrad_a_s_max_contacts:5.3f}] [{interd_a_s_min_contacts:5.3f}:{interd_a_s_max_contacts:5.3f}] [{interm_a_s_min_contacts:5.3f}:{interm_a_s_max_contacts:5.3f}] nm
 
-    RELEVANT MDP PARAMETERS:
-    - Suggested rlist value: {1.1*2.5*meGO_LJ['sigma'].max():4.2f} nm
-    - Suggested cut-off value: {2.5*meGO_LJ['sigma'].max():4.2f} nm
+\t- RELEVANT MDP PARAMETERS:
+\t- Suggested rlist value: {1.1*2.5*meGO_LJ['sigma'].max():4.2f} nm
+\t- Suggested cut-off value: {2.5*meGO_LJ['sigma'].max():4.2f} nm
     """
     )
 
@@ -638,7 +638,7 @@ def dataframe_to_write(df):
     """
     if df.empty:
         # TODO insert and improve the following warning
-        print("A topology parameter is empty. Check the reference topology.")
+        print("\t- WARNING: A topology parameter is empty. Check the reference topology.")
         return "; The following parameters where not parametrized on multi-eGO.\n; If this is not expected, check the reference topology."
     else:
         df.rename(columns={df.columns[0]: f"; {df.columns[0]}"}, inplace=True)
@@ -649,7 +649,7 @@ def make_header(parameters):
     now = time.strftime("%d-%m-%Y %H:%M", time.localtime())
 
     header = f"""
-; Multi-eGO force field version beta.1
+; Multi-eGO force field version beta.4
 ; https://github.com/multi-ego/multi-eGO
 ; Please read and cite:
 ; Scalone, E. et al. PNAS 119, e2203181119 (2022) 10.1073/pnas.2203181119
@@ -698,7 +698,7 @@ def write_topology(
     output_folder,
 ):
     """
-    Writes the topology output content into GRETA_topol.top
+    Writes the topology output content into topol_mego.top
 
     Parameters
     ----------
@@ -721,7 +721,7 @@ def write_topology(
     if write_header:
         header = make_header(vars(parameters))
 
-    with open(f"{output_folder}/topol_GRETA.top", "w") as file:
+    with open(f"{output_folder}/topol_mego.top", "w") as file:
         header += """
 ; Include forcefield parameters
 #include "ffnonbonded.itp"