From c71a93d1277d394e181e19ae3491a348a73be366 Mon Sep 17 00:00:00 2001 From: Carlo Camilloni Date: Thu, 5 Sep 2024 15:37:50 +0200 Subject: [PATCH] updated molten globule module --- multiego.py | 34 +++++++++++- src/multiego/ensemble.py | 113 ++++++++++++++++++++++++++++++--------- 2 files changed, 120 insertions(+), 27 deletions(-) diff --git a/multiego.py b/multiego.py index b1614c72..2209feec 100644 --- a/multiego.py +++ b/multiego.py @@ -2,6 +2,7 @@ import sys import os import numpy as np +import pandas as pd from src.multiego import ensemble from src.multiego import io @@ -406,9 +407,38 @@ def get_meGO_LJ(meGO_ensemble, args): """ pairs14, exclusion_bonds14 = ensemble.generate_14_data(meGO_ensemble) if args.egos == "rc": - meGO_LJ = ensemble.generate_basic_LJ(meGO_ensemble, args) - meGO_LJ_14 = pairs14 + meGO_LJ, meGO_LJ_14 = ensemble.generate_basic_LJ(meGO_ensemble, args) + # meGO_LJ_14 = pairs14 + meGO_LJ_14 = pd.concat([meGO_LJ_14, pairs14]) + needed_fields = [ + "ai", + "aj", + "type", + "c6", + "c12", + "sigma", + "epsilon", + "probability", + "rc_probability", + "md_threshold", + "rc_threshold", + "rep", + "cutoff", + "molecule_name_ai", + "molecule_name_aj", + "same_chain", + "source", + "number_ai", + "number_aj", + ] + meGO_LJ_14 = meGO_LJ_14[needed_fields] meGO_LJ_14["epsilon"] = -meGO_LJ_14["c12"] + meGO_LJ_14.reset_index(inplace=True) + # Sorting the pairs prioritising intermolecular interactions + meGO_LJ_14.sort_values(by=["ai", "aj", "c12"], ascending=[True, True, True], inplace=True) + # Cleaning the duplicates + meGO_LJ_14 = meGO_LJ_14.drop_duplicates(subset=["ai", "aj"], keep="first") + print(meGO_LJ_14) else: train_dataset, check_dataset = ensemble.init_LJ_datasets(meGO_ensemble, pairs14, exclusion_bonds14, args) meGO_LJ, meGO_LJ_14 = ensemble.generate_LJ(meGO_ensemble, train_dataset, check_dataset, args) diff --git a/src/multiego/ensemble.py b/src/multiego/ensemble.py index 0e409695..9bd8e357 100644 --- a/src/multiego/ensemble.py +++ b/src/multiego/ensemble.py @@ -762,6 +762,7 @@ def generate_14_data(meGO_ensemble): pairs["aj"] = pairs["aj"].map(type_atnum_dict) pairs["rep"] = pairs["c12"] pairs["same_chain"] = True + pairs["1-4"] = "1_4" else: pairs["ai"] = meGO_ensemble["user_pairs"][molecule].ai.astype(str) pairs["aj"] = meGO_ensemble["user_pairs"][molecule].aj.astype(str) @@ -980,7 +981,7 @@ def init_LJ_datasets(meGO_ensemble, pairs14, exclusion_bonds14, args): def get_residue_number(s): - return int(s.split('_')[-1]) + return int(s.split("_")[-1]) def generate_basic_LJ(meGO_ensemble, args): @@ -1025,6 +1026,7 @@ def generate_basic_LJ(meGO_ensemble, args): "cutoff", "rep", "att", + "bare", ] basic_LJ = pd.DataFrame() @@ -1063,6 +1065,32 @@ def generate_basic_LJ(meGO_ensemble, args): c6_list = ai_name.map(name_to_c6).to_numpy() ai_name = ai_name.to_numpy(dtype=str) oxygen_mask = masking.create_array_mask(ai_name, ai_name, [("O", "OM"), ("O", "O"), ("OM", "OM")], symmetrize=True) + ca_mask = masking.create_array_mask( + ai_name, + ai_name, + [ + ("CH1", "CH1"), + ("CH1", "C"), + ("CH1", "N"), + ("CH1", "S"), + ("CH1", "CH"), + ("CH1", "CH2"), + ("CH1", "CH2r"), + ("CH1", "CH3"), + ("CH1", "OA"), + ("NL", "CH1"), + ("NZ", "CH1"), + ("NE", "CH1"), + ("NT", "CH1"), + ("NR", "CH1"), + ("N", "N"), + ("N", "NT"), + ("NT", "NT"), + ("C", "C"), + ("S", "S"), + ], + symmetrize=True, + ) bb_mask = masking.create_array_mask( ai_name, ai_name, @@ -1152,40 +1180,60 @@ def generate_basic_LJ(meGO_ensemble, args): basic_LJ["type"] = 1 basic_LJ["source"] = "basic" basic_LJ["same_chain"] = True + basic_LJ.c6 = 0.0 basic_LJ.c12 = np.sqrt(bare_c12_list * bare_c12_list[:, np.newaxis]).flatten() + basic_LJ.bare = np.sqrt(bare_c12_list * bare_c12_list[:, np.newaxis]).flatten() basic_LJ.rep = np.sqrt(c12_list * c12_list[:, np.newaxis]).flatten() basic_LJ.att = np.sqrt(c6_list * c6_list[:, np.newaxis]).flatten() - oxygen_LJ = basic_LJ[oxygen_mask].copy() - oxygen_LJ["c12"] *= 11.4 - oxygen_LJ["c6"] = 0.0 - hydrophobic_LJ = basic_LJ[hydrophobic_mask].copy() - hydrophobic_LJ["c12"] = 0.25 * hydrophobic_LJ["rep"] - hydrophobic_LJ["c6"] = 0.25 * hydrophobic_LJ["att"] - hydrophobic_w_LJ = basic_LJ[hydrophobic_w_mask].copy() - hydrophobic_w_LJ["c12"] = 0.12 * hydrophobic_w_LJ["rep"] - hydrophobic_w_LJ["c6"] = 0.12 * hydrophobic_w_LJ["att"] - bb_LJ = basic_LJ[bb_mask].copy() - bb_LJ["c12"] = 0.275 * bb_LJ["rep"] - bb_LJ["c6"] = 0.275 * bb_LJ["att"] - hbond_LJ = basic_LJ[hbond_mask].copy() - hbond_LJ["c12"] = 0.275 * hbond_LJ["rep"] - hbond_LJ["c6"] = 0.275 * hbond_LJ["att"] - catpi_LJ = basic_LJ[catpi_mask].copy() - catpi_LJ["c12"] = 0.20 * catpi_LJ["rep"] - catpi_LJ["c6"] = 0.20 * catpi_LJ["att"] - basic_LJ = pd.concat([oxygen_LJ, hbond_LJ, hydrophobic_LJ, hydrophobic_w_LJ, catpi_LJ, bb_LJ]) + # oxygen_LJ = basic_LJ[oxygen_mask].copy() + # oxygen_LJ["c12"] *= 11.4 + # oxygen_LJ["c6"] = 0.0 + # hydrophobic_LJ = basic_LJ[hydrophobic_mask].copy() + # hydrophobic_LJ["c12"] = 0.25 * hydrophobic_LJ["rep"] + # hydrophobic_LJ["c6"] = 0.25 * hydrophobic_LJ["att"] + # hydrophobic_w_LJ = basic_LJ[hydrophobic_w_mask].copy() + # hydrophobic_w_LJ["c12"] = 0.12 * hydrophobic_w_LJ["rep"] + # hydrophobic_w_LJ["c6"] = 0.12 * hydrophobic_w_LJ["att"] + # bb_LJ = basic_LJ[bb_mask].copy() + # bb_LJ["c12"] = 0.275 * bb_LJ["rep"] + # bb_LJ["c6"] = 0.275 * bb_LJ["att"] + # hbond_LJ = basic_LJ[hbond_mask].copy() + # hbond_LJ["c12"] = 0.275 * hbond_LJ["rep"] + # hbond_LJ["c6"] = 0.275 * hbond_LJ["att"] + # catpi_LJ = basic_LJ[catpi_mask].copy() + # catpi_LJ["c12"] = 0.20 * catpi_LJ["rep"] + # catpi_LJ["c6"] = 0.20 * catpi_LJ["att"] + # basic_LJ = pd.concat([oxygen_LJ, hbond_LJ, hydrophobic_LJ, hydrophobic_w_LJ, catpi_LJ, bb_LJ]) basic_LJ["intra_domain"] = True - basic_LJ["rep"] = basic_LJ["c12"] basic_LJ["residue_ai"] = basic_LJ["ai"].apply(get_residue_number) basic_LJ["residue_aj"] = basic_LJ["aj"].apply(get_residue_number) - basic_LJ = basic_LJ.loc[(basic_LJ["same_chain"]) & ((basic_LJ["residue_ai"] - basic_LJ["residue_aj"]).abs()>2)] - basic_LJ = basic_LJ.loc[(basic_LJ["c6"] == 0.0) | ((basic_LJ["c6"] ** 2 / (4.0 * basic_LJ["c12"])) > args.epsilon_min)] + basic_LJ.loc[oxygen_mask, "c12"] *= 11.4 + basic_LJ.loc[oxygen_mask, "c6"] = 0.0 + basic_LJ.loc[~oxygen_mask, "c12"] = 0.195 * basic_LJ["rep"] + basic_LJ.loc[~oxygen_mask, "c6"] = 0.195 * basic_LJ["att"] + basic_LJ_14 = basic_LJ.copy() + basic_LJ_14 = basic_LJ_14[(~oxygen_mask & ~ca_mask)] + basic_LJ_14 = basic_LJ_14.loc[ + (basic_LJ["same_chain"]) & ((basic_LJ["residue_ai"] - basic_LJ["residue_aj"]).abs() <= 2) + ] + basic_LJ = basic_LJ[~ca_mask] + basic_LJ["rep"] = basic_LJ["bare"] + basic_LJ_14["c12"] = basic_LJ_14["bare"] + basic_LJ_14["c6"] = 0.0 + basic_LJ_14["rep"] = basic_LJ_14["c12"] + # basic_LJ = basic_LJ.loc[(basic_LJ["c6"] == 0.0) | ((basic_LJ["c6"] ** 2 / (4.0 * basic_LJ["c12"])) > args.epsilon_min)] basic_LJ["index_ai"], basic_LJ["index_aj"] = basic_LJ[["index_ai", "index_aj"]].min(axis=1), basic_LJ[ ["index_ai", "index_aj"] ].max(axis=1) basic_LJ = basic_LJ.drop_duplicates(subset=["index_ai", "index_aj", "same_chain"], keep="first") basic_LJ.sort_values(by=["index_ai", "index_aj"], inplace=True) basic_LJ = basic_LJ.drop(["index_ai", "index_aj"], axis=1) + basic_LJ_14["index_ai"], basic_LJ_14["index_aj"] = basic_LJ_14[["index_ai", "index_aj"]].min(axis=1), basic_LJ_14[ + ["index_ai", "index_aj"] + ].max(axis=1) + basic_LJ_14 = basic_LJ_14.drop_duplicates(subset=["index_ai", "index_aj", "same_chain"], keep="first") + basic_LJ_14.sort_values(by=["index_ai", "index_aj"], inplace=True) + basic_LJ_14 = basic_LJ_14.drop(["index_ai", "index_aj"], axis=1) for name in meGO_ensemble["reference_matrices"].keys(): temp_basic_LJ = pd.DataFrame(columns=columns) @@ -1236,9 +1284,24 @@ def generate_basic_LJ(meGO_ensemble, args): basic_LJ.sort_values(by=["ai", "aj", "same_chain"], ascending=[True, True, True], inplace=True) # Cleaning the duplicates basic_LJ = basic_LJ.drop_duplicates(subset=["ai", "aj"], keep="first") + basic_LJ_14["probability"] = 1.0 + basic_LJ_14["rc_probability"] = 1.0 + basic_LJ_14["rc_threshold"] = 1.0 + basic_LJ_14["md_threshold"] = 1.0 + basic_LJ_14["epsilon"] = -basic_LJ_14["c12"] + basic_LJ_14.loc[basic_LJ_14["c6"] > 0, "epsilon"] = basic_LJ_14["c6"] ** 2 / (4.0 * basic_LJ_14["c12"]) + basic_LJ_14["cutoff"] = 1.45 * basic_LJ_14["c12"] ** (1.0 / 12.0) + basic_LJ_14["sigma"] = basic_LJ_14["cutoff"] / (2.0 ** (1.0 / 6.0)) + basic_LJ_14.loc[basic_LJ_14["c6"] > 0, "sigma"] = (basic_LJ_14["c12"] / basic_LJ_14["c6"]) ** (1 / 6) + basic_LJ_14["distance"] = basic_LJ_14["cutoff"] + basic_LJ_14["learned"] = 0 + basic_LJ_14["1-4"] = "1>4" + # Sorting the pairs prioritising intermolecular interactions + basic_LJ_14.sort_values(by=["ai", "aj", "same_chain"], ascending=[True, True, True], inplace=True) + # Cleaning the duplicates + basic_LJ_14 = basic_LJ_14.drop_duplicates(subset=["ai", "aj"], keep="first") - - return basic_LJ + return basic_LJ, basic_LJ_14 def set_epsilon(meGO_LJ):