Skip to content

Commit

Permalink
further optimisations
Browse files Browse the repository at this point in the history
  • Loading branch information
carlocamilloni committed Nov 3, 2024
1 parent 8debd8b commit 59f2356
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 146 deletions.
122 changes: 60 additions & 62 deletions src/multiego/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,90 +614,88 @@ def init_LJ_datasets(meGO_ensemble, matrices, pairs14, exclusion_bonds14, args):
# we cycle over train matrices to pair them with reference matrices and
# then we add 1-4 assignments and defaults c12s and concatenate everything
train_dataset = pd.DataFrame()
for name, ref_name in meGO_ensemble["train_matrix_tuples"]:
# sysname_train_intramat_1_1 <-> sysname_reference_intramat_1_1
if ref_name not in matrices["reference_matrices"].keys():
raise RuntimeError(
f'Encountered error while trying to find {ref_name} in reference matrices {matrices["reference_matrices"].keys()}'
)

temp_merged = pd.merge(
matrices["train_matrices"][name],
matrices["reference_matrices"][ref_name],
left_index=True,
right_index=True,
how="outer",
)
train_dataset = pd.concat([train_dataset, temp_merged], axis=0, sort=False, ignore_index=True)

# This is a debug check to avoid data inconsistencies
if (np.abs(train_dataset["rc_cutoff"] - train_dataset["cutoff"])).max() > 0:
print(
train_dataset[["source", "file", "rc_source", "rc_file", "cutoff", "rc_cutoff"]]
.loc[(np.abs(train_dataset["rc_cutoff"] - train_dataset["cutoff"]) > 0)]
.to_string()
)
exit("HERE SOMETHING BAD HAPPEND: There are inconsistent cutoff values between the MD and corresponding RC input data")

# This is a debug check to avoid data inconsistencies
if not train_dataset["rc_same_chain"].equals(train_dataset["rc_same_chain"]):
diff_indices = train_dataset.index[train_dataset["same_chain"] != train_dataset["rc_same_chain"]].tolist()
print(f"Difference found at indices: {diff_indices}")
exit("HERE SOMETHING BAD HAPPEND: You are pairing intra and inter molecular training and reference data")

needed_fields = [
td_fields = [
"molecule_name_ai",
"ai",
"molecule_name_aj",
"aj",
"distance",
"probability",
"cutoff",
"intra_domain",
"same_chain",
"source",
"file",
"zf",
"epsilon_0",
"md_threshold",
"rc_threshold",
"limit_rc",
"rc_distance",
"rc_probability",
"rc_source",
]
train_dataset = train_dataset[needed_fields]

# This is to FLAG 1-1, 1-2, 1-3, 1-4 cases:
train_dataset = pd.merge(
train_dataset,
exclusion_bonds14[["ai", "aj", "same_chain", "1-4"]],
how="left",
on=["ai", "aj", "same_chain"],
)
# Add the new category "0" to the column's categories
train_dataset["1-4"] = train_dataset["1-4"].cat.add_categories(["0"])
train_dataset.loc[
(train_dataset["ai"] == train_dataset["aj"]) & (train_dataset["same_chain"]),
"1-4",
] = "0"
for name, ref_name in meGO_ensemble["train_matrix_tuples"]:
# sysname_train_intramat_1_1 <-> sysname_reference_intramat_1_1
if ref_name not in matrices["reference_matrices"].keys():
raise RuntimeError(
f'Encountered error while trying to find {ref_name} in reference matrices {matrices["reference_matrices"].keys()}'
)

train_dataset["1-4"] = train_dataset["1-4"].cat.add_categories(["1>4"])
train_dataset["1-4"] = train_dataset["1-4"].fillna("1>4").astype("category")
temp_merged = pd.merge(
matrices["train_matrices"][name],
matrices["reference_matrices"][ref_name],
left_index=True,
right_index=True,
how="outer",
)

# This is a debug check to avoid data inconsistencies
if (np.abs(temp_merged["rc_cutoff"] - temp_merged["cutoff"])).max() > 0:
print(
temp_merged[["ai", "aj", "rc_ai", "rc_aj", "source", "rc_source", "cutoff", "rc_cutoff"]]
.loc[(np.abs(temp_merged["rc_cutoff"] - temp_merged["cutoff"]) > 0)]
.to_string()
)
exit(
"HERE SOMETHING BAD HAPPEND: There are inconsistent cutoff values between the MD and corresponding RC input data"
)

# This is a debug check to avoid data inconsistencies
if not temp_merged["rc_same_chain"].equals(temp_merged["rc_same_chain"]):
diff_indices = temp_merged.index[temp_merged["same_chain"] != temp_merged["rc_same_chain"]].tolist()
print(f"Difference found at indices: {diff_indices}")
exit("HERE SOMETHING BAD HAPPEND: You are pairing intra and inter molecular training and reference data")

temp_merged = temp_merged[td_fields]
train_dataset = pd.concat([train_dataset, temp_merged], axis=0, sort=False, ignore_index=True)

train_dataset["molecule_name_ai"] = train_dataset["molecule_name_ai"].astype("category")
train_dataset["molecule_name_aj"] = train_dataset["molecule_name_aj"].astype("category")
train_dataset["source"] = train_dataset["source"].astype("category")

# This is to set the correct default C12 values taking into account specialised 1-4 values (including the special 1-5 O-O)
train_dataset = pd.merge(
train_dataset,
pairs14[["ai", "aj", "same_chain", "rep"]],
pd.merge(
train_dataset,
pairs14[["ai", "aj", "same_chain", "rep"]],
how="left",
on=["ai", "aj", "same_chain"],
),
exclusion_bonds14[["ai", "aj", "same_chain", "1-4"]],
how="left",
on=["ai", "aj", "same_chain"],
)

train_dataset["ai"] = train_dataset["ai"].astype("category")
train_dataset["aj"] = train_dataset["aj"].astype("category")

train_dataset.loc[(train_dataset["1-4"] == "0"), "rep"] = 0.0
train_dataset.loc[(train_dataset["1-4"] == "1_2_3"), "rep"] = 0.0
# We remove from train the 0_1_2_3 intramolecolar interactions
train_dataset = train_dataset[
~(((train_dataset["ai"] == train_dataset["aj"]) & train_dataset["same_chain"]) | (train_dataset["1-4"] == "1_2_3"))
]
train_dataset.reset_index(inplace=True)

train_dataset["1-4"] = train_dataset["1-4"].cat.add_categories(["1>4"])
train_dataset["1-4"] = train_dataset["1-4"].fillna("1>4").astype("category")
train_dataset.loc[(train_dataset["1-4"] == "1_4") & (train_dataset["rep"].isnull()), "rep"] = 0.0

type_to_c12 = {key: val for key, val in zip(type_definitions.gromos_atp.name, type_definitions.gromos_atp.c12)}
Expand Down Expand Up @@ -762,7 +760,6 @@ def generate_basic_LJ(meGO_ensemble, args, matrices=None):
"molecule_name_ai",
"molecule_name_aj",
"same_chain",
"intra_domain",
"source",
"md_threshold",
"rc_threshold",
Expand Down Expand Up @@ -806,7 +803,6 @@ def generate_basic_LJ(meGO_ensemble, args, matrices=None):
basic_LJ["type"] = 1
basic_LJ["source"] = "basic"
basic_LJ["same_chain"] = True
basic_LJ["intra_domain"] = True
basic_LJ["c6"] = 0.0
basic_LJ["c12"] = 11.4 * np.sqrt(c12_list * c12_list[:, np.newaxis]).flatten()
basic_LJ["rep"] = basic_LJ["c12"]
Expand All @@ -829,7 +825,6 @@ def generate_basic_LJ(meGO_ensemble, args, matrices=None):
temp_basic_LJ["c6"] = 0.0
temp_basic_LJ["c12"] = 0.0
temp_basic_LJ["same_chain"] = ensemble["rc_same_chain"]
temp_basic_LJ["intra_domain"] = ensemble["rc_intra_domain"]
temp_basic_LJ["molecule_name_ai"] = ensemble["rc_molecule_name_ai"]
temp_basic_LJ["molecule_name_aj"] = ensemble["rc_molecule_name_aj"]
temp_basic_LJ["source"] = "basic"
Expand Down Expand Up @@ -1013,7 +1008,7 @@ def consistency_checks(meGO_LJ):
# This is a debug check to avoid data inconsistencies
if (np.abs(1.45 * meGO_LJ["rep"] ** (1 / 12) - meGO_LJ["cutoff"])).max() > 10e-6:
print(
meGO_LJ[["source", "file", "rc_source", "rep", "cutoff"]]
meGO_LJ[["ai", "aj", "same_chain", "source", "rep", "cutoff"]]
.loc[(np.abs(1.45 * meGO_LJ["rep"] ** (1 / 12) - meGO_LJ["cutoff"]) > 10e-6)]
.to_string()
)
Expand Down Expand Up @@ -1116,8 +1111,7 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
st = time.time()
print("\t- Set sigma and epsilon")
# copy but remove intramolecular excluded interactions
meGO_LJ = train_dataset.loc[(train_dataset["1-4"] != "1_2_3") & (train_dataset["1-4"] != "0")].copy()
meGO_LJ.reset_index(inplace=True)
meGO_LJ = train_dataset.copy()

# when distance estimates are poor we use the cutoff value
meGO_LJ.loc[(meGO_LJ["probability"] <= meGO_LJ["md_threshold"]), "distance"] = (meGO_LJ["rep"] / meGO_LJ["epsilon_0"]) ** (
Expand Down Expand Up @@ -1152,7 +1146,6 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
"aj",
"probability",
"same_chain",
"intra_domain",
"source",
"rc_probability",
"sigma",
Expand Down Expand Up @@ -1378,6 +1371,11 @@ def generate_LJ(meGO_ensemble, train_dataset, basic_LJ, parameters):
)

meGO_LJ.sort_values(by=["number_ai", "number_aj"], inplace=True)

# meGO consistency checks
consistency_checks(meGO_LJ)
consistency_checks(meGO_LJ_14)

et = time.time()
elapsed_time = et - st
print("\t- Done in:", elapsed_time, "seconds")
Expand Down
109 changes: 25 additions & 84 deletions src/multiego/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,6 @@ def read_molecular_contacts(path, ensemble_molecules_idx_sbtype_dictionary, simu
contact_matrix = contact_matrix.assign(
same_chain=name[0] == "intramat",
source=pd.Categorical([simulation] * len(contact_matrix)), # Convert to category
file=pd.Categorical(["_".join(name)] * len(contact_matrix)),
)

contact_matrix[["idx_ai", "idx_aj"]] = contact_matrix[["ai", "aj"]]
Expand Down Expand Up @@ -458,114 +457,56 @@ def write_output_readme(meGO_LJ, parameters, output_dir):

f.write("\nContact parameters:\n")
# write average data intra
f.write("\n - Intermolecular contacts:\n")
f.write("- Intramolecular contacts:\n")
f.write(
f" - epsilon: {meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] > 0.0)]['epsilon'].mean():.3f} kJ/mol\n"
)
f.write(
f" - sigma: {meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] > 0.0)]['sigma'].mean():.3f} nm\n"
)
f.write(
f" - number of attractive contacts: {len(meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] > 0.0)])}\n"
)
f.write(
f" - number of repulsive contacts: {len(meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] < 0.0)])}\n"
)

# write average data interdomain
f.write("\n - Interdomain contacts:\n")
f.write(
f" - epsilon: {meGO_LJ.loc[(meGO_LJ['same_chain']) & (~meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] > 0.0)]['epsilon'].mean():.3f} kJ/mol\n"
)
f.write(
f" - sigma: {meGO_LJ.loc[(meGO_LJ['same_chain']) & (~meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] > 0.0)]['sigma'].mean():.3f} nm\n"
)
f.write(
f" - number of attractive contacts: {len(meGO_LJ.loc[(meGO_LJ['same_chain']) & (~meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] > 0.0)])}\n"
)
f.write(
f" - number of repulsive contacts: {len(meGO_LJ.loc[(meGO_LJ['same_chain']) & (~meGO_LJ['intra_domain']) & (meGO_LJ['epsilon'] < 0.0)])}\n"
f"- epsilon: {meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)]['epsilon'].mean():.3f} kJ/mol\n"
)
f.write(f"- sigma: {meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)]['sigma'].mean():.3f} nm\n")
f.write(f"- number of attractive contacts: {len(meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)])}\n")
f.write(f"- number of repulsive contacts: {len(meGO_LJ.loc[(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] < 0.0)])}\n")

# write average data inter
f.write("\n - Intermolecular contacts:\n")
f.write(
f" - epsilon: {meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)]['epsilon'].mean():.3f} kJ/mol\n"
)
f.write(f" - sigma: {meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)]['sigma'].mean():.3f} nm\n")
f.write("- Intermolecular contacts:\n")
f.write(
f" - number of attractive contacts: {len(meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)])}\n"
f"- epsilon: {meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)]['epsilon'].mean():.3f} kJ/mol\n"
)
f.write(f"- sigma: {meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)]['sigma'].mean():.3f} nm\n")
f.write(
f" - number of repulsive contacts: {len(meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] < 0.0)])}\n"
f"- number of attractive contacts: {len(meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] > 0.0)])}\n"
)
f.write(f"- number of repulsive contacts: {len(meGO_LJ.loc[~(meGO_LJ['same_chain']) & (meGO_LJ['epsilon'] < 0.0)])}\n")

# mdp parameters
f.write("\nCutoff MDP parameters:\n")
f.write(f" - Suggested rlist value: {1.1*2.5*meGO_LJ['sigma'].max():4.2f} nm\n")
f.write(f" - Suggested cut-off value: {2.5*meGO_LJ['sigma'].max():4.2f} nm\n")
f.write("Cutoff MDP parameters:\n")
f.write(f"- Suggested rlist value: {1.1*2.5*meGO_LJ['sigma'].max():4.2f} nm\n")
f.write(f"- Suggested cut-off value: {2.5*meGO_LJ['sigma'].max():4.2f} nm\n")


def print_stats(meGO_LJ):
# it would be nice to cycle over molecule types and print an half matrix with all the relevant information
intrad_contacts = len(meGO_LJ.loc[(meGO_LJ["same_chain"]) & (meGO_LJ["intra_domain"])])
interd_contacts = len(meGO_LJ.loc[(meGO_LJ["same_chain"]) & (~meGO_LJ["intra_domain"])])
intrad_contacts = len(meGO_LJ.loc[(meGO_LJ["same_chain"])])
interm_contacts = len(meGO_LJ.loc[~(meGO_LJ["same_chain"])])
intrad_a_contacts = len(meGO_LJ.loc[(meGO_LJ["same_chain"]) & (meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)])
interd_a_contacts = len(meGO_LJ.loc[(meGO_LJ["same_chain"]) & (~meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)])
intrad_a_contacts = len(meGO_LJ.loc[(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)])
interm_a_contacts = len(meGO_LJ.loc[~(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)])
intrad_r_contacts = intrad_contacts - intrad_a_contacts
interd_r_contacts = interd_contacts - interd_a_contacts
interm_r_contacts = interm_contacts - interm_a_contacts
intrad_a_ave_contacts = 0.000
intrad_a_min_contacts = 0.000
intrad_a_max_contacts = 0.000
intrad_a_s_min_contacts = 0.000
intrad_a_s_max_contacts = 0.000
interd_a_ave_contacts = 0.000
interd_a_min_contacts = 0.000
interd_a_max_contacts = 0.000
interd_a_s_min_contacts = 0.000
interd_a_s_max_contacts = 0.000
interm_a_ave_contacts = 0.000
interm_a_min_contacts = 0.000
interm_a_max_contacts = 0.000
interm_a_s_min_contacts = 0.000
interm_a_s_max_contacts = 0.000

if intrad_a_contacts > 0:
intrad_a_ave_contacts = (
meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].mean()
)
intrad_a_min_contacts = (
meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].min()
)
intrad_a_max_contacts = (
meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].max()
)
intrad_a_s_min_contacts = (
meGO_LJ["sigma"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].min()
)
intrad_a_s_max_contacts = (
meGO_LJ["sigma"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].max()
)

if interd_a_contacts > 0:
interd_a_ave_contacts = (
meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (~meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].mean()
)
interd_a_min_contacts = (
meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (~meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].min()
)
interd_a_max_contacts = (
meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (~meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].max()
)
interd_a_s_min_contacts = (
meGO_LJ["sigma"].loc[(meGO_LJ["same_chain"]) & (~meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].min()
)
interd_a_s_max_contacts = (
meGO_LJ["sigma"].loc[(meGO_LJ["same_chain"]) & (~meGO_LJ["intra_domain"]) & (meGO_LJ["epsilon"] > 0.0)].max()
)
intrad_a_ave_contacts = meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)].mean()
intrad_a_min_contacts = meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)].min()
intrad_a_max_contacts = meGO_LJ["epsilon"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)].max()
intrad_a_s_min_contacts = meGO_LJ["sigma"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)].min()
intrad_a_s_max_contacts = meGO_LJ["sigma"].loc[(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)].max()

if interm_a_contacts > 0:
interm_a_ave_contacts = meGO_LJ["epsilon"].loc[~(meGO_LJ["same_chain"]) & (meGO_LJ["epsilon"] > 0.0)].mean()
Expand All @@ -577,11 +518,11 @@ def print_stats(meGO_LJ):
print(
f"""
\t- LJ parameterization completed for a total of {len(meGO_LJ)} contacts.
\t- Attractive: intra-domain: {intrad_a_contacts}, inter-domain: {interd_a_contacts}, inter-molecular: {interm_a_contacts}
\t- Repulsive: intra-domain: {intrad_r_contacts}, inter-domain: {interd_r_contacts}, inter-molecular: {interm_r_contacts}
\t- The average epsilon is: {intrad_a_ave_contacts:5.3f} {interd_a_ave_contacts:5.3f} {interm_a_ave_contacts:5.3f} kJ/mol
\t- Epsilon range is: [{intrad_a_min_contacts:5.3f}:{intrad_a_max_contacts:5.3f}] [{interd_a_min_contacts:5.3f}:{interd_a_max_contacts:5.3f}] [{interm_a_min_contacts:5.3f}:{interm_a_max_contacts:5.3f}] kJ/mol
\t- Sigma range is: [{intrad_a_s_min_contacts:5.3f}:{intrad_a_s_max_contacts:5.3f}] [{interd_a_s_min_contacts:5.3f}:{interd_a_s_max_contacts:5.3f}] [{interm_a_s_min_contacts:5.3f}:{interm_a_s_max_contacts:5.3f}] nm
\t- Attractive: intra-domain: {intrad_a_contacts}, inter-molecular: {interm_a_contacts}
\t- Repulsive: intra-domain: {intrad_r_contacts}, inter-molecular: {interm_r_contacts}
\t- The average epsilon is: {intrad_a_ave_contacts:5.3f} {interm_a_ave_contacts:5.3f} kJ/mol
\t- Epsilon range is: [{intrad_a_min_contacts:5.3f}:{intrad_a_max_contacts:5.3f}] [{interm_a_min_contacts:5.3f}:{interm_a_max_contacts:5.3f}] kJ/mol
\t- Sigma range is: [{intrad_a_s_min_contacts:5.3f}:{intrad_a_s_max_contacts:5.3f}] [{interm_a_s_min_contacts:5.3f}:{interm_a_s_max_contacts:5.3f}] nm
\t- RELEVANT MDP PARAMETERS:
\t- Suggested rlist value: {1.1*2.5*meGO_LJ['sigma'].max():4.2f} nm
Expand Down

0 comments on commit 59f2356

Please sign in to comment.