-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
automatic official FASTA file fetching, several new utility functions…
… related to structure including flexible 3d alignment that supports different length chains to be aligned! (#101) Co-authored-by: YoelShoshan <[email protected]> Co-authored-by: [email protected] <[email protected]>
- Loading branch information
1 parent
c788d1a
commit 4a1208a
Showing
9 changed files
with
731 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from io import StringIO | ||
from Bio import SeqIO | ||
from urllib.request import urlopen | ||
from typing import Dict | ||
|
||
|
||
def get_fasta_from_rcsb(pdb_id: str) -> Dict:  # TODO: consider adding caching
    """
    Download the FASTA entry of a PDB id from the RCSB website and return it as a dict.

    Args:
        pdb_id: PDB entry id, for example "7vux". It is upper-cased before being placed in the URL.

    Returns:
        A dict {chain_id: sequence}. The key of each FASTA record is computed from the record
        description by _description_to_author_chain_id, and the value is the record sequence
        as a plain string.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        ValueError: raised by SeqIO.to_dict when two records are mapped to the same key.
    """
    url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}"

    # Use the response as a context manager so the connection is closed even if
    # reading or decoding fails (the response object was previously never closed).
    with urlopen(url) as response:
        fasta_data = response.read().decode("utf-8")

    # NOTE(review): a record that lists several chains is keyed by its first chain only,
    # because that is all the key function returns - confirm this is intended.
    records_by_chain = SeqIO.to_dict(
        SeqIO.parse(StringIO(fasta_data), "fasta"),
        key_function=lambda rec: _description_to_author_chain_id(rec.description),
    )
    return {chain_id: str(record.seq) for (chain_id, record) in records_by_chain.items()}
|
||
|
||
def _description_to_author_chain_id(description: str) -> str: | ||
loc = description.find(" ") | ||
assert loc >= 0 | ||
description = description[loc + 1 :] | ||
loc = description.find(",") | ||
if loc >= 0: | ||
description = description[:loc] | ||
|
||
token = "auth " | ||
loc = description.find(token) | ||
if loc >= 0: | ||
return description[loc + len(token)] | ||
|
||
return description[0] |
104 changes: 104 additions & 0 deletions
104
fusedrug/data/protein/structure/align_multiple_antibodies.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
from os.path import join, dirname | ||
from fusedrug.data.protein.structure.flexible_align_chains_structure import ( | ||
flexible_align_chains_structure, | ||
) | ||
from jsonargparse import CLI | ||
import pandas as pd | ||
from typing import Optional | ||
import numpy as np | ||
|
||
|
||
def main(
    input_excel_filename: str,
    unique_id_column: str,
    reference_heavy_chain_pdb_filename_column: str,
    reference_heavy_chain_id_column: str,
    heavy_chain_pdb_filename_column: str,
    heavy_chain_id_column: str,
    light_chain_pdb_filename_column: str,
    light_chain_id_column: str,
    aligned_using_only_heavy_chain: bool = True,
    output_structure_file_prefix: str = "aligned_antibody_",
    output_excel_filename: Optional[str] = None,
    output_excel_aligned_heavy_chain_pdb_filename_column: str = "aligned_heavy_chain_pdb_filename",
    output_excel_aligned_heavy_chain_id_column: Optional[str] = None,
    output_excel_aligned_light_chain_pdb_filename_column: str = "aligned_light_chain_pdb_filename",
    output_excel_aligned_light_chain_id_column: Optional[str] = None,
) -> pd.DataFrame:
    """
    Align every antibody listed in an excel file onto its reference heavy chain.

    For each row, the heavy chain is aligned to the reference heavy chain and the same rigid
    transformation is applied to both the heavy and the light chain. The locations and chain
    ids of the aligned structures are recorded in new columns of the table.

    Args:
        input_excel_filename: excel file describing the antibodies, one per row.
        unique_id_column: name of the column used as the table index.
        reference_heavy_chain_pdb_filename_column: column holding the reference structure file.
        reference_heavy_chain_id_column: column holding the chain id inside the reference file.
        heavy_chain_pdb_filename_column: column holding the heavy chain structure file.
        heavy_chain_id_column: column holding the heavy chain id.
        light_chain_pdb_filename_column: column holding the light chain structure file.
        light_chain_id_column: column holding the light chain id.
        aligned_using_only_heavy_chain: must be True, no other mode is supported for now.
        output_structure_file_prefix: file name prefix of the aligned structures, which are
            placed in the directory of the heavy chain structure file.
        output_excel_filename: if provided, the resulting table is saved to this excel file.
        output_excel_aligned_heavy_chain_pdb_filename_column: output column name.
        output_excel_aligned_heavy_chain_id_column: output column name,
            "aligned_heavy_chain_id" if not provided.
        output_excel_aligned_light_chain_pdb_filename_column: output column name.
        output_excel_aligned_light_chain_id_column: output column name,
            "aligned_light_chain_id" if not provided.

    Returns:
        The table with the four output columns added. Rows that were skipped because of an
        invalid reference filename keep NaN in these columns.
    """
    assert (
        aligned_using_only_heavy_chain
    ), "only supporting aligned_using_only_heavy_chain=True for now. Note that flexible_align_chains_structure is indeed flexible enough to support this, if needed."

    # Both id columns used to default to the same None column label, which made the light
    # chain id overwrite the heavy chain id. Give each one a distinct default name instead.
    if output_excel_aligned_heavy_chain_id_column is None:
        output_excel_aligned_heavy_chain_id_column = "aligned_heavy_chain_id"
    if output_excel_aligned_light_chain_id_column is None:
        output_excel_aligned_light_chain_id_column = "aligned_light_chain_id"

    df = pd.read_excel(input_excel_filename, index_col=unique_id_column)

    # The output columns hold strings, so make them object columns up front (still filled
    # with NaN) instead of float columns that later receive values of a different dtype.
    for output_column in (
        output_excel_aligned_heavy_chain_pdb_filename_column,
        output_excel_aligned_heavy_chain_id_column,
        output_excel_aligned_light_chain_pdb_filename_column,
        output_excel_aligned_light_chain_id_column,
    ):
        df[output_column] = np.nan
        df[output_column] = df[output_column].astype(object)

    for index, row in df.iterrows():
        reference_heavy_chain_pdb_filename = row[
            reference_heavy_chain_pdb_filename_column
        ]
        reference_heavy_chain_id = row[reference_heavy_chain_id_column]

        # heavy chain
        heavy_chain_pdb_filename = row[heavy_chain_pdb_filename_column]
        heavy_chain_id = row[heavy_chain_id_column]  # 'A'
        # light chain
        light_chain_pdb_filename = row[light_chain_pdb_filename_column]
        light_chain_id = row[light_chain_id_column]  # 'B'

        # Validate the row before doing any work on it; invalid rows are reported and skipped.
        if not isinstance(reference_heavy_chain_pdb_filename, str):
            print(
                f"ERROR: expected reference_heavy_chain_pdb_filename to be string, but got {reference_heavy_chain_pdb_filename} of type {type(reference_heavy_chain_pdb_filename)}"
            )
            continue

        if len(reference_heavy_chain_pdb_filename) < 2:
            print(
                f'ERROR: expected reference_heavy_chain_pdb_filename to be string, but got a suspicious empty or extremely short one: "{reference_heavy_chain_pdb_filename}"'
            )
            continue

        # NOTE(review): the output path depends only on the directory of the heavy chain file
        # and on the prefix, so rows sharing a directory and chain ids would write to the same
        # files - confirm that every antibody lives in its own directory.
        output_aligned_fn = join(
            dirname(heavy_chain_pdb_filename), output_structure_file_prefix
        )

        flexible_align_chains_structure(
            dynamic_ordered_chains=[(heavy_chain_pdb_filename, heavy_chain_id)],
            apply_rigid_transformation_to_dynamic_chain_ids=[
                (heavy_chain_pdb_filename, heavy_chain_id),
                (light_chain_pdb_filename, light_chain_id),
            ],
            static_ordered_chains=[
                (reference_heavy_chain_pdb_filename, reference_heavy_chain_id)
            ],
            output_pdb_filename_extentionless=output_aligned_fn,
        )

        # The recorded file names assume that one "<output>_chain_<id>.pdb" file is written
        # per transformed chain - TODO confirm against flexible_align_chains_structure.
        # heavy chain
        df.loc[index, output_excel_aligned_heavy_chain_pdb_filename_column] = (
            output_aligned_fn + f"_chain_{heavy_chain_id}.pdb"
        )
        df.loc[index, output_excel_aligned_heavy_chain_id_column] = heavy_chain_id
        # light chain
        df.loc[index, output_excel_aligned_light_chain_pdb_filename_column] = (
            output_aligned_fn + f"_chain_{light_chain_id}.pdb"
        )
        df.loc[index, output_excel_aligned_light_chain_id_column] = light_chain_id

    if output_excel_filename is not None:
        df.to_excel(output_excel_filename)
        print("saved ", output_excel_filename)

    return df


if __name__ == "__main__":
    CLI(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from jsonargparse import CLI | ||
from fusedrug.data.protein.structure.structure_io import ( | ||
load_pdb_chain_features, | ||
save_structure_file, | ||
) | ||
from typing import Optional | ||
|
||
|
||
def main(
    *,
    input_pdb_path: str,
    orig_name_chains_to_extract: str,
    output_pdb_path_extensionless: str,
    output_chain_ids_to_extract: Optional[str] = None,
) -> None:
    """
    Takes an input PDB file and splits it into separate files, one per described chain,
    allowing the chains to be renamed if desired.

    Args:
        input_pdb_path: path of the PDB file to read the chains from.
        orig_name_chains_to_extract: '_' separated chain ids, as named in the input file.
        output_pdb_path_extensionless: output path, without a file extension.
        output_chain_ids_to_extract: '_' separated chain ids to use in the output, in the same
            order as orig_name_chains_to_extract.
            if not provided, will keep original chain ids
    """
    orig_chain_ids = orig_name_chains_to_extract.split("_")
    if output_chain_ids_to_extract is None:
        # Keep the original names. The ids are already split at this point, so they are
        # copied; calling split() on the list again raised AttributeError.
        output_chain_ids = list(orig_chain_ids)
    else:
        output_chain_ids = output_chain_ids_to_extract.split("_")

    assert len(orig_chain_ids) > 0
    assert len(orig_chain_ids) == len(output_chain_ids)
    assert len(orig_chain_ids[0]) == 1

    loaded_chains = {
        orig_chain_id: load_pdb_chain_features(input_pdb_path, orig_chain_id)
        for orig_chain_id in orig_chain_ids
    }

    # original chain id -> chain id to use in the output
    mapping = dict(zip(orig_chain_ids, output_chain_ids))

    loaded_chains_mapped = {
        mapping[chain_id]: data for (chain_id, data) in loaded_chains.items()
    }

    save_structure_file(
        output_filename_extensionless=output_pdb_path_extensionless,
        pdb_id="unknown",
        chain_to_atom14={
            chain_id: data["atom14_gt_positions"]
            for (chain_id, data) in loaded_chains_mapped.items()
        },
        chain_to_aa_str_seq={
            chain_id: data["aasequence_str"]
            for (chain_id, data) in loaded_chains_mapped.items()
        },
        chain_to_aa_index_seq={
            chain_id: data["aatype"]
            for (chain_id, data) in loaded_chains_mapped.items()
        },
        save_cif=False,
        mask=None,  # TODO: check
    )


if __name__ == "__main__":
    CLI(main)
Oops, something went wrong.