Skip to content

Commit

Permalink
automatic official FASTA file fetching, several new utility functions…
Browse files Browse the repository at this point in the history
… related to structure including flexible 3d alignment that supports different length chains to be aligned! (#101)

Co-authored-by: YoelShoshan <[email protected]>
Co-authored-by: [email protected] <[email protected]>
  • Loading branch information
3 people authored May 26, 2024
1 parent c788d1a commit 4a1208a
Show file tree
Hide file tree
Showing 9 changed files with 731 additions and 48 deletions.
8 changes: 6 additions & 2 deletions fusedrug/data/protein/antibody/antibody.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Dict
from typing import List, Dict, Optional
from fusedrug.data.protein.structure.sabdab import load_sabdab_dataframe
import pandas as pd
from collections import namedtuple
Expand Down Expand Up @@ -33,12 +33,16 @@ def get_antibody_regions(sequence: str, scheme: str = "chothia") -> Dict[str, st
return ans


def get_antibodies_info_from_sabdab(antibodies_pdb_ids: List[str]) -> List[Antibody]:
def get_antibodies_info_from_sabdab(
antibodies_pdb_ids: Optional[List[str]] = None,
) -> List[Antibody]:
"""
Collects information on all provided antibodies_pdb_ids based on SabDab DB.
"""
sabdab_df = load_sabdab_dataframe()
if antibodies_pdb_ids is None:
antibodies_pdb_ids = sabdab_df.pdb.unique().tolist()
antibodies = []
for pdb_id in antibodies_pdb_ids:
found = sabdab_df[sabdab_df.pdb == pdb_id]
Expand Down
38 changes: 38 additions & 0 deletions fusedrug/data/protein/sequence/official_pdb_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from io import StringIO
from Bio import SeqIO
from urllib.request import urlopen
from typing import Dict


def get_fasta_from_rcsb(pdb_id: str) -> Dict: # TODO: consider adding caching
"""
Given some pdb_id, (like "7vux"), we will retrieve its fasta file from rcsb database and return it as a dict {chain: sequence}.
"""
fasta_data = (
urlopen(f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}")
.read()
.decode("utf-8")
)
fasta_file_handle = StringIO(fasta_data)
chains_full_seq = SeqIO.to_dict(
SeqIO.parse(fasta_file_handle, "fasta"),
key_function=lambda rec: _description_to_author_chain_id(rec.description),
)
chains_full_seq = {k: str(d.seq) for (k, d) in chains_full_seq.items()}
return chains_full_seq


def _description_to_author_chain_id(description: str) -> str:
loc = description.find(" ")
assert loc >= 0
description = description[loc + 1 :]
loc = description.find(",")
if loc >= 0:
description = description[:loc]

token = "auth "
loc = description.find(token)
if loc >= 0:
return description[loc + len(token)]

return description[0]
104 changes: 104 additions & 0 deletions fusedrug/data/protein/structure/align_multiple_antibodies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from os.path import join, dirname
from fusedrug.data.protein.structure.flexible_align_chains_structure import (
flexible_align_chains_structure,
)
from jsonargparse import CLI
import pandas as pd
from typing import Optional
import numpy as np


def main(
input_excel_filename: str,
unique_id_column: str,
reference_heavy_chain_pdb_filename_column: str,
reference_heavy_chain_id_column: str,
heavy_chain_pdb_filename_column: str,
heavy_chain_id_column: str,
light_chain_pdb_filename_column: str,
light_chain_id_column: str,
aligned_using_only_heavy_chain: bool = True,
output_structure_file_prefix: str = "aligned_antibody_",
output_excel_filename: Optional[str] = None,
output_excel_aligned_heavy_chain_pdb_filename_column: str = "aligned_heavy_chain_pdb_filename",
output_excel_aligned_heavy_chain_id_column: str = None,
output_excel_aligned_light_chain_pdb_filename_column: str = "aligned_light_chain_pdb_filename",
output_excel_aligned_light_chain_id_column: str = None,
) -> pd.DataFrame:

assert (
aligned_using_only_heavy_chain
), "only supporting aligned_using_only_heavy_chain=True for now. Note that flexible_align_chains_structure is indeed flexible enough to support this, if needed."

df = pd.read_excel(input_excel_filename, index_col=unique_id_column)

# base = '/dccstor/dsa-ab-cli-val-0/2024_feb_delivery/top_100_with_indels/antibody_dimers_af2_predicted_structure'
# reference_heavy_chain = '/dccstor/dsa-ab-cli-val-0/targets/PD-1/7VUX/relaxed_complex/PD1_7VUX_H_eq.pdb'

df[output_excel_aligned_heavy_chain_pdb_filename_column] = np.nan
df[output_excel_aligned_heavy_chain_id_column] = np.nan
df[output_excel_aligned_light_chain_pdb_filename_column] = np.nan
df[output_excel_aligned_light_chain_id_column] = np.nan

for index, row in df.iterrows():
reference_heavy_chain_pdb_filename = row[
reference_heavy_chain_pdb_filename_column
]
reference_heavy_chain_id = row[reference_heavy_chain_id_column]
# reference_light_chain_id = row[reference_light_chain_id_column]

# heavy chain
heavy_chain_pdb_filename = row[heavy_chain_pdb_filename_column]
heavy_chain_id = row[heavy_chain_id_column] # 'A'
# light chain
light_chain_pdb_filename = row[light_chain_pdb_filename_column]
light_chain_id = row[light_chain_id_column] # 'B'

output_aligned_fn = join(
dirname(heavy_chain_pdb_filename), output_structure_file_prefix
)

if not isinstance(reference_heavy_chain_pdb_filename, str):
print(
f"ERROR: expected reference_heavy_chain_pdb_filename to be string, but got {reference_heavy_chain_pdb_filename} of type {type(reference_heavy_chain_pdb_filename)}"
)
continue

if len(reference_heavy_chain_pdb_filename) < 2:
print(
f'ERROR: expected reference_heavy_chain_pdb_filename to be string, but got a suspicious empty or extremely short one: "{reference_heavy_chain_pdb_filename}"'
)
continue

flexible_align_chains_structure(
dynamic_ordered_chains=[(heavy_chain_pdb_filename, heavy_chain_id)],
apply_rigid_transformation_to_dynamic_chain_ids=[
(heavy_chain_pdb_filename, heavy_chain_id),
(light_chain_pdb_filename, light_chain_id),
],
static_ordered_chains=[
(reference_heavy_chain_pdb_filename, reference_heavy_chain_id)
],
output_pdb_filename_extentionless=output_aligned_fn,
)

# heavy chain
df.loc[index, output_excel_aligned_heavy_chain_pdb_filename_column] = (
output_aligned_fn + f"_chain_{heavy_chain_id}.pdb"
)
df.loc[index, output_excel_aligned_heavy_chain_id_column] = heavy_chain_id
# light chain
df.loc[index, output_excel_aligned_light_chain_pdb_filename_column] = (
output_aligned_fn + f"_chain_{light_chain_id}.pdb"
)
df.loc[index, output_excel_aligned_light_chain_id_column] = light_chain_id

if output_excel_filename is not None:
df.to_excel(output_excel_filename)
print("saved ", output_excel_filename)

return df


if __name__ == "__main__":
CLI(main)
71 changes: 71 additions & 0 deletions fusedrug/data/protein/structure/extract_chains_to_pdbs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from jsonargparse import CLI
from fusedrug.data.protein.structure.structure_io import (
load_pdb_chain_features,
save_structure_file,
)
from typing import Optional


def main(
*,
input_pdb_path: str,
orig_name_chains_to_extract: str,
output_pdb_path_extensionless: str,
output_chain_ids_to_extract: Optional[str] = None,
) -> None:
"""
Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired
Args:
input_pdb_path:
input_chain_ids_to_extract: '_' separated chain ids
output_chain_ids_to_extract: '_' separated chain ids
if not provided, will keep original chain ids
"""

orig_name_chains_to_extract = orig_name_chains_to_extract.split("_")
if output_chain_ids_to_extract is None:
output_chain_ids_to_extract = orig_name_chains_to_extract.split("_")
else:
output_chain_ids_to_extract = output_chain_ids_to_extract.split("_")

assert len(orig_name_chains_to_extract) > 0
assert len(orig_name_chains_to_extract) == len(output_chain_ids_to_extract)
assert len(orig_name_chains_to_extract[0]) == 1

loaded_chains = {}
for orig_chain_id in orig_name_chains_to_extract:
loaded_chains[orig_chain_id] = load_pdb_chain_features(
input_pdb_path, orig_chain_id
)

mapping = dict(zip(orig_name_chains_to_extract, output_chain_ids_to_extract))

loaded_chains_mapped = {
mapping[chain_id]: data for (chain_id, data) in loaded_chains.items()
}

save_structure_file(
output_filename_extensionless=output_pdb_path_extensionless,
pdb_id="unknown",
chain_to_atom14={
chain_id: data["atom14_gt_positions"]
for (chain_id, data) in loaded_chains_mapped.items()
},
chain_to_aa_str_seq={
chain_id: data["aasequence_str"]
for (chain_id, data) in loaded_chains_mapped.items()
},
chain_to_aa_index_seq={
chain_id: data["aatype"]
for (chain_id, data) in loaded_chains_mapped.items()
},
save_cif=False,
mask=None, # TODO: check
)


if __name__ == "__main__":
CLI(main)
Loading

0 comments on commit 4a1208a

Please sign in to comment.