automatic official FASTA file fetching, several new utility functions…

… related to structure including flexible 3d alignment that supports different length chains to be aligned! (#101) Co-authored-by: YoelShoshan <[email protected]> Co-authored-by: [email protected] <[email protected]>
BiomedSciAI · May 26, 2024 · 4a1208a · 4a1208a
1 parent c788d1a
commit 4a1208a
Show file tree

Hide file tree

Showing 9 changed files with 731 additions and 48 deletions.
diff --git a/fusedrug/data/protein/antibody/antibody.py b/fusedrug/data/protein/antibody/antibody.py
@@ -1,4 +1,4 @@
-from typing import List, Dict
+from typing import List, Dict, Optional
 from fusedrug.data.protein.structure.sabdab import load_sabdab_dataframe
 import pandas as pd
 from collections import namedtuple
@@ -33,12 +33,16 @@ def get_antibody_regions(sequence: str, scheme: str = "chothia") -> Dict[str, st
     return ans
 
 
-def get_antibodies_info_from_sabdab(antibodies_pdb_ids: List[str]) -> List[Antibody]:
+def get_antibodies_info_from_sabdab(
+    antibodies_pdb_ids: Optional[List[str]] = None,
+) -> List[Antibody]:
     """
     Collects information on all provided antibodies_pdb_ids based on SabDab DB.
 
     """
     sabdab_df = load_sabdab_dataframe()
+    if antibodies_pdb_ids is None:
+        antibodies_pdb_ids = sabdab_df.pdb.unique().tolist()
     antibodies = []
     for pdb_id in antibodies_pdb_ids:
         found = sabdab_df[sabdab_df.pdb == pdb_id]

diff --git a/fusedrug/data/protein/sequence/official_pdb_fasta.py b/fusedrug/data/protein/sequence/official_pdb_fasta.py
@@ -0,0 +1,38 @@
+from io import StringIO
+from Bio import SeqIO
+from urllib.request import urlopen
+from typing import Dict
+
+
+def get_fasta_from_rcsb(pdb_id: str) -> Dict:  # TODO: consider adding caching
+    """
+    Given some pdb_id, (like "7vux"), we will retrieve its fasta file from rcsb database and return it as a dict {chain: sequence}.
+    """
+    fasta_data = (
+        urlopen(f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}")
+        .read()
+        .decode("utf-8")
+    )
+    fasta_file_handle = StringIO(fasta_data)
+    chains_full_seq = SeqIO.to_dict(
+        SeqIO.parse(fasta_file_handle, "fasta"),
+        key_function=lambda rec: _description_to_author_chain_id(rec.description),
+    )
+    chains_full_seq = {k: str(d.seq) for (k, d) in chains_full_seq.items()}
+    return chains_full_seq
+
+
+def _description_to_author_chain_id(description: str) -> str:
+    loc = description.find(" ")
+    assert loc >= 0
+    description = description[loc + 1 :]
+    loc = description.find(",")
+    if loc >= 0:
+        description = description[:loc]
+
+    token = "auth "
+    loc = description.find(token)
+    if loc >= 0:
+        return description[loc + len(token)]
+
+    return description[0]
diff --git a/fusedrug/data/protein/structure/align_multiple_antibodies.py b/fusedrug/data/protein/structure/align_multiple_antibodies.py
@@ -0,0 +1,104 @@
+from os.path import join, dirname
+from fusedrug.data.protein.structure.flexible_align_chains_structure import (
+    flexible_align_chains_structure,
+)
+from jsonargparse import CLI
+import pandas as pd
+from typing import Optional
+import numpy as np
+
+
+def main(
+    input_excel_filename: str,
+    unique_id_column: str,
+    reference_heavy_chain_pdb_filename_column: str,
+    reference_heavy_chain_id_column: str,
+    heavy_chain_pdb_filename_column: str,
+    heavy_chain_id_column: str,
+    light_chain_pdb_filename_column: str,
+    light_chain_id_column: str,
+    aligned_using_only_heavy_chain: bool = True,
+    output_structure_file_prefix: str = "aligned_antibody_",
+    output_excel_filename: Optional[str] = None,
+    output_excel_aligned_heavy_chain_pdb_filename_column: str = "aligned_heavy_chain_pdb_filename",
+    output_excel_aligned_heavy_chain_id_column: str = None,
+    output_excel_aligned_light_chain_pdb_filename_column: str = "aligned_light_chain_pdb_filename",
+    output_excel_aligned_light_chain_id_column: str = None,
+) -> pd.DataFrame:
+
+    assert (
+        aligned_using_only_heavy_chain
+    ), "only supporting aligned_using_only_heavy_chain=True for now. Note that flexible_align_chains_structure is indeed flexible enough to support this, if needed."
+
+    df = pd.read_excel(input_excel_filename, index_col=unique_id_column)
+
+    # base = '/dccstor/dsa-ab-cli-val-0/2024_feb_delivery/top_100_with_indels/antibody_dimers_af2_predicted_structure'
+    # reference_heavy_chain = '/dccstor/dsa-ab-cli-val-0/targets/PD-1/7VUX/relaxed_complex/PD1_7VUX_H_eq.pdb'
+
+    df[output_excel_aligned_heavy_chain_pdb_filename_column] = np.nan
+    df[output_excel_aligned_heavy_chain_id_column] = np.nan
+    df[output_excel_aligned_light_chain_pdb_filename_column] = np.nan
+    df[output_excel_aligned_light_chain_id_column] = np.nan
+
+    for index, row in df.iterrows():
+        reference_heavy_chain_pdb_filename = row[
+            reference_heavy_chain_pdb_filename_column
+        ]
+        reference_heavy_chain_id = row[reference_heavy_chain_id_column]
+        # reference_light_chain_id = row[reference_light_chain_id_column]
+
+        # heavy chain
+        heavy_chain_pdb_filename = row[heavy_chain_pdb_filename_column]
+        heavy_chain_id = row[heavy_chain_id_column]  # 'A'
+        # light chain
+        light_chain_pdb_filename = row[light_chain_pdb_filename_column]
+        light_chain_id = row[light_chain_id_column]  # 'B'
+
+        output_aligned_fn = join(
+            dirname(heavy_chain_pdb_filename), output_structure_file_prefix
+        )
+
+        if not isinstance(reference_heavy_chain_pdb_filename, str):
+            print(
+                f"ERROR: expected reference_heavy_chain_pdb_filename to be string, but got {reference_heavy_chain_pdb_filename} of type {type(reference_heavy_chain_pdb_filename)}"
+            )
+            continue
+
+        if len(reference_heavy_chain_pdb_filename) < 2:
+            print(
+                f'ERROR: expected reference_heavy_chain_pdb_filename to be string, but got a suspicious empty or extremely short one: "{reference_heavy_chain_pdb_filename}"'
+            )
+            continue
+
+        flexible_align_chains_structure(
+            dynamic_ordered_chains=[(heavy_chain_pdb_filename, heavy_chain_id)],
+            apply_rigid_transformation_to_dynamic_chain_ids=[
+                (heavy_chain_pdb_filename, heavy_chain_id),
+                (light_chain_pdb_filename, light_chain_id),
+            ],
+            static_ordered_chains=[
+                (reference_heavy_chain_pdb_filename, reference_heavy_chain_id)
+            ],
+            output_pdb_filename_extentionless=output_aligned_fn,
+        )
+
+        # heavy chain
+        df.loc[index, output_excel_aligned_heavy_chain_pdb_filename_column] = (
+            output_aligned_fn + f"_chain_{heavy_chain_id}.pdb"
+        )
+        df.loc[index, output_excel_aligned_heavy_chain_id_column] = heavy_chain_id
+        # light chain
+        df.loc[index, output_excel_aligned_light_chain_pdb_filename_column] = (
+            output_aligned_fn + f"_chain_{light_chain_id}.pdb"
+        )
+        df.loc[index, output_excel_aligned_light_chain_id_column] = light_chain_id
+
+    if output_excel_filename is not None:
+        df.to_excel(output_excel_filename)
+        print("saved ", output_excel_filename)
+
+    return df
+
+
+if __name__ == "__main__":
+    CLI(main)
diff --git a/fusedrug/data/protein/structure/extract_chains_to_pdbs.py b/fusedrug/data/protein/structure/extract_chains_to_pdbs.py
@@ -0,0 +1,71 @@
+from jsonargparse import CLI
+from fusedrug.data.protein.structure.structure_io import (
+    load_pdb_chain_features,
+    save_structure_file,
+)
+from typing import Optional
+
+
+def main(
+    *,
+    input_pdb_path: str,
+    orig_name_chains_to_extract: str,
+    output_pdb_path_extensionless: str,
+    output_chain_ids_to_extract: Optional[str] = None,
+) -> None:
+    """
+
+    Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired
+
+    Args:
+    input_pdb_path:
+    input_chain_ids_to_extract: '_' separated chain ids
+    output_chain_ids_to_extract: '_' separated chain ids
+        if not provided, will keep original chain ids
+
+    """
+
+    orig_name_chains_to_extract = orig_name_chains_to_extract.split("_")
+    if output_chain_ids_to_extract is None:
+        output_chain_ids_to_extract = orig_name_chains_to_extract.split("_")
+    else:
+        output_chain_ids_to_extract = output_chain_ids_to_extract.split("_")
+
+    assert len(orig_name_chains_to_extract) > 0
+    assert len(orig_name_chains_to_extract) == len(output_chain_ids_to_extract)
+    assert len(orig_name_chains_to_extract[0]) == 1
+
+    loaded_chains = {}
+    for orig_chain_id in orig_name_chains_to_extract:
+        loaded_chains[orig_chain_id] = load_pdb_chain_features(
+            input_pdb_path, orig_chain_id
+        )
+
+    mapping = dict(zip(orig_name_chains_to_extract, output_chain_ids_to_extract))
+
+    loaded_chains_mapped = {
+        mapping[chain_id]: data for (chain_id, data) in loaded_chains.items()
+    }
+
+    save_structure_file(
+        output_filename_extensionless=output_pdb_path_extensionless,
+        pdb_id="unknown",
+        chain_to_atom14={
+            chain_id: data["atom14_gt_positions"]
+            for (chain_id, data) in loaded_chains_mapped.items()
+        },
+        chain_to_aa_str_seq={
+            chain_id: data["aasequence_str"]
+            for (chain_id, data) in loaded_chains_mapped.items()
+        },
+        chain_to_aa_index_seq={
+            chain_id: data["aatype"]
+            for (chain_id, data) in loaded_chains_mapped.items()
+        },
+        save_cif=False,
+        mask=None,  # TODO: check
+    )
+
+
+if __name__ == "__main__":
+    CLI(main)