From 346382948ad7c374e250cb26e9a86858b6f4ea2f Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Sun, 15 Dec 2024 10:59:04 -0500 Subject: [PATCH 1/9] added a programmatic/CLI util to split scfv to heavy and light chains --- .../protein/structure/split_scfv_chain.py | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 fusedrug/data/protein/structure/split_scfv_chain.py diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py new file mode 100644 index 00000000..a7a62722 --- /dev/null +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -0,0 +1,130 @@ +from jsonargparse import CLI +from fusedrug.data.protein.structure.structure_io import ( + load_pdb_chain_features, + save_structure_file, +) +from typing import Optional, Sequence +from os.path import isfile, join, dirname, basename +import os +import sys +import subprocess + +def main( + *, + input_pdb_path: str, + input_scfv_chain_id: str, + output_pdb_path_extensionless: str, + output_heavy_chain_id: Optional[str] = 'H', + output_light_chain_id: Optional[str] = 'L', + cleanup_temp_files:bool = True, +) -> None: + """ + + Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired + + Args: + input_pdb_path: + + """ + + loaded_scfv = load_pdb_chain_features( + input_pdb_path, input_scfv_chain_id + ) + + scfv_seq = loaded_scfv['aasequence_str'] + + scfv_sequence_filename = join( + dirname(input_pdb_path), + f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt", + ) + + if not isfile(scfv_sequence_filename): + with open(scfv_sequence_filename, 'wt') as f: + f.write(f'>scfv_{input_scfv_chain_id}:...\n{scfv_seq}\n') + + # run anarci: + anarci_executable = join(dirname(sys.executable), "ANARCI") + if not isfile(anarci_executable): + raise Exception( + f"ANARCI binary not found in {dirname(sys.executable)}. check installation" + ) + + anarci_output_filename = join( + dirname(input_pdb_path), + f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt", + ) + + if not isfile(anarci_output_filename): + subprocess.run( + [ + anarci_executable, + "-i", + scfv_sequence_filename, + "-o", + anarci_output_filename, + ] + ) + # parse anarci outputs and obtain separate heavy and light chains: + heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output(anarci_output_filename) + #assert len(heavy_chains) == len(light_chains) == len(sequences) + + #cleanup + if cleanup_temp_files: + os.remove(scfv_sequence_filename) + os.remove(anarci_output_filename) + + heavy_start = scfv_seq.find(heavy_chain) + assert heavy_start >= 0 + + light_start = scfv_seq.find(light_chain) + assert light_start >= 0 + + saved_files = save_structure_file( + output_filename_extensionless=output_pdb_path_extensionless, + pdb_id="unknown", + chain_to_atom14={ + output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][heavy_start:heavy_start+len(heavy_chain)], + output_light_chain_id: loaded_scfv["atom14_gt_positions"][light_start:light_start+len(light_chain)], + }, + chain_to_aa_str_seq={ + output_heavy_chain_id: loaded_scfv["aasequence_str"][heavy_start:heavy_start+len(heavy_chain)], + output_light_chain_id: loaded_scfv["aasequence_str"][light_start:light_start+len(light_chain)], + }, + chain_to_aa_index_seq={ + output_heavy_chain_id: loaded_scfv["aatype"][heavy_start:heavy_start+len(heavy_chain)], + output_light_chain_id: loaded_scfv["aatype"][light_start:light_start+len(light_chain)], + }, + save_cif=False, + mask=None, # TODO: check + ) + + print(f"saved {saved_files}") + + +def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[str]]: + # parses ANARCI output on a fasta file of a single heavy and light chain domains + heavy_chain = [] + light_chain = [] + with open(filename, "r") as file: + for line in file: + if line.startswith("#"): + continue + else: + parts = line.split() + residue = parts[-1] + if residue == "-": + continue + if line.startswith("H"): + heavy_chain.append(residue) + elif line.startswith("L"): + light_chain.append(residue) + # last sequence: + + heavy_chain = "".join(heavy_chain) + light_chain = "".join(light_chain) + + return heavy_chain, light_chain + + +if __name__ == "__main__": + CLI(main, as_positional=False) From 9c62639b1a41399168c8e8591d4717cf7ec5fbab Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Sun, 15 Dec 2024 12:46:46 -0500 Subject: [PATCH 2/9] scfv --- .../protein/structure/split_scfv_chain.py | 44 +++++++++++++++---- .../data/protein/structure/structure_io.py | 7 +-- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index a7a62722..272e9f3b 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -16,6 +16,7 @@ def main( output_pdb_path_extensionless: str, output_heavy_chain_id: Optional[str] = 'H', output_light_chain_id: Optional[str] = 'L', + passthrough_chains: Optional[str] = None, cleanup_temp_files:bool = True, ) -> None: """ @@ -24,9 +25,15 @@ def main( Args: input_pdb_path: + + passthrough_chains: optional, will be "pass through", '_' separated if you want multiple """ + if passthrough_chains is not None: + passthrough_chains = passthrough_chains.split('_') + + loaded_scfv = load_pdb_chain_features( input_pdb_path, input_scfv_chain_id ) @@ -79,27 +86,46 @@ def main( light_start = scfv_seq.find(light_chain) assert light_start >= 0 - saved_files = save_structure_file( - output_filename_extensionless=output_pdb_path_extensionless, - pdb_id="unknown", - chain_to_atom14={ + chain_to_atom14={ output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][heavy_start:heavy_start+len(heavy_chain)], output_light_chain_id: loaded_scfv["atom14_gt_positions"][light_start:light_start+len(light_chain)], - }, - chain_to_aa_str_seq={ + } + + chain_to_aa_str_seq={ output_heavy_chain_id: loaded_scfv["aasequence_str"][heavy_start:heavy_start+len(heavy_chain)], output_light_chain_id: loaded_scfv["aasequence_str"][light_start:light_start+len(light_chain)], - }, - chain_to_aa_index_seq={ + } + + chain_to_aa_index_seq={ output_heavy_chain_id: loaded_scfv["aatype"][heavy_start:heavy_start+len(heavy_chain)], output_light_chain_id: loaded_scfv["aatype"][light_start:light_start+len(light_chain)], - }, + } + + + if passthrough_chains is not None: + for chain_id in passthrough_chains: + curr_loaded_chain_data = load_pdb_chain_features(input_pdb_path, chain_id) + + chain_to_atom14[chain_id] = curr_loaded_chain_data['atom14_gt_positions'] + chain_to_aa_str_seq[chain_id] = curr_loaded_chain_data['aasequence_str'] + chain_to_aa_index_seq[chain_id] = curr_loaded_chain_data['aatype'] + + + saved_files = save_structure_file( + output_filename_extensionless=output_pdb_path_extensionless, + pdb_id="unknown", + chain_to_atom14=chain_to_atom14, + chain_to_aa_str_seq=chain_to_aa_str_seq, + chain_to_aa_index_seq=chain_to_aa_index_seq, save_cif=False, mask=None, # TODO: check ) + assert len(saved_files) == 1 print(f"saved {saved_files}") + return saved_files[0] + def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[str]]: # parses ANARCI output on a fasta file of a single heavy and light chain domains diff --git a/fusedrug/data/protein/structure/structure_io.py b/fusedrug/data/protein/structure/structure_io.py index 72aa0f94..ebe730d3 100644 --- a/fusedrug/data/protein/structure/structure_io.py +++ b/fusedrug/data/protein/structure/structure_io.py @@ -798,10 +798,11 @@ def flexible_save_pdb_file( and ((b_factors is None) or isinstance(b_factors, dict)) ) - assert list(xyz.keys()) == list(sequence.keys()) - assert list(xyz.keys()) == list(residues_mask.keys()) + assert set(xyz.keys()) == set(sequence.keys()) + assert set(xyz.keys()) == set(residues_mask.keys()) + if b_factors is not None: - assert list(xyz.keys()) == list(b_factors.keys()) + assert set(xyz.keys()) == set(b_factors.keys()) if only_save_backbone: print( From 3826feaf9eade33262185b589e331c59540ee213 Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Mon, 16 Dec 2024 06:27:13 -0500 Subject: [PATCH 3/9] scfv --- fusedrug/data/protein/structure/split_scfv_chain.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index 272e9f3b..b477bca1 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -8,6 +8,7 @@ import os import sys import subprocess +import threading def main( *, @@ -40,9 +41,11 @@ def main( scfv_seq = loaded_scfv['aasequence_str'] + safety = f'_{os.getpid()}_{threading.get_ident()}' + scfv_sequence_filename = join( dirname(input_pdb_path), - f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt", + f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt", ) if not isfile(scfv_sequence_filename): @@ -58,7 +61,7 @@ def main( anarci_output_filename = join( dirname(input_pdb_path), - f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt", + f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt", ) if not isfile(anarci_output_filename): @@ -144,7 +147,7 @@ def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[s heavy_chain.append(residue) elif line.startswith("L"): light_chain.append(residue) - # last sequence: + heavy_chain = "".join(heavy_chain) light_chain = "".join(light_chain) From 73dd6321267ced7396b6a539ca465f6d4bd2cc57 Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Mon, 16 Dec 2024 06:27:35 -0500 Subject: [PATCH 4/9] scfv --- .../protein/structure/split_scfv_chain.py | 111 ++++++++++-------- 1 file changed, 63 insertions(+), 48 deletions(-) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index b477bca1..512d4f9b 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -10,48 +10,49 @@ import subprocess import threading + def main( *, input_pdb_path: str, input_scfv_chain_id: str, output_pdb_path_extensionless: str, - output_heavy_chain_id: Optional[str] = 'H', - output_light_chain_id: Optional[str] = 'L', + output_heavy_chain_id: Optional[str] = "H", + output_light_chain_id: Optional[str] = "L", passthrough_chains: Optional[str] = None, - cleanup_temp_files:bool = True, + cleanup_temp_files: bool = True, ) -> None: """ Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired Args: - input_pdb_path: + input_pdb_path: passthrough_chains: optional, will be "pass through", '_' separated if you want multiple - + """ if passthrough_chains is not None: - passthrough_chains = passthrough_chains.split('_') - + passthrough_chains = passthrough_chains.split("_") - loaded_scfv = load_pdb_chain_features( - input_pdb_path, input_scfv_chain_id - ) + loaded_scfv = load_pdb_chain_features(input_pdb_path, input_scfv_chain_id) - scfv_seq = loaded_scfv['aasequence_str'] + scfv_seq = loaded_scfv["aasequence_str"] - safety = f'_{os.getpid()}_{threading.get_ident()}' + safety = f"_{os.getpid()}_{threading.get_ident()}" scfv_sequence_filename = join( dirname(input_pdb_path), - f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt", + f"sequence_info_{input_scfv_chain_id}_" + + basename(input_pdb_path) + + safety + + ".txt", ) if not isfile(scfv_sequence_filename): - with open(scfv_sequence_filename, 'wt') as f: - f.write(f'>scfv_{input_scfv_chain_id}:...\n{scfv_seq}\n') - + with open(scfv_sequence_filename, "wt") as f: + f.write(f">scfv_{input_scfv_chain_id}:...\n{scfv_seq}\n") + # run anarci: anarci_executable = join(dirname(sys.executable), "ANARCI") if not isfile(anarci_executable): @@ -61,7 +62,10 @@ def main( anarci_output_filename = join( dirname(input_pdb_path), - f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt", + f"anarci_output_{input_scfv_chain_id}_" + + basename(input_pdb_path) + + safety + + ".txt", ) if not isfile(anarci_output_filename): @@ -75,10 +79,12 @@ def main( ] ) # parse anarci outputs and obtain separate heavy and light chains: - heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output(anarci_output_filename) - #assert len(heavy_chains) == len(light_chains) == len(sequences) - - #cleanup + heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output( + anarci_output_filename + ) + # assert len(heavy_chains) == len(light_chains) == len(sequences) + + # cleanup if cleanup_temp_files: os.remove(scfv_sequence_filename) os.remove(anarci_output_filename) @@ -89,30 +95,40 @@ def main( light_start = scfv_seq.find(light_chain) assert light_start >= 0 - chain_to_atom14={ - output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][heavy_start:heavy_start+len(heavy_chain)], - output_light_chain_id: loaded_scfv["atom14_gt_positions"][light_start:light_start+len(light_chain)], - } - - chain_to_aa_str_seq={ - output_heavy_chain_id: loaded_scfv["aasequence_str"][heavy_start:heavy_start+len(heavy_chain)], - output_light_chain_id: loaded_scfv["aasequence_str"][light_start:light_start+len(light_chain)], - } - - chain_to_aa_index_seq={ - output_heavy_chain_id: loaded_scfv["aatype"][heavy_start:heavy_start+len(heavy_chain)], - output_light_chain_id: loaded_scfv["aatype"][light_start:light_start+len(light_chain)], - } - + chain_to_atom14 = { + output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][ + heavy_start : heavy_start + len(heavy_chain) + ], + output_light_chain_id: loaded_scfv["atom14_gt_positions"][ + light_start : light_start + len(light_chain) + ], + } + + chain_to_aa_str_seq = { + output_heavy_chain_id: loaded_scfv["aasequence_str"][ + heavy_start : heavy_start + len(heavy_chain) + ], + output_light_chain_id: loaded_scfv["aasequence_str"][ + light_start : light_start + len(light_chain) + ], + } + + chain_to_aa_index_seq = { + output_heavy_chain_id: loaded_scfv["aatype"][ + heavy_start : heavy_start + len(heavy_chain) + ], + output_light_chain_id: loaded_scfv["aatype"][ + light_start : light_start + len(light_chain) + ], + } if passthrough_chains is not None: for chain_id in passthrough_chains: curr_loaded_chain_data = load_pdb_chain_features(input_pdb_path, chain_id) - - chain_to_atom14[chain_id] = curr_loaded_chain_data['atom14_gt_positions'] - chain_to_aa_str_seq[chain_id] = curr_loaded_chain_data['aasequence_str'] - chain_to_aa_index_seq[chain_id] = curr_loaded_chain_data['aatype'] + chain_to_atom14[chain_id] = curr_loaded_chain_data["atom14_gt_positions"] + chain_to_aa_str_seq[chain_id] = curr_loaded_chain_data["aasequence_str"] + chain_to_aa_index_seq[chain_id] = curr_loaded_chain_data["aatype"] saved_files = save_structure_file( output_filename_extensionless=output_pdb_path_extensionless, @@ -124,7 +140,7 @@ def main( mask=None, # TODO: check ) - assert len(saved_files) == 1 + assert len(saved_files) == 1 print(f"saved {saved_files}") return saved_files[0] @@ -135,7 +151,7 @@ def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[s heavy_chain = [] light_chain = [] with open(filename, "r") as file: - for line in file: + for line in file: if line.startswith("#"): continue else: @@ -143,16 +159,15 @@ def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[s residue = parts[-1] if residue == "-": continue - if line.startswith("H"): + if line.startswith("H"): heavy_chain.append(residue) - elif line.startswith("L"): + elif line.startswith("L"): light_chain.append(residue) - - + heavy_chain = "".join(heavy_chain) light_chain = "".join(light_chain) - - return heavy_chain, light_chain + + return heavy_chain, light_chain if __name__ == "__main__": From 03748219d4e7afa61252fcb9cd1529fe16e0e3d6 Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Mon, 16 Dec 2024 09:19:11 -0500 Subject: [PATCH 5/9] ... --- fusedrug/data/protein/structure/split_scfv_chain.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index 512d4f9b..50a0ef6d 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -82,6 +82,12 @@ def main( heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output( anarci_output_filename ) + + if 0==len(heavy_chain): + raise Exception("ANARCI could not find the heavy chain domain") + + if 0==len(light_chain): + raise Exception("ANARCI could not find the light chain domain") # assert len(heavy_chains) == len(light_chains) == len(sequences) # cleanup From 9b30549774726337b92287bf7569c8d5782d16c2 Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Mon, 16 Dec 2024 09:19:29 -0500 Subject: [PATCH 6/9] ... --- fusedrug/data/protein/structure/split_scfv_chain.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index 50a0ef6d..242faae6 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -83,10 +83,10 @@ def main( anarci_output_filename ) - if 0==len(heavy_chain): + if 0 == len(heavy_chain): raise Exception("ANARCI could not find the heavy chain domain") - if 0==len(light_chain): + if 0 == len(light_chain): raise Exception("ANARCI could not find the light chain domain") # assert len(heavy_chains) == len(light_chains) == len(sequences) From 9fda3d19ab93fd8bc196fe880d89876dbe228e10 Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Tue, 14 Jan 2025 04:10:51 -0500 Subject: [PATCH 7/9] PR comments --- fusedrug/data/protein/structure/split_scfv_chain.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index 242faae6..0f50dac6 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -23,7 +23,10 @@ def main( ) -> None: """ - Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired + Takes an input PDB file and allows to split scfv within it to 2 separate chains. + This is useful for modifying such PDB to be used in follow up steps that assume such separate chains for heavy and light chain. + + It allows also to "passthrough" additional chains to maintain a "full" PDB. Args: input_pdb_path: @@ -88,7 +91,6 @@ def main( if 0 == len(light_chain): raise Exception("ANARCI could not find the light chain domain") - # assert len(heavy_chains) == len(light_chains) == len(sequences) # cleanup if cleanup_temp_files: From 9887e1cf5d475bec744a864cb8147724077527c1 Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Tue, 14 Jan 2025 04:13:05 -0500 Subject: [PATCH 8/9] ... --- fusedrug/data/protein/structure/split_scfv_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index 0f50dac6..2c09cf86 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -81,7 +81,7 @@ def main( anarci_output_filename, ] ) - # parse anarci outputs and obtain separate heavy and light chains: + # parse anarci outputs and obtain separate heavy and light chains: heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output( anarci_output_filename ) From a8f719d90a5edf7131677b0cd6cb963316f83778 Mon Sep 17 00:00:00 2001 From: yoel shoshan Date: Tue, 14 Jan 2025 06:18:36 -0500 Subject: [PATCH 9/9] better message when ANARCI is missing --- fusedrug/data/protein/structure/split_scfv_chain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py index 2c09cf86..bedddbb0 100644 --- a/fusedrug/data/protein/structure/split_scfv_chain.py +++ b/fusedrug/data/protein/structure/split_scfv_chain.py @@ -60,7 +60,7 @@ def main( anarci_executable = join(dirname(sys.executable), "ANARCI") if not isfile(anarci_executable): raise Exception( - f"ANARCI binary not found in {dirname(sys.executable)}. check installation" + f"ANARCI binary not found in {dirname(sys.executable)}. check installation. You can install it in your env like this: conda install -c bioconda abnumber" ) anarci_output_filename = join(