From 346382948ad7c374e250cb26e9a86858b6f4ea2f Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Sun, 15 Dec 2024 10:59:04 -0500
Subject: [PATCH 1/9] added a programmatic/CLI util to split scfv to heavy and
 light chains

---
 .../protein/structure/split_scfv_chain.py     | 130 ++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 fusedrug/data/protein/structure/split_scfv_chain.py

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
new file mode 100644
index 00000000..a7a62722
--- /dev/null
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -0,0 +1,130 @@
+from jsonargparse import CLI
+from fusedrug.data.protein.structure.structure_io import (
+    load_pdb_chain_features,
+    save_structure_file,
+)
+from typing import Optional, Sequence
+from os.path import isfile, join, dirname, basename
+import os
+import sys
+import subprocess
+
+def main(
+    *,
+    input_pdb_path: str,
+    input_scfv_chain_id: str,
+    output_pdb_path_extensionless: str,
+    output_heavy_chain_id: Optional[str] = 'H',
+    output_light_chain_id: Optional[str] = 'L',
+    cleanup_temp_files:bool = True,
+) -> None:
+    """
+
+    Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired
+
+    Args:
+    input_pdb_path:    
+    
+    """
+
+    loaded_scfv = load_pdb_chain_features(
+        input_pdb_path, input_scfv_chain_id
+    )
+
+    scfv_seq = loaded_scfv['aasequence_str']
+
+    scfv_sequence_filename = join(
+        dirname(input_pdb_path),
+        f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt",
+    )
+
+    if not isfile(scfv_sequence_filename):
+        with open(scfv_sequence_filename, 'wt') as f:
+            f.write(f'>scfv_{input_scfv_chain_id}:...\n{scfv_seq}\n')
+        
+    # run anarci:
+    anarci_executable = join(dirname(sys.executable), "ANARCI")
+    if not isfile(anarci_executable):
+        raise Exception(
+            f"ANARCI binary not found in {dirname(sys.executable)}. check installation"
+        )
+
+    anarci_output_filename = join(
+        dirname(input_pdb_path),
+        f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt",
+    )
+
+    if not isfile(anarci_output_filename):
+        subprocess.run(
+            [
+                anarci_executable,
+                "-i",
+                scfv_sequence_filename,
+                "-o",
+                anarci_output_filename,
+            ]
+        )
+    # parse anarci outputs and obtain separate heavy and light chains:
+    heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output(anarci_output_filename)
+    #assert len(heavy_chains) == len(light_chains) == len(sequences)
+    
+    #cleanup
+    if cleanup_temp_files:
+        os.remove(scfv_sequence_filename)
+        os.remove(anarci_output_filename)
+
+    heavy_start = scfv_seq.find(heavy_chain)
+    assert heavy_start >= 0
+
+    light_start = scfv_seq.find(light_chain)
+    assert light_start >= 0
+
+    saved_files = save_structure_file(
+        output_filename_extensionless=output_pdb_path_extensionless,
+        pdb_id="unknown",
+        chain_to_atom14={
+            output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][heavy_start:heavy_start+len(heavy_chain)],
+            output_light_chain_id: loaded_scfv["atom14_gt_positions"][light_start:light_start+len(light_chain)],
+        },
+        chain_to_aa_str_seq={
+            output_heavy_chain_id: loaded_scfv["aasequence_str"][heavy_start:heavy_start+len(heavy_chain)],
+            output_light_chain_id: loaded_scfv["aasequence_str"][light_start:light_start+len(light_chain)],            
+        },
+        chain_to_aa_index_seq={
+            output_heavy_chain_id: loaded_scfv["aatype"][heavy_start:heavy_start+len(heavy_chain)],
+            output_light_chain_id: loaded_scfv["aatype"][light_start:light_start+len(light_chain)],            
+        },
+        save_cif=False,
+        mask=None,  # TODO: check
+    )
+
+    print(f"saved {saved_files}")
+
+
+def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[str]]:
+    # parses ANARCI output on a fasta file of a single heavy and light chain domains
+    heavy_chain = []
+    light_chain = []
+    with open(filename, "r") as file:
+        for line in file:            
+            if line.startswith("#"):
+                continue
+            else:
+                parts = line.split()
+                residue = parts[-1]
+                if residue == "-":
+                    continue
+                if line.startswith("H"):                   
+                    heavy_chain.append(residue)
+                elif line.startswith("L"):                    
+                    light_chain.append(residue)
+        # last sequence:
+    
+    heavy_chain = "".join(heavy_chain)
+    light_chain = "".join(light_chain)
+        
+    return heavy_chain, light_chain    
+
+
+if __name__ == "__main__":
+    CLI(main, as_positional=False)

From 9c62639b1a41399168c8e8591d4717cf7ec5fbab Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Sun, 15 Dec 2024 12:46:46 -0500
Subject: [PATCH 2/9] scfv: add passthrough_chains option and return the saved file path

---
 .../protein/structure/split_scfv_chain.py     | 44 +++++++++++++++----
 .../data/protein/structure/structure_io.py    |  7 +--
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index a7a62722..272e9f3b 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -16,6 +16,7 @@ def main(
     output_pdb_path_extensionless: str,
     output_heavy_chain_id: Optional[str] = 'H',
     output_light_chain_id: Optional[str] = 'L',
+    passthrough_chains: Optional[str] = None,
     cleanup_temp_files:bool = True,
 ) -> None:
     """
@@ -24,9 +25,15 @@ def main(
 
     Args:
     input_pdb_path:    
+
+    passthrough_chains: optional, will be "pass through", '_' separated if you want multiple
     
     """
 
+    if passthrough_chains is not None:
+        passthrough_chains = passthrough_chains.split('_')
+
+
     loaded_scfv = load_pdb_chain_features(
         input_pdb_path, input_scfv_chain_id
     )
@@ -79,27 +86,46 @@ def main(
     light_start = scfv_seq.find(light_chain)
     assert light_start >= 0
 
-    saved_files = save_structure_file(
-        output_filename_extensionless=output_pdb_path_extensionless,
-        pdb_id="unknown",
-        chain_to_atom14={
+    chain_to_atom14={
             output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][heavy_start:heavy_start+len(heavy_chain)],
             output_light_chain_id: loaded_scfv["atom14_gt_positions"][light_start:light_start+len(light_chain)],
-        },
-        chain_to_aa_str_seq={
+        }
+
+    chain_to_aa_str_seq={
             output_heavy_chain_id: loaded_scfv["aasequence_str"][heavy_start:heavy_start+len(heavy_chain)],
             output_light_chain_id: loaded_scfv["aasequence_str"][light_start:light_start+len(light_chain)],            
-        },
-        chain_to_aa_index_seq={
+        }
+
+    chain_to_aa_index_seq={
             output_heavy_chain_id: loaded_scfv["aatype"][heavy_start:heavy_start+len(heavy_chain)],
             output_light_chain_id: loaded_scfv["aatype"][light_start:light_start+len(light_chain)],            
-        },
+        }
+
+
+    if passthrough_chains is not None:
+        for chain_id in passthrough_chains:
+            curr_loaded_chain_data = load_pdb_chain_features(input_pdb_path, chain_id)
+            
+            chain_to_atom14[chain_id] = curr_loaded_chain_data['atom14_gt_positions']
+            chain_to_aa_str_seq[chain_id] = curr_loaded_chain_data['aasequence_str']
+            chain_to_aa_index_seq[chain_id] = curr_loaded_chain_data['aatype']
+
+
+    saved_files = save_structure_file(
+        output_filename_extensionless=output_pdb_path_extensionless,
+        pdb_id="unknown",
+        chain_to_atom14=chain_to_atom14,
+        chain_to_aa_str_seq=chain_to_aa_str_seq,
+        chain_to_aa_index_seq=chain_to_aa_index_seq,
         save_cif=False,
         mask=None,  # TODO: check
     )
 
+    assert len(saved_files) == 1    
     print(f"saved {saved_files}")
 
+    return saved_files[0]
+
 
 def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[str]]:
     # parses ANARCI output on a fasta file of a single heavy and light chain domains
diff --git a/fusedrug/data/protein/structure/structure_io.py b/fusedrug/data/protein/structure/structure_io.py
index 72aa0f94..ebe730d3 100644
--- a/fusedrug/data/protein/structure/structure_io.py
+++ b/fusedrug/data/protein/structure/structure_io.py
@@ -798,10 +798,11 @@ def flexible_save_pdb_file(
         and ((b_factors is None) or isinstance(b_factors, dict))
     )
 
-    assert list(xyz.keys()) == list(sequence.keys())
-    assert list(xyz.keys()) == list(residues_mask.keys())
+    assert set(xyz.keys()) == set(sequence.keys())
+    assert set(xyz.keys()) == set(residues_mask.keys())
+
     if b_factors is not None:
-        assert list(xyz.keys()) == list(b_factors.keys())
+        assert set(xyz.keys()) == set(b_factors.keys())
 
     if only_save_backbone:
         print(

From 3826feaf9eade33262185b589e331c59540ee213 Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Mon, 16 Dec 2024 06:27:13 -0500
Subject: [PATCH 3/9] scfv: make temp filenames unique per process and thread

---
 fusedrug/data/protein/structure/split_scfv_chain.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index 272e9f3b..b477bca1 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -8,6 +8,7 @@
 import os
 import sys
 import subprocess
+import threading
 
 def main(
     *,
@@ -40,9 +41,11 @@ def main(
 
     scfv_seq = loaded_scfv['aasequence_str']
 
+    safety = f'_{os.getpid()}_{threading.get_ident()}'
+
     scfv_sequence_filename = join(
         dirname(input_pdb_path),
-        f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt",
+        f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt",
     )
 
     if not isfile(scfv_sequence_filename):
@@ -58,7 +61,7 @@ def main(
 
     anarci_output_filename = join(
         dirname(input_pdb_path),
-        f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+".txt",
+        f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt",
     )
 
     if not isfile(anarci_output_filename):
@@ -144,7 +147,7 @@ def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[s
                     heavy_chain.append(residue)
                 elif line.startswith("L"):                    
                     light_chain.append(residue)
-        # last sequence:
+        
     
     heavy_chain = "".join(heavy_chain)
     light_chain = "".join(light_chain)

From 73dd6321267ced7396b6a539ca465f6d4bd2cc57 Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Mon, 16 Dec 2024 06:27:35 -0500
Subject: [PATCH 4/9] scfv: reformat split_scfv_chain.py (no functional change)

---
 .../protein/structure/split_scfv_chain.py     | 111 ++++++++++--------
 1 file changed, 63 insertions(+), 48 deletions(-)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index b477bca1..512d4f9b 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -10,48 +10,49 @@
 import subprocess
 import threading
 
+
 def main(
     *,
     input_pdb_path: str,
     input_scfv_chain_id: str,
     output_pdb_path_extensionless: str,
-    output_heavy_chain_id: Optional[str] = 'H',
-    output_light_chain_id: Optional[str] = 'L',
+    output_heavy_chain_id: Optional[str] = "H",
+    output_light_chain_id: Optional[str] = "L",
     passthrough_chains: Optional[str] = None,
-    cleanup_temp_files:bool = True,
+    cleanup_temp_files: bool = True,
 ) -> None:
     """
 
     Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired
 
     Args:
-    input_pdb_path:    
+    input_pdb_path:
 
     passthrough_chains: optional, will be "pass through", '_' separated if you want multiple
-    
+
     """
 
     if passthrough_chains is not None:
-        passthrough_chains = passthrough_chains.split('_')
-
+        passthrough_chains = passthrough_chains.split("_")
 
-    loaded_scfv = load_pdb_chain_features(
-        input_pdb_path, input_scfv_chain_id
-    )
+    loaded_scfv = load_pdb_chain_features(input_pdb_path, input_scfv_chain_id)
 
-    scfv_seq = loaded_scfv['aasequence_str']
+    scfv_seq = loaded_scfv["aasequence_str"]
 
-    safety = f'_{os.getpid()}_{threading.get_ident()}'
+    safety = f"_{os.getpid()}_{threading.get_ident()}"
 
     scfv_sequence_filename = join(
         dirname(input_pdb_path),
-        f"sequence_info_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt",
+        f"sequence_info_{input_scfv_chain_id}_"
+        + basename(input_pdb_path)
+        + safety
+        + ".txt",
     )
 
     if not isfile(scfv_sequence_filename):
-        with open(scfv_sequence_filename, 'wt') as f:
-            f.write(f'>scfv_{input_scfv_chain_id}:...\n{scfv_seq}\n')
-        
+        with open(scfv_sequence_filename, "wt") as f:
+            f.write(f">scfv_{input_scfv_chain_id}:...\n{scfv_seq}\n")
+
     # run anarci:
     anarci_executable = join(dirname(sys.executable), "ANARCI")
     if not isfile(anarci_executable):
@@ -61,7 +62,10 @@ def main(
 
     anarci_output_filename = join(
         dirname(input_pdb_path),
-        f"anarci_output_{input_scfv_chain_id}_"+basename(input_pdb_path)+safety+".txt",
+        f"anarci_output_{input_scfv_chain_id}_"
+        + basename(input_pdb_path)
+        + safety
+        + ".txt",
     )
 
     if not isfile(anarci_output_filename):
@@ -75,10 +79,12 @@ def main(
             ]
         )
     # parse anarci outputs and obtain separate heavy and light chains:
-    heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output(anarci_output_filename)
-    #assert len(heavy_chains) == len(light_chains) == len(sequences)
-    
-    #cleanup
+    heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output(
+        anarci_output_filename
+    )
+    # assert len(heavy_chains) == len(light_chains) == len(sequences)
+
+    # cleanup
     if cleanup_temp_files:
         os.remove(scfv_sequence_filename)
         os.remove(anarci_output_filename)
@@ -89,30 +95,40 @@ def main(
     light_start = scfv_seq.find(light_chain)
     assert light_start >= 0
 
-    chain_to_atom14={
-            output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][heavy_start:heavy_start+len(heavy_chain)],
-            output_light_chain_id: loaded_scfv["atom14_gt_positions"][light_start:light_start+len(light_chain)],
-        }
-
-    chain_to_aa_str_seq={
-            output_heavy_chain_id: loaded_scfv["aasequence_str"][heavy_start:heavy_start+len(heavy_chain)],
-            output_light_chain_id: loaded_scfv["aasequence_str"][light_start:light_start+len(light_chain)],            
-        }
-
-    chain_to_aa_index_seq={
-            output_heavy_chain_id: loaded_scfv["aatype"][heavy_start:heavy_start+len(heavy_chain)],
-            output_light_chain_id: loaded_scfv["aatype"][light_start:light_start+len(light_chain)],            
-        }
-
+    chain_to_atom14 = {
+        output_heavy_chain_id: loaded_scfv["atom14_gt_positions"][
+            heavy_start : heavy_start + len(heavy_chain)
+        ],
+        output_light_chain_id: loaded_scfv["atom14_gt_positions"][
+            light_start : light_start + len(light_chain)
+        ],
+    }
+
+    chain_to_aa_str_seq = {
+        output_heavy_chain_id: loaded_scfv["aasequence_str"][
+            heavy_start : heavy_start + len(heavy_chain)
+        ],
+        output_light_chain_id: loaded_scfv["aasequence_str"][
+            light_start : light_start + len(light_chain)
+        ],
+    }
+
+    chain_to_aa_index_seq = {
+        output_heavy_chain_id: loaded_scfv["aatype"][
+            heavy_start : heavy_start + len(heavy_chain)
+        ],
+        output_light_chain_id: loaded_scfv["aatype"][
+            light_start : light_start + len(light_chain)
+        ],
+    }
 
     if passthrough_chains is not None:
         for chain_id in passthrough_chains:
             curr_loaded_chain_data = load_pdb_chain_features(input_pdb_path, chain_id)
-            
-            chain_to_atom14[chain_id] = curr_loaded_chain_data['atom14_gt_positions']
-            chain_to_aa_str_seq[chain_id] = curr_loaded_chain_data['aasequence_str']
-            chain_to_aa_index_seq[chain_id] = curr_loaded_chain_data['aatype']
 
+            chain_to_atom14[chain_id] = curr_loaded_chain_data["atom14_gt_positions"]
+            chain_to_aa_str_seq[chain_id] = curr_loaded_chain_data["aasequence_str"]
+            chain_to_aa_index_seq[chain_id] = curr_loaded_chain_data["aatype"]
 
     saved_files = save_structure_file(
         output_filename_extensionless=output_pdb_path_extensionless,
@@ -124,7 +140,7 @@ def main(
         mask=None,  # TODO: check
     )
 
-    assert len(saved_files) == 1    
+    assert len(saved_files) == 1
     print(f"saved {saved_files}")
 
     return saved_files[0]
@@ -135,7 +151,7 @@ def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[s
     heavy_chain = []
     light_chain = []
     with open(filename, "r") as file:
-        for line in file:            
+        for line in file:
             if line.startswith("#"):
                 continue
             else:
@@ -143,16 +159,15 @@ def split_heavy_light_chain_from_anarci_output(filename: str) -> list[Sequence[s
                 residue = parts[-1]
                 if residue == "-":
                     continue
-                if line.startswith("H"):                   
+                if line.startswith("H"):
                     heavy_chain.append(residue)
-                elif line.startswith("L"):                    
+                elif line.startswith("L"):
                     light_chain.append(residue)
-        
-    
+
     heavy_chain = "".join(heavy_chain)
     light_chain = "".join(light_chain)
-        
-    return heavy_chain, light_chain    
+
+    return heavy_chain, light_chain
 
 
 if __name__ == "__main__":

From 03748219d4e7afa61252fcb9cd1529fe16e0e3d6 Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Mon, 16 Dec 2024 09:19:11 -0500
Subject: [PATCH 5/9] scfv: raise when ANARCI finds no heavy or light chain domain

---
 fusedrug/data/protein/structure/split_scfv_chain.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index 512d4f9b..50a0ef6d 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -82,6 +82,12 @@ def main(
     heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output(
         anarci_output_filename
     )
+
+    if 0==len(heavy_chain):
+        raise Exception("ANARCI could not find the heavy chain domain")
+
+    if 0==len(light_chain):
+        raise Exception("ANARCI could not find the light chain domain")
     # assert len(heavy_chains) == len(light_chains) == len(sequences)
 
     # cleanup

From 9b30549774726337b92287bf7569c8d5782d16c2 Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Mon, 16 Dec 2024 09:19:29 -0500
Subject: [PATCH 6/9] scfv: formatting of empty-chain checks (no functional change)

---
 fusedrug/data/protein/structure/split_scfv_chain.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index 50a0ef6d..242faae6 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -83,10 +83,10 @@ def main(
         anarci_output_filename
     )
 
-    if 0==len(heavy_chain):
+    if 0 == len(heavy_chain):
         raise Exception("ANARCI could not find the heavy chain domain")
 
-    if 0==len(light_chain):
+    if 0 == len(light_chain):
         raise Exception("ANARCI could not find the light chain domain")
     # assert len(heavy_chains) == len(light_chains) == len(sequences)
 

From 9fda3d19ab93fd8bc196fe880d89876dbe228e10 Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Tue, 14 Jan 2025 04:10:51 -0500
Subject: [PATCH 7/9] PR comments

---
 fusedrug/data/protein/structure/split_scfv_chain.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index 242faae6..0f50dac6 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -23,7 +23,10 @@ def main(
 ) -> None:
     """
 
-    Takes an input PDB files and splits it into separate files, one per describe chain, allowing to rename the chains if desired
+    Takes an input PDB file and splits the scFv chain within it into 2 separate chains (heavy and light).
+    This is useful for preparing such a PDB for follow-up steps that assume separate heavy and light chains.
+
+    It also allows "passing through" additional chains in order to maintain a "full" PDB.
 
     Args:
     input_pdb_path:
@@ -88,7 +91,6 @@ def main(
 
     if 0 == len(light_chain):
         raise Exception("ANARCI could not find the light chain domain")
-    # assert len(heavy_chains) == len(light_chains) == len(sequences)
 
     # cleanup
     if cleanup_temp_files:

From 9887e1cf5d475bec744a864cb8147724077527c1 Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Tue, 14 Jan 2025 04:13:05 -0500
Subject: [PATCH 8/9] ...

---
 fusedrug/data/protein/structure/split_scfv_chain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index 0f50dac6..2c09cf86 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -81,7 +81,7 @@ def main(
                 anarci_output_filename,
             ]
         )
-    # parse anarci outputs and obtain separate heavy and light chains:
+    # parse anarci outputs  and obtain separate heavy and light chains:
     heavy_chain, light_chain = split_heavy_light_chain_from_anarci_output(
         anarci_output_filename
     )

From a8f719d90a5edf7131677b0cd6cb963316f83778 Mon Sep 17 00:00:00 2001
From: yoel shoshan <yoels@il.ibm.com>
Date: Tue, 14 Jan 2025 06:18:36 -0500
Subject: [PATCH 9/9] better message when ANARCI is missing

---
 fusedrug/data/protein/structure/split_scfv_chain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fusedrug/data/protein/structure/split_scfv_chain.py b/fusedrug/data/protein/structure/split_scfv_chain.py
index 2c09cf86..bedddbb0 100644
--- a/fusedrug/data/protein/structure/split_scfv_chain.py
+++ b/fusedrug/data/protein/structure/split_scfv_chain.py
@@ -60,7 +60,7 @@ def main(
     anarci_executable = join(dirname(sys.executable), "ANARCI")
     if not isfile(anarci_executable):
         raise Exception(
-            f"ANARCI binary not found in {dirname(sys.executable)}. check installation"
+            f"ANARCI binary not found in {dirname(sys.executable)}. check installation. You can install it in your env like this: conda install -c bioconda abnumber"
         )
 
     anarci_output_filename = join(