From 07c12534fa951cbe7c70c7b34f955f9ad6cd12ee Mon Sep 17 00:00:00 2001
From: tjmier <tjmier96@gmail.com>
Date: Sun, 16 Jun 2024 11:38:13 -0500
Subject: [PATCH 1/5] Added a new parameter 'chain_id' to the function
 'get_sequence' in the module 'biotite.structure.io.pdb' to return a
 dictionary mapping chain_id to the sequence

---
 src/biotite/structure/io/pdbx/convert.py | 31 +++++++++++++++++++-----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
index f5c896175..9d011acb0 100644
--- a/src/biotite/structure/io/pdbx/convert.py
+++ b/src/biotite/structure/io/pdbx/convert.py
@@ -110,8 +110,8 @@ def _filter(category, index):
     })
 
 
-def get_sequence(pdbx_file, data_block=None):
-    """
+def get_sequence(pdbx_file, data_block=None, chain_ids=False):
+    """""
     Get the protein and nucleotide sequences from the
     ``entity_poly.pdbx_seq_one_letter_code_can`` entry.
 
@@ -131,12 +131,18 @@ def get_sequence(pdbx_file, data_block=None):
         file.
         If the data block object is passed directly to `pdbx_file`,
         this parameter is ignored.
+    chain_ids : bool, optional
+        If True, a dictionary mapping chain IDs to sequences is returned.
+        If False or not given, all sequences are returned.
+        Default is False.
 
     Returns
     -------
-    sequences : list of Sequence
-        The protein and nucleotide sequences for each entity
-        (equivalent to chains in most cases).
+    sequences : list of Sequence or dict
+        If `chain_ids` is False, returns a list of protein and nucleotide 
+        sequences for each entity.
+        If `chain_ids` is True, returns a dictionary where each key is a 
+        chain ID and each value is the corresponding sequence.
     """
     block = _get_block(pdbx_file, data_block)
 
@@ -148,7 +154,20 @@ def get_sequence(pdbx_file, data_block=None):
         sequence = _convert_string_to_sequence(string, stype)
         if sequence is not None:
             sequences.append(sequence)
-    return sequences
+            
+    if chain_ids:
+        strand_ids = poly_category['pdbx_strand_id'].as_array(str)
+        strand_ids = [strand_id.split(",") for strand_id in strand_ids]
+
+        strand_ids_to_seq_dict = {}
+        for entity, strand_ids in enumerate(strand_ids):
+            for strand_id in strand_ids:
+                strand_ids_to_seq_dict[strand_id] = sequences[entity-1]
+
+        return strand_ids_to_seq_dict
+    
+    else:
+        return sequences
 
 
 def get_model_count(pdbx_file, data_block=None):

From d5eb03391aa3c9f0c57b21b26cd80f50a92732d6 Mon Sep 17 00:00:00 2001
From: tjmier <tjmier96@gmail.com>
Date: Mon, 17 Jun 2024 13:57:21 -0500
Subject: [PATCH 2/5] updated get_sequence to return dict of sequences with
 entity_poly.pdbx_strand_id as keys. Updated test_pdx.py test_get_sequence to
 reflect returning a dict instead of a list

---
 src/biotite/structure/io/pdbx/convert.py | 59 ++++++++++++------------
 tests/structure/test_pdbx.py             | 24 +++++-----
 2 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
index 9d011acb0..15665c767 100644
--- a/src/biotite/structure/io/pdbx/convert.py
+++ b/src/biotite/structure/io/pdbx/convert.py
@@ -110,7 +110,7 @@ def _filter(category, index):
     })
 
 
-def get_sequence(pdbx_file, data_block=None, chain_ids=False):
+def get_sequence(pdbx_file, data_block=None):
     """""
     Get the protein and nucleotide sequences from the
     ``entity_poly.pdbx_seq_one_letter_code_can`` entry.
@@ -131,43 +131,44 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False):
         file.
         If the data block object is passed directly to `pdbx_file`,
         this parameter is ignored.
-    chain_ids : bool, optional
-        If True, a dictionary mapping chain IDs to sequences is returned.
-        If False or not given, all sequences are returned.
-        Default is False.
 
     Returns
     -------
-    sequences : list of Sequence or dict
-        If `chain_ids` is False, returns a list of protein and nucleotide 
-        sequences for each entity.
-        If `chain_ids` is True, returns a dictionary where each key is a 
-        chain ID and each value is the corresponding sequence.
+    sequence_dict : Dictionary of Sequences 
+        Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
+        (often equivalent to chain_id and atom_site.auth_asym_id
+        in most cases). Dictionary values are sequences.
+        
+    Notes
+    -----
+    The ``entity_poly.pdbx_seq_one_letter_code_can`` field contains the initial 
+    complete sequence. If the structure represents a truncated or spliced 
+    version of this initial sequence, it will include only a subset of the 
+    initial sequence. Use biotite.structure.get_residues to retrieve only 
+    the residues that are represented in the structure.
     """
     block = _get_block(pdbx_file, data_block)
-
     poly_category= block["entity_poly"]
+
     seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
     seq_type = poly_category["type"].as_array(str)
-    sequences = []
-    for string, stype in zip(seq_string, seq_type):
-        sequence = _convert_string_to_sequence(string, stype)
-        if sequence is not None:
-            sequences.append(sequence)
-            
-    if chain_ids:
-        strand_ids = poly_category['pdbx_strand_id'].as_array(str)
-        strand_ids = [strand_id.split(",") for strand_id in strand_ids]
-
-        strand_ids_to_seq_dict = {}
-        for entity, strand_ids in enumerate(strand_ids):
-            for strand_id in strand_ids:
-                strand_ids_to_seq_dict[strand_id] = sequences[entity-1]
-
-        return strand_ids_to_seq_dict
+
+    sequences = [
+        _convert_string_to_sequence(string, stype)
+        for string, stype in zip(seq_string, seq_type)
+        if _convert_string_to_sequence(string, stype) is not None
+    ]
     
-    else:
-        return sequences
+    strand_ids = poly_category['pdbx_strand_id'].as_array(str)
+    strand_ids = [strand_id.split(",") for strand_id in strand_ids]
+    
+    sequence_dict = {
+        strand_id: sequence
+        for sequence, strand_ids in zip(sequences, strand_ids)
+        for strand_id in strand_ids
+    }
+
+    return sequence_dict
 
 
 def get_model_count(pdbx_file, data_block=None):
diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py
index 8523b8b94..68d166271 100644
--- a/tests/structure/test_pdbx.py
+++ b/tests/structure/test_pdbx.py
@@ -457,17 +457,17 @@ def test_get_sequence(format):
         File = pdbx.BinaryCIFFile
 
     pdbx_file = File.read(join(data_dir("structure"), f"5ugo.{format}"))
-    sequences = pdbx.get_sequence(pdbx_file)
+    sequences_1 = pdbx.get_sequence(pdbx_file)
     pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}"))
-    sequences += pdbx.get_sequence(pdbx_file)
-    assert str(sequences[0]) == "CCGACGGCGCATCAGC"
-    assert type(sequences[0]) is seq.NucleotideSequence
-    assert str(sequences[1]) == "GCTGATGCGCC"
-    assert type(sequences[1]) is seq.NucleotideSequence
-    assert str(sequences[2]) == "GTCGG"
-    assert type(sequences[2]) is seq.NucleotideSequence
+    sequences_2 = pdbx.get_sequence(pdbx_file)
+    assert str(sequences_1['T']) == "CCGACGGCGCATCAGC"
+    assert type(sequences_1['T']) is seq.NucleotideSequence
+    assert str(sequences_1['P']) == "GCTGATGCGCC"
+    assert type(sequences_1['P']) is seq.NucleotideSequence
+    assert str(sequences_1['D']) == "GTCGG"
+    assert type(sequences_1['D']) is seq.NucleotideSequence
     assert (
-        str(sequences[3]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
+        str(sequences_1['A']) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
         "AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD"
         "DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD"
         "FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS"
@@ -475,14 +475,14 @@ def test_get_sequence(format):
         "RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA"
         "GEPLPVDSEKDIFDYIQWKYREPKDRSE"
     )
-    assert type(sequences[3]) is seq.ProteinSequence
+    assert type(sequences_1['A']) is seq.ProteinSequence
     assert (
-        str(sequences[4]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
+        str(sequences_2['A']) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
         "AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC"
         "CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT"
         "CCGGAGTCAGGAAACCTGCCTGCCGTC"
     )
-    assert type(sequences[4]) is seq.NucleotideSequence
+    assert type(sequences_2['A']) is seq.NucleotideSequence
 
 
 def test_bcif_encoding():

From e79237e28d0a60fe1aae6dc79de55a775ec16ebb Mon Sep 17 00:00:00 2001
From: tjmier <tjmier96@gmail.com>
Date: Tue, 18 Jun 2024 19:05:57 -0500
Subject: [PATCH 3/5] update local branch with remote branch

---
 src/biotite/structure/io/pdbx/convert.py | 31 +++++++++++++++++++-----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
index f5c896175..9d011acb0 100644
--- a/src/biotite/structure/io/pdbx/convert.py
+++ b/src/biotite/structure/io/pdbx/convert.py
@@ -110,8 +110,8 @@ def _filter(category, index):
     })
 
 
-def get_sequence(pdbx_file, data_block=None):
-    """
+def get_sequence(pdbx_file, data_block=None, chain_ids=False):
+    """""
     Get the protein and nucleotide sequences from the
     ``entity_poly.pdbx_seq_one_letter_code_can`` entry.
 
@@ -131,12 +131,18 @@ def get_sequence(pdbx_file, data_block=None):
         file.
         If the data block object is passed directly to `pdbx_file`,
         this parameter is ignored.
+    chain_ids : bool, optional
+        If True, a dictionary mapping chain IDs to sequences is returned.
+        If False or not given, all sequences are returned.
+        Default is False.
 
     Returns
     -------
-    sequences : list of Sequence
-        The protein and nucleotide sequences for each entity
-        (equivalent to chains in most cases).
+    sequences : list of Sequence or dict
+        If `chain_ids` is False, returns a list of protein and nucleotide 
+        sequences for each entity.
+        If `chain_ids` is True, returns a dictionary where each key is a 
+        chain ID and each value is the corresponding sequence.
     """
     block = _get_block(pdbx_file, data_block)
 
@@ -148,7 +154,20 @@ def get_sequence(pdbx_file, data_block=None):
         sequence = _convert_string_to_sequence(string, stype)
         if sequence is not None:
             sequences.append(sequence)
-    return sequences
+            
+    if chain_ids:
+        strand_ids = poly_category['pdbx_strand_id'].as_array(str)
+        strand_ids = [strand_id.split(",") for strand_id in strand_ids]
+
+        strand_ids_to_seq_dict = {}
+        for entity, strand_ids in enumerate(strand_ids):
+            for strand_id in strand_ids:
+                strand_ids_to_seq_dict[strand_id] = sequences[entity-1]
+
+        return strand_ids_to_seq_dict
+    
+    else:
+        return sequences
 
 
 def get_model_count(pdbx_file, data_block=None):

From 997469956d6159a3b751d4da6688363845fedb81 Mon Sep 17 00:00:00 2001
From: tjmier <tjmier96@gmail.com>
Date: Tue, 18 Jun 2024 19:15:23 -0500
Subject: [PATCH 4/5] fix merge mistake

---
 src/biotite/structure/io/pdbx/convert.py | 39 +-----------------------
 1 file changed, 1 insertion(+), 38 deletions(-)

diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
index 718214712..b91b3a0d5 100644
--- a/src/biotite/structure/io/pdbx/convert.py
+++ b/src/biotite/structure/io/pdbx/convert.py
@@ -110,11 +110,7 @@ def _filter(category, index):
     })
 
 
-<<<<<<< HEAD
 def get_sequence(pdbx_file, data_block=None):
-=======
-def get_sequence(pdbx_file, data_block=None, chain_ids=False):
->>>>>>> master
     """""
     Get the protein and nucleotide sequences from the
     ``entity_poly.pdbx_seq_one_letter_code_can`` entry.
@@ -135,14 +131,9 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False):
         file.
         If the data block object is passed directly to `pdbx_file`,
         this parameter is ignored.
-    chain_ids : bool, optional
-        If True, a dictionary mapping chain IDs to sequences is returned.
-        If False or not given, all sequences are returned.
-        Default is False.
 
     Returns
     -------
-<<<<<<< HEAD
     sequence_dict : Dictionary of Sequences 
         Dictionary keys are derived from ``entity_poly.pdbx_strand_id``
         (often equivalent to chain_id and atom_site.auth_asym_id
@@ -155,20 +146,13 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False):
     version of this initial sequence, it will include only a subset of the 
     initial sequence. Use biotite.structure.get_residues to retrieve only 
     the residues that are represented in the structure.
-=======
-    sequences : list of Sequence or dict
-        If `chain_ids` is False, returns a list of protein and nucleotide 
-        sequences for each entity.
-        If `chain_ids` is True, returns a dictionary where each key is a 
-        chain ID and each value is the corresponding sequence.
->>>>>>> master
     """
+    
     block = _get_block(pdbx_file, data_block)
     poly_category= block["entity_poly"]
 
     seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
     seq_type = poly_category["type"].as_array(str)
-<<<<<<< HEAD
 
     sequences = [
         _convert_string_to_sequence(string, stype)
@@ -186,27 +170,6 @@ def get_sequence(pdbx_file, data_block=None, chain_ids=False):
     }
 
     return sequence_dict
-=======
-    sequences = []
-    for string, stype in zip(seq_string, seq_type):
-        sequence = _convert_string_to_sequence(string, stype)
-        if sequence is not None:
-            sequences.append(sequence)
-            
-    if chain_ids:
-        strand_ids = poly_category['pdbx_strand_id'].as_array(str)
-        strand_ids = [strand_id.split(",") for strand_id in strand_ids]
-
-        strand_ids_to_seq_dict = {}
-        for entity, strand_ids in enumerate(strand_ids):
-            for strand_id in strand_ids:
-                strand_ids_to_seq_dict[strand_id] = sequences[entity-1]
-
-        return strand_ids_to_seq_dict
-    
-    else:
-        return sequences
->>>>>>> master
 
 
 def get_model_count(pdbx_file, data_block=None):

From 89297ac1334d1e28242ea6dab60eca4d5730f2f8 Mon Sep 17 00:00:00 2001
From: tjmier <tjmier96@gmail.com>
Date: Tue, 18 Jun 2024 21:28:50 -0500
Subject: [PATCH 5/5] updated get_sequence, dependent tests, and example

---
 doc/examples/scripts/sequence/residue_coevolution.py | 2 +-
 src/biotite/structure/io/pdbx/convert.py             | 2 +-
 tests/database/test_rcsb.py                          | 2 +-
 tests/structure/test_sequence.py                     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/examples/scripts/sequence/residue_coevolution.py b/doc/examples/scripts/sequence/residue_coevolution.py
index a45e7bdb9..e1f2f7329 100644
--- a/doc/examples/scripts/sequence/residue_coevolution.py
+++ b/doc/examples/scripts/sequence/residue_coevolution.py
@@ -58,7 +58,7 @@
 
 # Get structure and sequence
 pdbx_file = pdbx.CIFFile.read(rcsb.fetch("1GUU", "mmcif"))
-sequence = pdbx.get_sequence(pdbx_file)[0]
+sequence = pdbx.get_sequence(pdbx_file)['A']
 # 'use_author_fields' is set to false,
 # to ensure that values in the 'res_id' annotation point to the sequence
 structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
index b91b3a0d5..0a149d186 100644
--- a/src/biotite/structure/io/pdbx/convert.py
+++ b/src/biotite/structure/io/pdbx/convert.py
@@ -157,7 +157,6 @@ def get_sequence(pdbx_file, data_block=None):
     sequences = [
         _convert_string_to_sequence(string, stype)
         for string, stype in zip(seq_string, seq_type)
-        if _convert_string_to_sequence(string, stype) is not None
     ]
     
     strand_ids = poly_category['pdbx_strand_id'].as_array(str)
@@ -167,6 +166,7 @@ def get_sequence(pdbx_file, data_block=None):
         strand_id: sequence
         for sequence, strand_ids in zip(sequences, strand_ids)
         for strand_id in strand_ids
+        if sequence is not None
     }
 
     return sequence_dict
diff --git a/tests/database/test_rcsb.py b/tests/database/test_rcsb.py
index 715bb0dc5..415b713a4 100644
--- a/tests/database/test_rcsb.py
+++ b/tests/database/test_rcsb.py
@@ -142,7 +142,7 @@ def test_search_field(field, molecular_definition, params, ref_ids):
 def test_search_sequence():
     IDENTIY_CUTOFF = 0.9
     pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
-    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
+    ref_sequence = pdbx.get_sequence(pdbx_file)['A']
     query = rcsb.SequenceQuery(
         ref_sequence, "protein", min_identity=IDENTIY_CUTOFF
     )
diff --git a/tests/structure/test_sequence.py b/tests/structure/test_sequence.py
index 2d4efb516..098958824 100644
--- a/tests/structure/test_sequence.py
+++ b/tests/structure/test_sequence.py
@@ -53,7 +53,7 @@ def test_pdbx_sequence_consistency(path):
 def _find_best_match(sequence, ref_sequences):
     best_alignment = None
     best_identity = 0.0
-    for ref_sequence in ref_sequences:
+    for ref_sequence in ref_sequences.values():
         if type(sequence) != type(ref_sequence):
             continue
         if isinstance(sequence, seq.ProteinSequence):