biotite-dev · tjmier · Jun 16, 2024 · Jun 17, 2024 · Jun 19, 2024 · Jun 19, 2024
diff --git a/src/biotite/structure/io/pdbx/convert.py b/src/biotite/structure/io/pdbx/convert.py
@@ -111,7 +111,7 @@ def _filter(category, index):
 
 
 def get_sequence(pdbx_file, data_block=None):
-    """
+    """
     Get the protein and nucleotide sequences from the
     ``entity_poly.pdbx_seq_one_letter_code_can`` entry.
 
@@ -134,21 +134,41 @@ def get_sequence(pdbx_file, data_block=None):
 
     Returns
     -------
-    sequences : list of Sequence
-        The protein and nucleotide sequences for each entity
-        (equivalent to chains in most cases).
+    sequence_dict : dict of (str -> Sequence)
+        Maps each strand ID from ``entity_poly.pdbx_strand_id`` (in
+        most cases equal to the chain ID, ``atom_site.auth_asym_id``)
+        to the sequence of the corresponding entity.
+
+    Notes
+    -----
+    The ``entity_poly.pdbx_seq_one_letter_code_can`` field holds the
+    complete sequence of each entity, so a truncated or spliced
+    structure contains only a subset of the returned residues.
+    Use ``biotite.structure.get_residues`` to retrieve only the
+    residues that are actually represented in the structure.
     """
     block = _get_block(pdbx_file, data_block)
-
     poly_category= block["entity_poly"]
+
     seq_string = poly_category["pdbx_seq_one_letter_code_can"].as_array(str)
     seq_type = poly_category["type"].as_array(str)
-    sequences = []
-    for string, stype in zip(seq_string, seq_type):
-        sequence = _convert_string_to_sequence(string, stype)
-        if sequence is not None:
-            sequences.append(sequence)
-    return sequences
+
+    strand_ids = poly_category["pdbx_strand_id"].as_array(str)
+
+    # Convert each entity exactly once and keep it paired with its own
+    # strand IDs: filtering unconvertible entities out of a separate
+    # list first would shift every later sequence onto the strand IDs
+    # of a different entity
+    sequence_dict = {}
+    for string, stype, ids in zip(seq_string, seq_type, strand_ids):
+        sequence = _convert_string_to_sequence(string, stype)
+        if sequence is None:
+            continue
+        # An entity may comprise several chains, e.g. ``"A,B"``
+        for strand_id in ids.split(","):
+            sequence_dict[strand_id] = sequence
+
+    return sequence_dict
 
 
 def get_model_count(pdbx_file, data_block=None):

diff --git a/tests/structure/test_pdbx.py b/tests/structure/test_pdbx.py
@@ -457,32 +457,32 @@ def test_get_sequence(format):
         File = pdbx.BinaryCIFFile
 
     pdbx_file = File.read(join(data_dir("structure"), f"5ugo.{format}"))
-    sequences = pdbx.get_sequence(pdbx_file)
+    seqs_5ugo = pdbx.get_sequence(pdbx_file)
     pdbx_file = File.read(join(data_dir("structure"), f"4gxy.{format}"))
-    sequences += pdbx.get_sequence(pdbx_file)
-    assert str(sequences[0]) == "CCGACGGCGCATCAGC"
-    assert type(sequences[0]) is seq.NucleotideSequence
-    assert str(sequences[1]) == "GCTGATGCGCC"
-    assert type(sequences[1]) is seq.NucleotideSequence
-    assert str(sequences[2]) == "GTCGG"
-    assert type(sequences[2]) is seq.NucleotideSequence
+    seqs_4gxy = pdbx.get_sequence(pdbx_file)
+    assert str(seqs_5ugo["T"]) == "CCGACGGCGCATCAGC"
+    assert type(seqs_5ugo["T"]) is seq.NucleotideSequence
+    assert str(seqs_5ugo["P"]) == "GCTGATGCGCC"
+    assert type(seqs_5ugo["P"]) is seq.NucleotideSequence
+    assert str(seqs_5ugo["D"]) == "GTCGG"
+    assert type(seqs_5ugo["D"]) is seq.NucleotideSequence
     assert (
-        str(sequences[3]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
+        str(seqs_5ugo["A"]) == "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
         "AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD"
         "DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD"
         "FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS"
         "FTSESTKQPKLLHQVVEQLQKVHFITDTLSKGETKFMGVCQLPSKNDEKEYPHR"
         "RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA"
         "GEPLPVDSEKDIFDYIQWKYREPKDRSE"
     )
-    assert type(sequences[3]) is seq.ProteinSequence
+    assert type(seqs_5ugo["A"]) is seq.ProteinSequence
     assert (
-        str(sequences[4]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
+        str(seqs_4gxy["A"]) == "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
         "AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC"
         "CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT"
         "CCGGAGTCAGGAAACCTGCCTGCCGTC"
     )
-    assert type(sequences[4]) is seq.NucleotideSequence
+    assert type(seqs_4gxy["A"]) is seq.NucleotideSequence
 
 
 def test_bcif_encoding():