biotite-dev · padix-key · Jun 19, 2023 · Jun 18, 2023 · Jun 18, 2023 · Jun 18, 2023
diff --git a/src/biotite/sequence/io/fasta/convert.py b/src/biotite/sequence/io/fasta/convert.py
@@ -16,7 +16,7 @@
            "get_alignment", "set_alignment"]
 
 
-def get_sequence(fasta_file, header=None):
+def get_sequence(fasta_file, seq_type=None, header=None):
     """
     Get a sequence from a :class:`FastaFile` instance.
 
@@ -28,6 +28,9 @@ def get_sequence(fasta_file, header=None):
     ----------
     fasta_file : FastaFile
         The :class:`FastaFile` to be accessed.
+    seq_type : Class, optional
+        The :class:`Sequence` subclass contained in the file. If not 
+        set, it is automatically detected.
     header : str, optional
         The header to get the sequence from. By default, the first
         sequence of the file is returned.
@@ -57,10 +60,10 @@ def get_sequence(fasta_file, header=None):
     # Determine the sequence type:
     # If NucleotideSequence can be created it is a DNA sequence,
     # otherwise protein sequence
-    return _convert_to_sequence(seq_str)
+    return _convert_to_sequence(seq_str, seq_type)
 
 
-def get_sequences(fasta_file):
+def get_sequences(fasta_file, seq_type=None):
     """
     Get dictionary from a :class:`FastaFile` instance,
     where headers are keys and sequences are values.
@@ -73,6 +76,9 @@ def get_sequences(fasta_file):
     ----------
     fasta_file : FastaFile
         The :class:`FastaFile` to be accessed.
+    seq_type : Class, optional
+        The :class:`Sequence` subclass contained in the file. If not 
+        set, it is automatically detected.
 
     Returns
     -------
@@ -90,7 +96,7 @@ def get_sequences(fasta_file):
     """
     seq_dict = OrderedDict()
     for header, seq_str in fasta_file.items():
-        seq_dict[header] = _convert_to_sequence(seq_str)
+        seq_dict[header] = _convert_to_sequence(seq_str, seq_type)
     return seq_dict
 
 
@@ -146,14 +152,17 @@ def set_sequences(fasta_file, sequence_dict, as_rna=False):
         fasta_file[header] = _convert_to_string(sequence, as_rna)
 
 
-def get_alignment(fasta_file, additional_gap_chars=("_",)):
+def get_alignment(fasta_file, seq_type=None, additional_gap_chars=("_",)):
     """
     Get an alignment from a :class:`FastaFile` instance.
 
     Parameters
     ----------
     fasta_file : FastaFile
         The :class:`FastaFile` to be accessed.
+    seq_type : Class, optional
+        The :class:`Sequence` subclass contained in the file. If not 
+        set, it is automatically detected
     additional_gap_chars : str, optional
         The characters to be treated as gaps.
 
@@ -168,7 +177,7 @@ def get_alignment(fasta_file, additional_gap_chars=("_",)):
         for i, seq_str in enumerate(seq_strings):
             seq_strings[i] = seq_str.replace(char, "-")
     # Remove gaps for creation of sequences
-    sequences = [_convert_to_sequence(seq_str.replace("-",""))
+    sequences = [_convert_to_sequence(seq_str.replace("-",""), seq_type)
                  for seq_str in seq_strings]
     trace = Alignment.trace_from_strings(seq_strings)
     return Alignment(sequences, trace, score=None)
@@ -199,26 +208,41 @@ def set_alignment(fasta_file, alignment, seq_names):
         fasta_file[seq_names[i]] = gapped_seq_strings[i]
 
 
-def _convert_to_sequence(seq_str):
+def _convert_to_sequence(seq_str, seq_type=None):
+
+    # Define preprocessing of preimplemented sequence types
+    process_protein_sequence = lambda x : x.upper().replace("U", "C")
+    process_nucleotide_sequence = (
+        lambda x : x.upper().replace("U","T").replace("X","N")
+    )
+
+    # Attempt to set manually selected sequence type
+    if seq_type is not None:
+        # Do preprocessing as done without manual selection
+        if seq_type == NucleotideSequence:
+            seq_str = process_nucleotide_sequence(seq_str)
+        elif seq_type == ProteinSequence:
+            if "U" in seq_str:
+                warnings.warn(
+                    "ProteinSequence objects do not support selenocysteine "
+                    "(U), occurrences were substituted by cysteine (C)"
+                )
+            seq_str = process_protein_sequence(seq_str)
+        # Return the converted sequence
+        return seq_type(seq_str)    
+
     # Biotite alphabets for nucleotide and proteins
-    # Biotite alphabets for nucleotide and proteins
-    # Biotite alphabets for nucleotide and proteins
-    # do not accept lower case letters
-    seq_str = seq_str.upper()
     try:
         # For nucleotides uracil is represented by thymine and
         # there is is only one letter for completely unknown nucleotides
-        return NucleotideSequence(seq_str.replace("U","T").replace("X","N"))
+        return NucleotideSequence(process_nucleotide_sequence(seq_str))
     except AlphabetError:
         pass
     try:
-        if "U" in seq_str:
-            warn = True
-            seq_str = seq_str.replace("U", "C")
-        else:
-            warn = False
-        prot_seq = ProteinSequence(seq_str)
+        prot_seq = ProteinSequence(process_protein_sequence(seq_str))
         # Raise Warning after conversion into 'ProteinSequence'
         # to wait for potential 'AlphabetError'
-        if warn:
+        if "U" in seq_str:
             warnings.warn(
                 "ProteinSequence objects do not support selenocysteine (U), "
                 "occurrences were substituted by cysteine (C)"

diff --git a/tests/sequence/test_fasta.py b/tests/sequence/test_fasta.py
@@ -40,22 +40,28 @@ def test_access_low_level():
 
 
 def test_access_high_level():
-    path = os.path.join(data_dir("sequence"), "nuc.fasta")
-    file = fasta.FastaFile.read(path)
-    sequences = fasta.get_sequences(file)
-    assert sequences == {
+    sequences_reference = {
         "dna sequence" : seq.NucleotideSequence("ACGCTACGT", False),
         "another dna sequence" : seq.NucleotideSequence("A", False),
         "third dna sequence" : seq.NucleotideSequence("ACGT", False),
         "rna sequence" : seq.NucleotideSequence("ACGT", False),
         "ambiguous rna sequence" : seq.NucleotideSequence("ACGTNN", True),
     }
+    path = os.path.join(data_dir("sequence"), "nuc.fasta")
+    file = fasta.FastaFile.read(path)
+    sequences = fasta.get_sequences(file)
+    assert sequences == sequences_reference
+    sequences_manual = fasta.get_sequences(file, seq.NucleotideSequence)
+    assert sequences_manual == sequences_reference    
 
 
 def test_sequence_conversion():
     path = os.path.join(data_dir("sequence"), "nuc.fasta")
     file = fasta.FastaFile.read(path)
     assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)
+    assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(
+        file, seq.NucleotideSequence
+    )
 
     seq_dict = fasta.get_sequences(file)
     file2 = fasta.FastaFile()
@@ -65,6 +71,11 @@ def test_sequence_conversion():
     # now guessed as protein sequence
     for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()):
         assert str(seq1) == str(seq2)
+    # Manually set sequence type
+    seq_dict2_manual = fasta.get_sequences(
+        file2, seq_type=seq.NucleotideSequence
+    )
+    assert seq_dict == seq_dict2_manual
 
     file3 = fasta.FastaFile()
     fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
@@ -75,6 +86,12 @@ def test_sequence_conversion():
     # Expect a warning for selenocysteine conversion
     with pytest.warns(UserWarning):
         assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)
+        assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(
+            file4, seq_type=seq.ProteinSequence
+        )
+    # Manually forcing a NucleotideSequence should raise an error
+    with pytest.raises(seq.AlphabetError):
+        fasta.get_sequence(file4, seq_type=seq.NucleotideSequence)
 
     path = os.path.join(data_dir("sequence"), "invalid.fasta")
     file5 = fasta.FastaFile.read(path)