Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add manual seq type #478

Merged
merged 10 commits into from
Jun 19, 2023
Merged
58 changes: 41 additions & 17 deletions src/biotite/sequence/io/fasta/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"get_alignment", "set_alignment"]


def get_sequence(fasta_file, header=None):
def get_sequence(fasta_file, seq_type=None, header=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for backwards compatibility reasons the new parameter should be the last parameter in the list, e.g. in case someone uses the header as positional argument in their code. Although Biotite is still in 0.xversion, where compatibility is not strictly required, I would prefer the more compatible way, if there is no clear advantage of the other option.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I placed the arguments based on gut feeling. However, I agree that compatibility should take priority here.

"""
Get a sequence from a :class:`FastaFile` instance.

Expand All @@ -28,6 +28,9 @@ def get_sequence(fasta_file, header=None):
----------
fasta_file : FastaFile
The :class:`FastaFile` to be accessed.
seq_type : Class, optional
The :class:`Sequence` subclass contained in the file. If not
set, it is automatically detected.
t0mdavid-m marked this conversation as resolved.
Show resolved Hide resolved
header : str, optional
The header to get the sequence from. By default, the first
sequence of the file is returned.
Expand Down Expand Up @@ -57,10 +60,10 @@ def get_sequence(fasta_file, header=None):
# Determine the sequence type:
# If NucleotideSequence can be created it is a DNA sequence,
# otherwise protein sequence
return _convert_to_sequence(seq_str)
return _convert_to_sequence(seq_str, seq_type)


def get_sequences(fasta_file):
def get_sequences(fasta_file, seq_type=None):
"""
Get dictionary from a :class:`FastaFile` instance,
where headers are keys and sequences are values.
Expand All @@ -73,6 +76,9 @@ def get_sequences(fasta_file):
----------
fasta_file : FastaFile
The :class:`FastaFile` to be accessed.
seq_type : Class, optional
The :class:`Sequence` subclass contained in the file. If not
set, it is automatically detected.

Returns
-------
Expand All @@ -90,7 +96,7 @@ def get_sequences(fasta_file):
"""
seq_dict = OrderedDict()
for header, seq_str in fasta_file.items():
seq_dict[header] = _convert_to_sequence(seq_str)
seq_dict[header] = _convert_to_sequence(seq_str, seq_type)
return seq_dict


Expand Down Expand Up @@ -146,14 +152,17 @@ def set_sequences(fasta_file, sequence_dict, as_rna=False):
fasta_file[header] = _convert_to_string(sequence, as_rna)


def get_alignment(fasta_file, additional_gap_chars=("_",)):
def get_alignment(fasta_file, seq_type=None, additional_gap_chars=("_",)):
"""
Get an alignment from a :class:`FastaFile` instance.

Parameters
----------
fasta_file : FastaFile
The :class:`FastaFile` to be accessed.
seq_type : Class, optional
The :class:`Sequence` subclass contained in the file. If not
set, it is automatically detected
additional_gap_chars : str, optional
The characters to be treated as gaps.

Expand All @@ -168,7 +177,7 @@ def get_alignment(fasta_file, additional_gap_chars=("_",)):
for i, seq_str in enumerate(seq_strings):
seq_strings[i] = seq_str.replace(char, "-")
# Remove gaps for creation of sequences
sequences = [_convert_to_sequence(seq_str.replace("-",""))
sequences = [_convert_to_sequence(seq_str.replace("-",""), seq_type)
for seq_str in seq_strings]
trace = Alignment.trace_from_strings(seq_strings)
return Alignment(sequences, trace, score=None)
Expand Down Expand Up @@ -199,26 +208,41 @@ def set_alignment(fasta_file, alignment, seq_names):
fasta_file[seq_names[i]] = gapped_seq_strings[i]


def _convert_to_sequence(seq_str):
def _convert_to_sequence(seq_str, seq_type=None):

# Define preprocessing of preimplemented sequence types
process_protein_sequence = lambda x : x.upper().replace("U", "C")
process_nucleotide_sequence = (
lambda x : x.upper().replace("U","T").replace("X","N")
)

# Attempt to set manually selected sequence type
if seq_type is not None:
# Do preprocessing as done without manual selection
if seq_type == NucleotideSequence:
seq_str = process_nucleotide_sequence(seq_str)
elif seq_type == ProteinSequence:
if "U" in seq_str:
warnings.warn(
"ProteinSequence objects do not support selenocysteine "
"(U), occurrences were substituted by cysteine (C)"
)
seq_str = process_protein_sequence(seq_str)
# Return the converted sequence
return seq_type(seq_str)

# Biotite alphabets for nucleotide and proteins
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part of the comment is also obsolete with the code changes.

Suggested change
# Biotite alphabets for nucleotide and proteins

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went over the comments and removed all remainders of the previous implementation. I also added/moved a comment describing the purpose of each lambda function.

# do not accept lower case letters
seq_str = seq_str.upper()
try:
# For nucleotides uracil is represented by thymine and
# there is is only one letter for completely unknown nucleotides
return NucleotideSequence(seq_str.replace("U","T").replace("X","N"))
return NucleotideSequence(process_nucleotide_sequence(seq_str))
except AlphabetError:
pass
try:
if "U" in seq_str:
warn = True
seq_str = seq_str.replace("U", "C")
else:
warn = False
prot_seq = ProteinSequence(seq_str)
prot_seq = ProteinSequence(process_protein_sequence(seq_str))
# Raise Warning after conversion into 'ProteinSequence'
# to wait for potential 'AlphabetError'
if warn:
if "U" in seq_str:
warnings.warn(
"ProteinSequence objects do not support selenocysteine (U), "
"occurrences were substituted by cysteine (C)"
Expand Down
25 changes: 21 additions & 4 deletions tests/sequence/test_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,28 @@ def test_access_low_level():


def test_access_high_level():
path = os.path.join(data_dir("sequence"), "nuc.fasta")
file = fasta.FastaFile.read(path)
sequences = fasta.get_sequences(file)
assert sequences == {
sequences_reference = {
"dna sequence" : seq.NucleotideSequence("ACGCTACGT", False),
"another dna sequence" : seq.NucleotideSequence("A", False),
"third dna sequence" : seq.NucleotideSequence("ACGT", False),
"rna sequence" : seq.NucleotideSequence("ACGT", False),
"ambiguous rna sequence" : seq.NucleotideSequence("ACGTNN", True),
}
path = os.path.join(data_dir("sequence"), "nuc.fasta")
file = fasta.FastaFile.read(path)
sequences = fasta.get_sequences(file)
assert sequences == sequences_reference
sequences_manual = fasta.get_sequences(file, seq.NucleotideSequence)
assert sequences_manual == sequences_reference
t0mdavid-m marked this conversation as resolved.
Show resolved Hide resolved


def test_sequence_conversion():
path = os.path.join(data_dir("sequence"), "nuc.fasta")
file = fasta.FastaFile.read(path)
assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file)
assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(
file, seq.NucleotideSequence
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also refactored the test by splitting it into three functionally different tests:

  • Ambiguous Sequences (i.e. nucleotide sequences which can also be loaded as amino acid sequences)
  • Protein Sequences (should fail when loaded as nucleotide sequences)
  • Invalid Sequences (should fail in all cases)


seq_dict = fasta.get_sequences(file)
file2 = fasta.FastaFile()
Expand All @@ -65,6 +71,11 @@ def test_sequence_conversion():
# now guessed as protein sequence
for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()):
assert str(seq1) == str(seq2)
# Manually set sequence type
seq_dict2_manual = fasta.get_sequences(
file2, seq_type=seq.NucleotideSequence
)
assert seq_dict == seq_dict2_manual
t0mdavid-m marked this conversation as resolved.
Show resolved Hide resolved

file3 = fasta.FastaFile()
fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG"))
Expand All @@ -75,6 +86,12 @@ def test_sequence_conversion():
# Expect a warning for selenocysteine conversion
with pytest.warns(UserWarning):
assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4)
assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(
file4, seq_type=seq.ProteinSequence
)
# Manually forcing a NucleotideSequence should raise an error
with pytest.raises(seq.AlphabetError):
fasta.get_sequence(file4, seq_type=seq.NucleotideSequence)

path = os.path.join(data_dir("sequence"), "invalid.fasta")
file5 = fasta.FastaFile.read(path)
Expand Down