-
Notifications
You must be signed in to change notification settings - Fork 103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add manual seq type #478
Add manual seq type #478
Changes from 4 commits
3fefaa8
8fcfca1
680c88d
6e4921e
628c5cb
bc7ffd0
39a6579
3d25d5c
47fcae1
a3f1174
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -16,7 +16,7 @@ | |||
"get_alignment", "set_alignment"] | ||||
|
||||
|
||||
def get_sequence(fasta_file, header=None): | ||||
def get_sequence(fasta_file, seq_type=None, header=None): | ||||
""" | ||||
Get a sequence from a :class:`FastaFile` instance. | ||||
|
||||
|
@@ -28,6 +28,9 @@ def get_sequence(fasta_file, header=None): | |||
---------- | ||||
fasta_file : FastaFile | ||||
The :class:`FastaFile` to be accessed. | ||||
seq_type : Class, optional | ||||
The :class:`Sequence` subclass contained in the file. If not | ||||
set, it is automatically detected. | ||||
t0mdavid-m marked this conversation as resolved.
Show resolved
Hide resolved
|
||||
header : str, optional | ||||
The header to get the sequence from. By default, the first | ||||
sequence of the file is returned. | ||||
|
@@ -57,10 +60,10 @@ def get_sequence(fasta_file, header=None): | |||
# Determine the sequence type: | ||||
# If NucleotideSequence can be created it is a DNA sequence, | ||||
# otherwise protein sequence | ||||
return _convert_to_sequence(seq_str) | ||||
return _convert_to_sequence(seq_str, seq_type) | ||||
|
||||
|
||||
def get_sequences(fasta_file): | ||||
def get_sequences(fasta_file, seq_type=None): | ||||
""" | ||||
Get dictionary from a :class:`FastaFile` instance, | ||||
where headers are keys and sequences are values. | ||||
|
@@ -73,6 +76,9 @@ def get_sequences(fasta_file): | |||
---------- | ||||
fasta_file : FastaFile | ||||
The :class:`FastaFile` to be accessed. | ||||
seq_type : Class, optional | ||||
The :class:`Sequence` subclass contained in the file. If not | ||||
set, it is automatically detected. | ||||
|
||||
Returns | ||||
------- | ||||
|
@@ -90,7 +96,7 @@ def get_sequences(fasta_file): | |||
""" | ||||
seq_dict = OrderedDict() | ||||
for header, seq_str in fasta_file.items(): | ||||
seq_dict[header] = _convert_to_sequence(seq_str) | ||||
seq_dict[header] = _convert_to_sequence(seq_str, seq_type) | ||||
return seq_dict | ||||
|
||||
|
||||
|
@@ -146,14 +152,17 @@ def set_sequences(fasta_file, sequence_dict, as_rna=False): | |||
fasta_file[header] = _convert_to_string(sequence, as_rna) | ||||
|
||||
|
||||
def get_alignment(fasta_file, additional_gap_chars=("_",)): | ||||
def get_alignment(fasta_file, seq_type=None, additional_gap_chars=("_",)): | ||||
""" | ||||
Get an alignment from a :class:`FastaFile` instance. | ||||
|
||||
Parameters | ||||
---------- | ||||
fasta_file : FastaFile | ||||
The :class:`FastaFile` to be accessed. | ||||
seq_type : Class, optional | ||||
The :class:`Sequence` subclass contained in the file. If not | ||||
set, it is automatically detected | ||||
additional_gap_chars : str, optional | ||||
The characters to be treated as gaps. | ||||
|
||||
|
@@ -168,7 +177,7 @@ def get_alignment(fasta_file, additional_gap_chars=("_",)): | |||
for i, seq_str in enumerate(seq_strings): | ||||
seq_strings[i] = seq_str.replace(char, "-") | ||||
# Remove gaps for creation of sequences | ||||
sequences = [_convert_to_sequence(seq_str.replace("-","")) | ||||
sequences = [_convert_to_sequence(seq_str.replace("-",""), seq_type) | ||||
for seq_str in seq_strings] | ||||
trace = Alignment.trace_from_strings(seq_strings) | ||||
return Alignment(sequences, trace, score=None) | ||||
|
@@ -199,26 +208,41 @@ def set_alignment(fasta_file, alignment, seq_names): | |||
fasta_file[seq_names[i]] = gapped_seq_strings[i] | ||||
|
||||
|
||||
def _convert_to_sequence(seq_str): | ||||
def _convert_to_sequence(seq_str, seq_type=None): | ||||
|
||||
# Define preprocessing of preimplemented sequence types | ||||
process_protein_sequence = lambda x : x.upper().replace("U", "C") | ||||
process_nucleotide_sequence = ( | ||||
lambda x : x.upper().replace("U","T").replace("X","N") | ||||
) | ||||
|
||||
# Attempt to set manually selected sequence type | ||||
if seq_type is not None: | ||||
# Do preprocessing as done without manual selection | ||||
if seq_type == NucleotideSequence: | ||||
seq_str = process_nucleotide_sequence(seq_str) | ||||
elif seq_type == ProteinSequence: | ||||
if "U" in seq_str: | ||||
warnings.warn( | ||||
"ProteinSequence objects do not support selenocysteine " | ||||
"(U), occurrences were substituted by cysteine (C)" | ||||
) | ||||
seq_str = process_protein_sequence(seq_str) | ||||
# Return the converted sequence | ||||
return seq_type(seq_str) | ||||
|
||||
# Biotite alphabets for nucleotide and proteins | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This part of the comment is also obsolete with the code changes.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I went over the comments and removed all remainders of the previous implementation. I also added/moved a comment describing the purpose of each lambda function. |
||||
# do not accept lower case letters | ||||
seq_str = seq_str.upper() | ||||
try: | ||||
# For nucleotides uracil is represented by thymine and | ||||
# there is is only one letter for completely unknown nucleotides | ||||
return NucleotideSequence(seq_str.replace("U","T").replace("X","N")) | ||||
return NucleotideSequence(process_nucleotide_sequence(seq_str)) | ||||
except AlphabetError: | ||||
pass | ||||
try: | ||||
if "U" in seq_str: | ||||
warn = True | ||||
seq_str = seq_str.replace("U", "C") | ||||
else: | ||||
warn = False | ||||
prot_seq = ProteinSequence(seq_str) | ||||
prot_seq = ProteinSequence(process_protein_sequence(seq_str)) | ||||
# Raise Warning after conversion into 'ProteinSequence' | ||||
# to wait for potential 'AlphabetError' | ||||
if warn: | ||||
if "U" in seq_str: | ||||
warnings.warn( | ||||
"ProteinSequence objects do not support selenocysteine (U), " | ||||
"occurrences were substituted by cysteine (C)" | ||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,22 +40,28 @@ def test_access_low_level(): | |
|
||
|
||
def test_access_high_level(): | ||
path = os.path.join(data_dir("sequence"), "nuc.fasta") | ||
file = fasta.FastaFile.read(path) | ||
sequences = fasta.get_sequences(file) | ||
assert sequences == { | ||
sequences_reference = { | ||
"dna sequence" : seq.NucleotideSequence("ACGCTACGT", False), | ||
"another dna sequence" : seq.NucleotideSequence("A", False), | ||
"third dna sequence" : seq.NucleotideSequence("ACGT", False), | ||
"rna sequence" : seq.NucleotideSequence("ACGT", False), | ||
"ambiguous rna sequence" : seq.NucleotideSequence("ACGTNN", True), | ||
} | ||
path = os.path.join(data_dir("sequence"), "nuc.fasta") | ||
file = fasta.FastaFile.read(path) | ||
sequences = fasta.get_sequences(file) | ||
assert sequences == sequences_reference | ||
sequences_manual = fasta.get_sequences(file, seq.NucleotideSequence) | ||
assert sequences_manual == sequences_reference | ||
t0mdavid-m marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def test_sequence_conversion(): | ||
path = os.path.join(data_dir("sequence"), "nuc.fasta") | ||
file = fasta.FastaFile.read(path) | ||
assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence(file) | ||
assert seq.NucleotideSequence("ACGCTACGT") == fasta.get_sequence( | ||
file, seq.NucleotideSequence | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also refactored the test by splitting it into three functionally different tests:
|
||
|
||
seq_dict = fasta.get_sequences(file) | ||
file2 = fasta.FastaFile() | ||
|
@@ -65,6 +71,11 @@ def test_sequence_conversion(): | |
# now guessed as protein sequence | ||
for seq1, seq2 in zip(seq_dict.values(), seq_dict2.values()): | ||
assert str(seq1) == str(seq2) | ||
# Manually set sequence type | ||
seq_dict2_manual = fasta.get_sequences( | ||
file2, seq_type=seq.NucleotideSequence | ||
) | ||
assert seq_dict == seq_dict2_manual | ||
t0mdavid-m marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
file3 = fasta.FastaFile() | ||
fasta.set_sequence(file3, seq.NucleotideSequence("AACCTTGG")) | ||
|
@@ -75,6 +86,12 @@ def test_sequence_conversion(): | |
# Expect a warning for selenocysteine conversion | ||
with pytest.warns(UserWarning): | ||
assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence(file4) | ||
assert seq.ProteinSequence("YAHCGFRTGS") == fasta.get_sequence( | ||
file4, seq_type=seq.ProteinSequence | ||
) | ||
# Manually forcing a NucleotideSequence should raise an error | ||
with pytest.raises(seq.AlphabetError): | ||
fasta.get_sequence(file4, seq_type=seq.NucleotideSequence) | ||
|
||
path = os.path.join(data_dir("sequence"), "invalid.fasta") | ||
file5 = fasta.FastaFile.read(path) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think for backwards compatibility reasons the new parameter should be the last parameter in the list, e.g. in case someone uses the
header
as positional argument in their code. Although Biotite is still in0.x
version, where compatibility is not strictly required, I would prefer the more compatible way, if there is no clear advantage of the other option.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I placed the arguments based on gut feeling. However, I agree that compatibility should take priority here.