Skip to content

Commit

Permalink
Merge pull request #18 from martinghunt/refcheck_max_length
Browse files (browse the repository at this point in the history)
Refcheck max length
  • Loading branch information
martinghunt committed Mar 24, 2015
2 parents 5b79d74 + 9a9feca commit a0bddc4
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 4 deletions.
10 changes: 9 additions & 1 deletion ariba/refcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@ class Error (Exception): pass


class Checker:
def __init__(self, infile, min_length=1):
def __init__(self, infile, min_length=1, max_length=10000):
    '''Remember the absolute path of infile and the allowed gene length range.

    infile: name of the sequence file to be checked.
    min_length: sequences shorter than this are reported as too short.
    max_length: sequences longer than this are reported as too long.

    Raises Error if infile does not exist.'''
    self.infile = os.path.abspath(infile)
    file_is_missing = not os.path.exists(self.infile)
    if file_is_missing:
        message = 'File not found: "' + self.infile + '". Cannot continue'
        raise Error(message)

    self.min_length, self.max_length = min_length, max_length


def check(self, error_code_on_exit=None):
Expand All @@ -21,6 +22,8 @@ def check(self, error_code_on_exit=None):
return False, 'Not a gene', seq
elif len(seq) < self.min_length:
return False, 'Too short', seq
elif len(seq) > self.max_length:
return False, 'Too long', seq

return True, None, None

Expand All @@ -42,6 +45,11 @@ def fix(self, outprefix):
print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh)
print(seq, file=bad_seqs_out_fh)
continue
elif len(seq) > self.max_length:
print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh)
print(seq, file=bad_seqs_out_fh)
continue


if not seq.looks_like_gene():
seq.revcomp()
Expand Down
9 changes: 7 additions & 2 deletions ariba/tasks/refcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,18 @@ def run():
usage = 'ariba refcheck [options] <infile>')
parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
parser.add_argument('-m', '--min_length', type=int, help='Minimum length in nucleotides of gene [%(default)s]', metavar='INT', default=6)
parser.add_argument('-n', '--max_length', type=int, help='Maximum length in nucleotides of gene [%(default)s]', metavar='INT', default=10000)
parser.add_argument('-o', '--outprefix', help='Prefix of output files. If this option is used, a fixed file will be output, together with information on what was changed in the input file. If this option is not used, the script dies if any input sequence is not OK')
parser.add_argument('infile', help='Input file containing genes to be checked', metavar='Filename')
options = parser.parse_args()

pyfastaq.sequences.genetic_code = options.genetic_code
checker = ariba.refcheck.Checker(options.infile, min_length=options.min_length)

checker = ariba.refcheck.Checker(
options.infile,
min_length=options.min_length,
max_length=options.max_length
)

if options.outprefix:
checker.fix(options.outprefix)
else:
Expand Down
2 changes: 2 additions & 0 deletions ariba/tests/data/refcheck_test_check_too_long.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>gene1
TTGTGGTGA
2 changes: 2 additions & 0 deletions ariba/tests/data/refcheck_test_fix_in.fa
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ TTGTCGTCGTCGTCGTAA
TTACGACGACGACGACGACAA
>not_a_gene
TTGTAATAATAA
>too_long
TTGTTGTTGTTGTTGTTGTCGTCGTCGTAA
2 changes: 2 additions & 0 deletions ariba/tests/data/refcheck_test_fix_out.removed.fa
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
TTGTCGTAA
>not_a_gene
TTGTAATAATAA
>too_long
TTGTTGTTGTTGTTGTTGTCGTCGTCGTAA
10 changes: 9 additions & 1 deletion ariba/tests/refcheck_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,19 @@ def test_check_file_fail_too_short(self):
self.assertEqual(c.check(), (False, 'Too short', seq))


def test_check_file_fail_too_long(self):
    '''check() should report a gene longer than max_length as too long'''
    # The fixture holds a single 9-nucleotide gene, so max_length=6 must reject it.
    fasta_path = os.path.join(data_dir, 'refcheck_test_check_too_long.fa')
    checker = refcheck.Checker(fasta_path, max_length=6)
    offending_seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA')
    expected = (False, 'Too long', offending_seq)
    self.assertEqual(checker.check(), expected)


def test_check_fix(self):
'''test fix'''
infile = os.path.join(data_dir, 'refcheck_test_fix_in.fa')
tmp_prefix = 'tmp.refcheck_test_fix.out'
c = refcheck.Checker(infile, min_length=10)
c = refcheck.Checker(infile, min_length=10, max_length=25)
c.fix(tmp_prefix)
for x in ['fa', 'log', 'rename', 'removed.fa']:
expected = os.path.join(data_dir, 'refcheck_test_fix_out.' + x)
Expand Down

0 comments on commit a0bddc4

Please sign in to comment.