diff --git a/ariba/refcheck.py b/ariba/refcheck.py index 0e333d0a..cbe1dc66 100644 --- a/ariba/refcheck.py +++ b/ariba/refcheck.py @@ -5,12 +5,13 @@ class Error (Exception): pass class Checker: - def __init__(self, infile, min_length=1): + def __init__(self, infile, min_length=1, max_length=10000): self.infile = os.path.abspath(infile) if not os.path.exists(self.infile): raise Error('File not found: "' + self.infile + '". Cannot continue') self.min_length = min_length + self.max_length = max_length def check(self, error_code_on_exit=None): @@ -21,6 +22,8 @@ def check(self, error_code_on_exit=None): return False, 'Not a gene', seq elif len(seq) < self.min_length: return False, 'Too short', seq + elif len(seq) > self.max_length: + return False, 'Too long', seq return True, None, None @@ -42,6 +45,11 @@ def fix(self, outprefix): print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh) print(seq, file=bad_seqs_out_fh) continue + elif len(seq) > self.max_length: + print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh) + print(seq, file=bad_seqs_out_fh) + continue + if not seq.looks_like_gene(): seq.revcomp() diff --git a/ariba/tasks/refcheck.py b/ariba/tasks/refcheck.py index db61b95d..7f8f2f2e 100644 --- a/ariba/tasks/refcheck.py +++ b/ariba/tasks/refcheck.py @@ -9,13 +9,18 @@ def run(): usage = 'ariba refcheck [options] ') parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT') parser.add_argument('-m', '--min_length', type=int, help='Minimum length in nucleotides of gene [%(default)s]', metavar='INT', default=6) + parser.add_argument('-n', '--max_length', type=int, help='Maximum length in nucleotides of gene [%(default)s]', metavar='INT', default=10000) parser.add_argument('-o', '--outprefix', help='Prefix of output files. If this option is used, a fixed file will be output, together with information on what was changed in the input file. If this option is not used, the script dies if any input sequence is not OK') parser.add_argument('infile', help='Input file containing genes to be checked', metavar='Filename') options = parser.parse_args() pyfastaq.sequences.genetic_code = options.genetic_code - checker = ariba.refcheck.Checker(options.infile, min_length=options.min_length) - + checker = ariba.refcheck.Checker( + options.infile, + min_length=options.min_length, + max_length=options.max_length + ) + if options.outprefix: checker.fix(options.outprefix) else: diff --git a/ariba/tests/data/refcheck_test_check_too_long.fa b/ariba/tests/data/refcheck_test_check_too_long.fa new file mode 100644 index 00000000..6a210e67 --- /dev/null +++ b/ariba/tests/data/refcheck_test_check_too_long.fa @@ -0,0 +1,2 @@ +>gene1 +TTGTGGTGA diff --git a/ariba/tests/data/refcheck_test_fix_in.fa b/ariba/tests/data/refcheck_test_fix_in.fa index 75606448..8bdd6c0f 100644 --- a/ariba/tests/data/refcheck_test_fix_in.fa +++ b/ariba/tests/data/refcheck_test_fix_in.fa @@ -12,3 +12,5 @@ TTGTCGTCGTCGTCGTAA TTACGACGACGACGACGACAA >not_a_gene TTGTAATAATAA +>too_long +TTGTTGTTGTTGTTGTTGTCGTCGTCGTAA diff --git a/ariba/tests/data/refcheck_test_fix_out.removed.fa b/ariba/tests/data/refcheck_test_fix_out.removed.fa index f6a7d446..99e3040e 100644 --- a/ariba/tests/data/refcheck_test_fix_out.removed.fa +++ b/ariba/tests/data/refcheck_test_fix_out.removed.fa @@ -2,3 +2,5 @@ TTGTCGTAA >not_a_gene TTGTAATAATAA +>too_long +TTGTTGTTGTTGTTGTTGTCGTCGTCGTAA diff --git a/ariba/tests/refcheck_test.py b/ariba/tests/refcheck_test.py index abae3e5f..6171eb1d 100644 --- a/ariba/tests/refcheck_test.py +++ b/ariba/tests/refcheck_test.py @@ -32,11 +32,19 @@ def test_check_file_fail_too_short(self): self.assertEqual(c.check(), (False, 'Too short', seq)) + def test_check_file_fail_too_long(self): + '''test check file fail long gene''' + infile = os.path.join(data_dir, 'refcheck_test_check_too_long.fa') + c = refcheck.Checker(infile, max_length=6) + seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA') + self.assertEqual(c.check(), (False, 'Too long', seq)) + + def test_check_fix(self): '''test fix''' infile = os.path.join(data_dir, 'refcheck_test_fix_in.fa') tmp_prefix = 'tmp.refcheck_test_fix.out' - c = refcheck.Checker(infile, min_length=10) + c = refcheck.Checker(infile, min_length=10, max_length=25) c.fix(tmp_prefix) for x in ['fa', 'log', 'rename', 'removed.fa']: expected = os.path.join(data_dir, 'refcheck_test_fix_out.' + x)