diff --git a/CHANGELOG.md b/CHANGELOG.md index 7951d3cc..71f94e67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Change Log +[v2.14.3](https://github.com/sanger-pathogens/ariba/tree/v2.14.3) (2019-08-23) +[Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.2...v2.14.3) + +**Fixed bugs:** + +- Version 3.0.3 of CARD breaks prepareref [\#278](https://github.com/sanger-pathogens/ariba/issues/278) +- RT 667288: Change docker file Ariba git clone to a copy + [v2.14.2](https://github.com/sanger-pathogens/ariba/tree/v2.14.2) (2019-06-18) [Full Changelog](https://github.com/sanger-pathogens/ariba/compare/v2.14.1...v2.14.2) diff --git a/Dockerfile b/Dockerfile index 4a577ad7..aefacb9f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,6 +8,7 @@ MAINTAINER ariba-help@sanger.ac.uk ARG BOWTIE2_VERSION=2.2.9 ARG SPADES_VERSION=3.13.1 ARG ARIBA_TAG=master +ARG ARIBA_BUILD_DIR=/ariba RUN apt-get -qq update && \ apt-get install --no-install-recommends -y \ @@ -27,10 +28,12 @@ RUN apt-get -qq update && \ wget \ zlib1g-dev +# Install bowtie RUN wget -q http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/${BOWTIE2_VERSION}/bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \ && unzip bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip \ && rm -f bowtie2-${BOWTIE2_VERSION}-linux-x86_64.zip +# Install SPAdes RUN wget -q https://github.com/ablab/spades/releases/download/v${SPADES_VERSION}/SPAdes-${SPADES_VERSION}-Linux.tar.gz \ && tar -zxf SPAdes-${SPADES_VERSION}-Linux.tar.gz \ && rm -f SPAdes-${SPADES_VERSION}-Linux.tar.gz @@ -40,13 +43,15 @@ RUN wget -q https://github.com/ablab/spades/releases/download/v${SPADES_VERSION} ENV ARIBA_BOWTIE2=$PWD/bowtie2-${BOWTIE2_VERSION}/bowtie2 ARIBA_CDHIT=cdhit-est MPLBACKEND="agg" ENV PATH=$PATH:$PWD/SPAdes-${SPADES_VERSION}-Linux/bin -RUN cd /usr/local/bin && ln -s /usr/bin/python3 python && cd +RUN ln -s -f /usr/bin/python3 /usr/local/bin/python -RUN git clone https://github.com/sanger-pathogens/ariba.git \ - && cd ariba \ - && git checkout ${ARIBA_TAG} \ - && rm -rf .git \ +# Install Ariba +RUN mkdir -p $ARIBA_BUILD_DIR +COPY . $ARIBA_BUILD_DIR +RUN cd $ARIBA_BUILD_DIR \ + && python3 setup.py clean --all \ && python3 setup.py test \ - && python3 setup.py install + && python3 setup.py install \ + && rm -rf $ARIBA_BUILD_DIR CMD ariba diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py index 89ace9d1..d9720404 100644 --- a/ariba/ref_preparer.py +++ b/ariba/ref_preparer.py @@ -16,6 +16,8 @@ def __init__(self, version_report_lines=None, min_gene_length=6, max_gene_length=10000, + min_noncoding_length=6, + max_noncoding_length=20000, genetic_code=11, cdhit_min_id=0.9, cdhit_min_length=0.0, @@ -38,6 +40,8 @@ def __init__(self, self.all_coding = all_coding self.min_gene_length = min_gene_length self.max_gene_length = max_gene_length + self.min_noncoding_length = min_noncoding_length + self.max_noncoding_length = max_noncoding_length self.genetic_code = genetic_code self.cdhit_min_id = cdhit_min_id self.cdhit_min_length = cdhit_min_length @@ -177,6 +181,8 @@ def run(self, outdir): self.metadata_tsv_files, min_gene_length=self.min_gene_length, max_gene_length=self.max_gene_length, + min_noncoding_length = self.min_noncoding_length, + max_noncoding_length = self.max_noncoding_length, genetic_code=self.genetic_code, ) @@ -213,8 +219,9 @@ def run(self, outdir): pickle.dump(clusters, f) if number_of_removed_seqs > 0: - print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the log file 01.filter.check_genes.log for details. This will show them:', file=sys.stderr) + print('WARNING.', number_of_removed_seqs, 'sequence(s) excluded. Please see the 01.filter.check_genes.log and 01.filter.check_noncoding.log for details. This will show them:', file=sys.stderr) print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr) + print(' cat', os.path.join(outdir, '01.filter.check_noncoding.log'), file=sys.stderr) if number_of_bad_variants_logged > 0: print('WARNING. Problem with at least one variant. Problem variants are removed. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr) diff --git a/ariba/reference_data.py b/ariba/reference_data.py index 369914b1..1df1b8e3 100644 --- a/ariba/reference_data.py +++ b/ariba/reference_data.py @@ -19,6 +19,8 @@ def __init__(self, rename_file=None, min_gene_length=6, max_gene_length=10000, + min_noncoding_length=6, + max_noncoding_length=20000, genetic_code=11, parameters_file=None, ): @@ -26,6 +28,8 @@ def __init__(self, self.seq_dicts = {} self.min_gene_length = min_gene_length self.max_gene_length = max_gene_length + self.min_noncoding_length = min_noncoding_length + self.max_noncoding_length = max_noncoding_length self.sequences, self.metadata = ReferenceData._load_input_files_and_check_seq_names(fasta_files, metadata_tsv_files) if len(self.sequences) == 0: @@ -208,7 +212,7 @@ def _filter_bad_variant_data(cls, sequences, all_metadata, out_prefix, removed_s for sequence_name, metadata_dict in sorted(all_metadata.items()): if sequence_name in removed_sequences: - print(sequence_name, 'was removed because does not look like a gene, so removing its metadata', file=log_fh) + print(sequence_name, 'was removed because it failed filtering checks, so removing its metadata', file=log_fh) log_lines += 1 del all_metadata[sequence_name] continue @@ -278,6 +282,16 @@ def _try_to_get_gene_seq(cls, seq, min_length, max_length): return got[0], 'KEEP\tMade into gene. strand=' + got[1] + ', frame=' + str(got[2]) + @classmethod + def _check_noncoding_seq(cls, seq, min_length, max_length): + if len(seq) < min_length: + return False, 'REMOVE\tToo short. Length: ' + str(len(seq)) + elif len(seq) > max_length: + return False, 'REMOVE\tToo long. Length: ' + str(len(seq)) + else: + return True, None + + @classmethod def _remove_bad_genes(cls, sequences, metadata, log_file, min_gene_length, max_gene_length): to_remove = set() @@ -308,11 +322,46 @@ def _remove_bad_genes(cls, sequences, metadata, log_file, min_gene_length, max_g return to_remove + @classmethod + def _remove_bad_noncoding_seqs(cls, sequences, metadata, log_file, min_noncoding_length, max_noncoding_length): + to_remove = set() + + if len(sequences) == 0: + return to_remove + + log_fh = pyfastaq.utils.open_file_write(log_file) + + for name in sorted(sequences): + if metadata[name]['seq_type'] != 'n': + continue + + valid, message = ReferenceData._check_noncoding_seq(sequences[name], min_noncoding_length, max_noncoding_length) + if not valid: + to_remove.add(name) + + if message is not None: + print(name, message, sep='\t', file=log_fh) + + pyfastaq.utils.close(log_fh) + + for name in to_remove: + sequences.pop(name) + + return to_remove + def sanity_check(self, outprefix): - removed_seqs = self._remove_bad_genes(self.sequences, self.metadata, outprefix + '.check_genes.log', self.min_gene_length, self.max_gene_length) - log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, removed_seqs) - return len(removed_seqs), log_lines + removed_gene_seqs = self._remove_bad_genes(self.sequences, + self.metadata, outprefix + '.check_genes.log', + self.min_gene_length, self.max_gene_length) + + removed_noncoding_seqs = self._remove_bad_noncoding_seqs(self.sequences, self.metadata, + outprefix + '.check_noncoding.log', self.min_noncoding_length, + self.max_noncoding_length) + + all_removed_seqs = removed_gene_seqs.union(removed_noncoding_seqs) + log_lines = ReferenceData._filter_bad_variant_data(self.sequences, self.metadata, outprefix, all_removed_seqs) + return len(all_removed_seqs), log_lines @classmethod def _new_seq_name(cls, name): diff --git a/ariba/tasks/prepareref.py b/ariba/tasks/prepareref.py index 401892d2..aa7ac0ca 100644 --- a/ariba/tasks/prepareref.py +++ b/ariba/tasks/prepareref.py @@ -18,6 +18,8 @@ def run(options): version_report_lines=version_report_lines, min_gene_length=options.min_gene_length, max_gene_length=options.max_gene_length, + min_noncoding_length=options.min_noncoding_length, + max_noncoding_length=options.max_noncoding_length, genetic_code=options.genetic_code, cdhit_min_id=options.cdhit_min_id, cdhit_min_length=options.cdhit_min_length, diff --git a/ariba/tests/data/ref_preparer_test_run.in.4.fa b/ariba/tests/data/ref_preparer_test_run.in.4.fa new file mode 100644 index 00000000..d65d9ce1 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run.in.4.fa @@ -0,0 +1,17 @@ +>noncoding1-toolong +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding2 +CTACTGAT +>cannot_make_into_a_gene +AAAAAAAAAAAAAAAA +>noncoding3-tooshort +C +>gene1 +ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>noncoding4-toolong +CTACTGATCATCTACTATCTG +>noncoding5-tooshort +CTCTC +>gene2 +ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA + diff --git a/ariba/tests/data/ref_preparer_test_run.in.4.tsv b/ariba/tests/data/ref_preparer_test_run.in.4.tsv new file mode 100644 index 00000000..9231a6db --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run.in.4.tsv @@ -0,0 +1,8 @@ +cannot_make_into_a_gene 1 0 . . . +noncoding1-toolong 0 0 . . . +noncoding2 0 0 C4T . . +noncoding3-tooshort 0 0 C4T . . +noncoding4-toolong 0 0 C4T . . +noncoding5-tooshort 0 0 C4T . . +gene1 1 0 . . . +gene2 1 0 . . . \ No newline at end of file diff --git a/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_metadata.log b/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_metadata.log index d36e7dda..4ca6d0f3 100644 --- a/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_metadata.log +++ b/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_metadata.log @@ -1 +1 @@ -cannot_make_into_a_gene was removed because does not look like a gene, so removing its metadata +cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata diff --git a/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_noncoding.log b/ariba/tests/data/ref_preparer_test_run.out/01.filter.check_noncoding.log new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_noncoding.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_noncoding.log new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.info.txt b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.info.txt new file mode 100644 index 00000000..b58a3f83 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.info.txt @@ -0,0 +1,3 @@ +input fasta file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.fa +input tsv file: /Users/kp11/workspace/applications/Ariba/ariba/ariba/tests/data/ref_preparer_test_run.in.4.tsv +genetic_code 1 diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.rename_info b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.rename_info new file mode 100644 index 00000000..b4bee957 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.rename_info @@ -0,0 +1,4 @@ +noncoding1-toolong noncoding1_toolong +noncoding3-tooshort noncoding3_tooshort +noncoding4-toolong noncoding4_toolong +noncoding5-tooshort noncoding5_tooshort diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.version_info.txt b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.version_info.txt new file mode 100644 index 00000000..5687bb6b --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/00.version_info.txt @@ -0,0 +1,5 @@ +ARIBA run with this command: +setup.py prepareref test +from this directory: /Users/kp11/workspace/applications/Ariba/ariba + + diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_genes.log b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_genes.log new file mode 100644 index 00000000..8a81fd57 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_genes.log @@ -0,0 +1,3 @@ +cannot_make_into_a_gene REMOVE Does not look like a gene (tried both strands and all reading frames) AAAAAAAAAAAAAAAA +gene1 KEEP Made into gene. strand=+, frame=0 +gene2 KEEP Made into gene. strand=+, frame=0 diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_metadata.log b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_metadata.log new file mode 100644 index 00000000..4381be55 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_metadata.log @@ -0,0 +1,5 @@ +cannot_make_into_a_gene was removed because it failed filtering checks, so removing its metadata +noncoding1_toolong was removed because it failed filtering checks, so removing its metadata +noncoding3_tooshort was removed because it failed filtering checks, so removing its metadata +noncoding4_toolong was removed because it failed filtering checks, so removing its metadata +noncoding5_tooshort was removed because it failed filtering checks, so removing its metadata diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_metadata.tsv new file mode 100644 index 00000000..226e72c6 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_metadata.tsv @@ -0,0 +1,3 @@ +gene1 1 0 . . . +gene2 1 0 . . . +noncoding2 0 0 C4T . . diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_noncoding.log b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_noncoding.log new file mode 100644 index 00000000..7e2db480 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/01.filter.check_noncoding.log @@ -0,0 +1,4 @@ +noncoding1_toolong REMOVE Too long. Length: 38 +noncoding3_tooshort REMOVE Too short. Length: 1 +noncoding4_toolong REMOVE Too long. Length: 21 +noncoding5_tooshort REMOVE Too short. Length: 5 diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.all.fa b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.all.fa new file mode 100644 index 00000000..e089f12d --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.all.fa @@ -0,0 +1,6 @@ +>gene1 +ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene2 +ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>noncoding2 +CTACTGAT diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.clusters.pickle b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.clusters.pickle new file mode 100644 index 00000000..18f3e9de Binary files /dev/null and b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.clusters.pickle differ diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.clusters.tsv b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.clusters.tsv new file mode 100644 index 00000000..4faab3e7 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.clusters.tsv @@ -0,0 +1 @@ +cluster gene1 gene2 diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.gene.fa b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.gene.fa new file mode 100644 index 00000000..7f0c0406 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.gene.fa @@ -0,0 +1,4 @@ +>gene1 +ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene2 +ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.gene.varonly.fa b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.gene.varonly.fa new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.noncoding.fa b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.noncoding.fa new file mode 100644 index 00000000..d25fd70c --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.noncoding.fa @@ -0,0 +1,2 @@ +>noncoding2 +CTACTGAT diff --git a/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.noncoding.varonly.fa b/ariba/tests/data/ref_preparer_test_run_noncoding_checks.out/02.cdhit.noncoding.varonly.fa new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/refdata_query_prepareref/01.filter.check_noncoding.log b/ariba/tests/data/refdata_query_prepareref/01.filter.check_noncoding.log new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/reference_data_remove_bad_noncoding.in.fa b/ariba/tests/data/reference_data_remove_bad_noncoding.in.fa new file mode 100644 index 00000000..037e5f88 --- /dev/null +++ b/ariba/tests/data/reference_data_remove_bad_noncoding.in.fa @@ -0,0 +1,10 @@ +>noncoding1 +AAAA +>noncoding2 +GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG +>noncoding3 +CCCCCC +>noncoding4 +TTTTTTTTTTTTTTT +>noncoding5 +AAAAAAAAAAAA diff --git a/ariba/tests/data/reference_data_remove_bad_noncoding.in.tsv b/ariba/tests/data/reference_data_remove_bad_noncoding.in.tsv new file mode 100644 index 00000000..bc496739 --- /dev/null +++ b/ariba/tests/data/reference_data_remove_bad_noncoding.in.tsv @@ -0,0 +1,5 @@ +noncoding1 0 0 . . . +noncoding2 0 0 . . . +noncoding3 0 0 . . . +noncoding4 0 0 . . . +noncoding5 0 0 . . . diff --git a/ariba/tests/data/reference_data_test_remove_bad_noncoding.log b/ariba/tests/data/reference_data_test_remove_bad_noncoding.log new file mode 100644 index 00000000..f6c97e1f --- /dev/null +++ b/ariba/tests/data/reference_data_test_remove_bad_noncoding.log @@ -0,0 +1,2 @@ +noncoding1 REMOVE Too short. Length: 4 +noncoding2 REMOVE Too long. Length: 133 diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py index 6dd901c1..8185f272 100644 --- a/ariba/tests/ref_preparer_test.py +++ b/ariba/tests/ref_preparer_test.py @@ -117,6 +117,7 @@ def test_run(self): test_files = [ '01.filter.check_metadata.tsv', '01.filter.check_genes.log', + '01.filter.check_noncoding.log', '01.filter.check_metadata.log', '02.cdhit.all.fa', '02.cdhit.clusters.tsv', @@ -152,6 +153,7 @@ def test_run_all_noncoding(self): '00.auto_metadata.tsv', '01.filter.check_metadata.tsv', '01.filter.check_genes.log', + '01.filter.check_noncoding.log', '01.filter.check_metadata.log', '02.cdhit.all.fa', '02.cdhit.clusters.tsv', @@ -168,6 +170,41 @@ def test_run_all_noncoding(self): common.rmtree(tmp_out) + def test_run_noncoding_checks(self): + '''test run with noncoding sequences that are outside of the allowed size range''' + fasta_in = [ + os.path.join(data_dir, 'ref_preparer_test_run.in.4.fa') + ] + tsv_in = [ + os.path.join(data_dir, 'ref_preparer_test_run.in.4.tsv') + ] + + extern_progs = external_progs.ExternalProgs() + refprep = ref_preparer.RefPreparer( + fasta_in, extern_progs, min_noncoding_length=6, max_noncoding_length=20, + metadata_tsv_files=tsv_in, genetic_code=1) + tmp_out = 'tmp.ref_preparer_test_run_noncoding_checks' + refprep.run(tmp_out) + expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_noncoding_checks.out') + test_files = [ + '01.filter.check_metadata.tsv', + '01.filter.check_genes.log', + '01.filter.check_noncoding.log', + '01.filter.check_metadata.log', + '02.cdhit.all.fa', + '02.cdhit.clusters.tsv', + '02.cdhit.gene.fa', + '02.cdhit.gene.varonly.fa', + '02.cdhit.noncoding.fa', + '02.cdhit.noncoding.varonly.fa', + ] + + for filename in test_files: + expected = os.path.join(expected_outdir, filename) + got = os.path.join(tmp_out, filename) + self.assertTrue(filecmp.cmp(expected, got, shallow=False)) + + common.rmtree(tmp_out) diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py index 076a1ed2..58bd3603 100644 --- a/ariba/tests/reference_data_test.py +++ b/ariba/tests/reference_data_test.py @@ -265,6 +265,19 @@ def test_try_to_get_gene_seq(self): for seq, got_seq, message in tests: self.assertEqual((got_seq, message), reference_data.ReferenceData._try_to_get_gene_seq(seq, 6, 99)) + def test_check_noncoding_seq(self): + '''Test _check_noncoding_seq''' + tests = [ + (pyfastaq.sequences.Fasta('x', 'A' * 3), False, 'REMOVE\tToo short. Length: 3'), + (pyfastaq.sequences.Fasta('x', 'A' * 21), False, 'REMOVE\tToo long. Length: 21'), + (pyfastaq.sequences.Fasta('x', 'A' * 5), True, None), + (pyfastaq.sequences.Fasta('x', 'A' * 4), True, None), + (pyfastaq.sequences.Fasta('x', 'A' * 20), True, None) + ] + + for seq, valid, message in tests: + self.assertEqual((valid, message), reference_data.ReferenceData._check_noncoding_seq(seq, 4, 20)) + def test_remove_bad_genes(self): '''Test _remove_bad_genes''' @@ -287,6 +300,29 @@ def test_remove_bad_genes(self): os.unlink(tmp_log) + def test_remove_bad_noncoding_seqs(self): + '''Test _remove_bad_noncoding_seqs''' + test_seq_dict = {} + fasta_file = os.path.join(data_dir, 'reference_data_remove_bad_noncoding.in.fa') + metadata_file = os.path.join(data_dir, 'reference_data_remove_bad_noncoding.in.tsv') + metadata = reference_data.ReferenceData._load_all_metadata_tsvs([metadata_file]) + pyfastaq.tasks.file_to_dict(fasta_file, test_seq_dict) + tmp_log = 'tmp.test_remove_bad_noncoding.log' + expected_removed = {'noncoding1','noncoding2'} + got_removed = reference_data.ReferenceData._remove_bad_noncoding_seqs(test_seq_dict, metadata, tmp_log, + min_noncoding_length=6, max_noncoding_length=15) + self.assertEqual(expected_removed, got_removed) + expected_dict = { + 'noncoding3': pyfastaq.sequences.Fasta('noncoding3', 'CCCCCC'), + 'noncoding4': pyfastaq.sequences.Fasta('noncoding4', 'TTTTTTTTTTTTTTT'), + 'noncoding5': pyfastaq.sequences.Fasta('noncoding5', 'AAAAAAAAAAAA') + } + self.assertEqual(expected_dict, test_seq_dict) + expected_log = os.path.join(data_dir, 'reference_data_test_remove_bad_noncoding.log') + self.assertTrue(filecmp.cmp(expected_log, tmp_log, shallow=False)) + os.unlink(tmp_log) + + def test_new_seq_name(self): '''Test _new_seq_name''' tests = [ diff --git a/scripts/ariba b/scripts/ariba index 38f5dee0..7151afdf 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -141,6 +141,8 @@ cdhit_group.add_argument('--cdhit_max_memory', type=int, help='Memory limit in M other_prep_group = subparser_prepareref.add_argument_group('other options') other_prep_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6) other_prep_group.add_argument('--max_gene_length', type=int, help='Maximum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=10000) +other_prep_group.add_argument('--min_noncoding_length', type=int, help='Minimum allowed length in nucleotides of non-coding sequences [%(default)s]', metavar='INT', default=6) +other_prep_group.add_argument('--max_noncoding_length', type=int, help='Maximum allowed length in nucleotides of non-coding sequences [%(default)s]', metavar='INT', default=20000) other_prep_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT') other_prep_group.add_argument('--force', action='store_true', help='Overwrite output directory, if it already exists') other_prep_group.add_argument('--threads', type=int, help='Number of threads (currently only applies to cdhit) [%(default)s]', default=1, metavar='INT') diff --git a/setup.py b/setup.py index c569b286..5330ca45 100644 --- a/setup.py +++ b/setup.py @@ -55,12 +55,12 @@ setup( ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod], name='ariba', - version='2.14.2', + version='2.14.3', description='ARIBA: Antibiotic Resistance Identification By Assembly', packages = find_packages(), package_data={'ariba': ['test_run_data/*', 'tb_data/*']}, author='Martin Hunt', - author_email='path-help@sanger.ac.uk', + author_email='ariba-help@sanger.ac.uk', url='https://github.com/sanger-pathogens/ariba', scripts=glob.glob('scripts/*'), test_suite='nose.collector', @@ -72,7 +72,7 @@ 'pyfastaq >= 3.12.0', 'pysam >= 0.9.1', 'pymummer<=0.10.3', - 'matplotlib>=3.1.0', + 'matplotlib >= 3.1.0', ], license='GPLv3', classifiers=[