From 1c5befe531bc459ed2d3af693bc1d4d35d5ca8ec Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 5 Aug 2016 12:28:15 +0000 Subject: [PATCH 1/3] New method _fasta_to_metadata --- ariba/ref_preparer.py | 13 +++++++++++++ .../ref_preparer_test_fasta_to_metadata.coding.tsv | 3 +++ .../data/ref_preparer_test_fasta_to_metadata.fa | 6 ++++++ ...ef_preparer_test_fasta_to_metadata.noncoding.tsv | 3 +++ ariba/tests/ref_preparer_test.py | 13 +++++++++++++ 5 files changed, 38 insertions(+) create mode 100644 ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv create mode 100644 ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa create mode 100644 ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py index 9fee5fd3..d7398eaa 100644 --- a/ariba/ref_preparer.py +++ b/ariba/ref_preparer.py @@ -1,6 +1,7 @@ import sys import os import pickle +import pyfastaq from ariba import reference_data class Error (Exception): pass @@ -42,6 +43,18 @@ def __init__(self, self.verbose = verbose + @classmethod + def _fasta_to_metadata(self, infile, outfile, all_coding): + seq_reader = pyfastaq.sequences.file_reader(infile) + f_out = pyfastaq.utils.open_file_write(outfile) + coding = '1' if all_coding else '0' + + for seq in seq_reader: + print(seq.id, coding, 0, '.', '.', '.', sep='\t', file=f_out) + + pyfastaq.utils.close(f_out) + + def _write_info_file(self, outfile): with open(outfile, 'w') as fout: for filename in self.fasta_files: diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv new file mode 100644 index 00000000..ac5cf27e --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv @@ -0,0 +1,3 @@ +seq1 1 0 . . . +seq2 1 0 . . . +seq3 1 0 . . . diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa new file mode 100644 index 00000000..be49b864 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa @@ -0,0 +1,6 @@ +>seq1 +CACTACAT +>seq2 +AAAA +>seq3 +GGGGG diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv new file mode 100644 index 00000000..c1c770b4 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv @@ -0,0 +1,3 @@ +seq1 0 0 . . . +seq2 0 0 . . . +seq3 0 0 . . . diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py index 604395e8..a338328d 100644 --- a/ariba/tests/ref_preparer_test.py +++ b/ariba/tests/ref_preparer_test.py @@ -9,6 +9,19 @@ class TestRefPreparer(unittest.TestCase): + def test_fasta_to_metadata(self): + '''test _fasta_to_metadata''' + infile = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.fa') + tmp_out = 'tmp.test_fasta_to_metadata.tsv' + expected_coding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.coding.tsv') + expected_noncoding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.noncoding.tsv') + ref_preparer.RefPreparer._fasta_to_metadata(infile, tmp_out, True) + self.assertTrue(filecmp.cmp(expected_coding, tmp_out, shallow=False)) + ref_preparer.RefPreparer._fasta_to_metadata(infile, tmp_out, False) + self.assertTrue(filecmp.cmp(expected_noncoding, tmp_out, shallow=False)) + os.unlink(tmp_out) + + def test_rename_clusters(self): '''test _rename_clusters''' clusters_in = { From 29ca3cd57974f60ac676a17606a336462432d24d Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 5 Aug 2016 12:50:38 +0000 Subject: [PATCH 2/3] Metadata files optional. Add all_coding option --- ariba/ref_preparer.py | 25 ++++++--- .../00.auto_metadata.tsv | 9 ++++ .../00.info.txt | 5 ++ .../00.version_info.txt | 5 ++ .../01.filter.check_genes.log | 0 .../01.filter.check_metadata.log | 0 .../01.filter.check_metadata.tsv | 9 ++++ .../02.cdhit.all.fa | 18 +++++++ .../02.cdhit.clusters.pickle | Bin 0 -> 346 bytes .../02.cdhit.clusters.tsv | 5 ++ .../02.cdhit.gene.fa | 0 .../02.cdhit.gene.varonly.fa | 0 .../02.cdhit.noncoding.fa | 18 +++++++ .../02.cdhit.noncoding.varonly.fa | 0 ariba/tests/ref_preparer_test.py | 49 ++++++++++++++++-- 15 files changed, 133 insertions(+), 10 deletions(-) create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.fa create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.varonly.fa create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa create mode 100644 ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py index d7398eaa..b34c6699 100644 --- a/ariba/ref_preparer.py +++ b/ariba/ref_preparer.py @@ -10,8 +10,9 @@ class Error (Exception): pass class RefPreparer: def __init__(self, fasta_files, - metadata_tsv_files, extern_progs, + metadata_tsv_files=None, + all_coding=None, version_report_lines=None, min_gene_length=6, max_gene_length=10000, @@ -31,7 +32,8 @@ def __init__(self, self.version_report_lines = version_report_lines self.fasta_files = fasta_files - self.metadata_tsv_files = metadata_tsv_files + self.metadata_tsv_files = [] if metadata_tsv_files is None else metadata_tsv_files + self.all_coding = all_coding self.min_gene_length = min_gene_length self.max_gene_length = max_gene_length self.genetic_code = genetic_code @@ -44,15 +46,12 @@ def __init__(self, @classmethod - def _fasta_to_metadata(self, infile, outfile, all_coding): + def _fasta_to_metadata(cls, infile, out_fh, all_coding): seq_reader = pyfastaq.sequences.file_reader(infile) - f_out = pyfastaq.utils.open_file_write(outfile) coding = '1' if all_coding else '0' for seq in seq_reader: - print(seq.id, coding, 0, '.', '.', '.', sep='\t', file=f_out) - - pyfastaq.utils.close(f_out) + print(seq.id, coding, 0, '.', '.', '.', sep='\t', file=out_fh) def _write_info_file(self, outfile): @@ -139,6 +138,18 @@ def run(self, outdir): print(file=f) print(*self.version_report_lines, sep='\n', file=f) + if self.all_coding is not None: + assert len(self.metadata_tsv_files) == 0 + assert self.all_coding in {'yes', 'no'} + self.metadata_tsv_files = [os.path.join(outdir, '00.auto_metadata.tsv')] + f_out = pyfastaq.utils.open_file_write(self.metadata_tsv_files[0]) + for fasta_file in self.fasta_files: + RefPreparer._fasta_to_metadata(fasta_file, f_out, self.all_coding=='yes') + pyfastaq.utils.close(f_out) + else: + assert self.all_coding is None + assert len(self.metadata_tsv_files) > 0 + self._write_info_file(os.path.join(outdir, '00.info.txt')) self.refdata = reference_data.ReferenceData( diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv new file mode 100644 index 00000000..d2dc5ffd --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv @@ -0,0 +1,9 @@ +gene1 0 0 . . . +gene2 0 0 . . . +gene3 0 0 . . . +gene4.var_only 0 0 . . . +noncoding1 0 0 . . . +noncoding2.var_only 0 0 . . . +noncoding3.var_only 0 0 . . . +noncoding4.var_only 0 0 . . . +cannot_make_into_a_gene 0 0 . . . diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt new file mode 100644 index 00000000..1b2cd69d --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt @@ -0,0 +1,5 @@ +input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa +input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa +input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa +input tsv file: tmp.ref_preparer_test_run/00.auto_metadata.tsv +genetic_code 1 diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt new file mode 100644 index 00000000..718ac6f2 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt @@ -0,0 +1,5 @@ +ARIBA run with this command: +python3 -m unittest prepareref ariba.tests.ref_preparer_test.TestRefPreparer.test_run_all_noncoding +from this directory: /home/ubuntu/sanger-pathogens/ariba + + diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv new file mode 100644 index 00000000..626e9a3d --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv @@ -0,0 +1,9 @@ +cannot_make_into_a_gene 0 0 . . . +gene1 0 0 . . . +gene2 0 0 . . . +gene3 0 0 . . . +gene4.var_only 0 0 . . . +noncoding1 0 0 . . . +noncoding2.var_only 0 0 . . . +noncoding3.var_only 0 0 . . . +noncoding4.var_only 0 0 . . . diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa new file mode 100644 index 00000000..339405a0 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa @@ -0,0 +1,18 @@ +>cannot_make_into_a_gene +AAAAAAAAAAAAAAAA +>gene1 +ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene2 +ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene3 +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>gene4.var_only +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>noncoding1 +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding2.var_only +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding3.var_only +CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA +>noncoding4.var_only +CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle new file mode 100644 index 0000000000000000000000000000000000000000..684397fe40975a70c1d5943a25d5430a2159de69 GIT binary patch literal 346 zcmY+;T~ER=7zN-70x|_rK>UL1C59y%{E}Yv#x!L#xH#YDY@LZ0CfxA9Z4=qjyXNFP zJ@2vmiy-3tAP5eM)bcannot_make_into_a_gene +AAAAAAAAAAAAAAAA +>gene1 +ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene2 +ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene3 +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>gene4.var_only +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>noncoding1 +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding2.var_only +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding3.var_only +CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA +>noncoding4.var_only +CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py index a338328d..23b15741 100644 --- a/ariba/tests/ref_preparer_test.py +++ b/ariba/tests/ref_preparer_test.py @@ -15,10 +15,15 @@ def test_fasta_to_metadata(self): tmp_out = 'tmp.test_fasta_to_metadata.tsv' expected_coding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.coding.tsv') expected_noncoding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.noncoding.tsv') - ref_preparer.RefPreparer._fasta_to_metadata(infile, tmp_out, True) + + with open(tmp_out, 'w') as f: + ref_preparer.RefPreparer._fasta_to_metadata(infile, f, True) self.assertTrue(filecmp.cmp(expected_coding, tmp_out, shallow=False)) - ref_preparer.RefPreparer._fasta_to_metadata(infile, tmp_out, False) + + with open(tmp_out, 'w') as f: + ref_preparer.RefPreparer._fasta_to_metadata(infile, f, False) self.assertTrue(filecmp.cmp(expected_noncoding, tmp_out, shallow=False)) + os.unlink(tmp_out) @@ -85,7 +90,7 @@ def test_run(self): ] extern_progs = external_progs.ExternalProgs() - refprep = ref_preparer.RefPreparer(fasta_in, tsv_in, extern_progs, genetic_code=1) + refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, metadata_tsv_files=tsv_in, genetic_code=1) tmp_out = 'tmp.ref_preparer_test_run' refprep.run(tmp_out) expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out') @@ -109,3 +114,41 @@ def test_run(self): shutil.rmtree(tmp_out) + + def test_run_all_noncoding(self): + '''test run with no metadata input, all sequences are noncoding''' + fasta_in = [ + os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'), + os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'), + os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'), + ] + + extern_progs = external_progs.ExternalProgs() + refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, all_coding='no', genetic_code=1) + tmp_out = 'tmp.ref_preparer_test_run' + refprep.run(tmp_out) + expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out') + + test_files = [ + '00.auto_metadata.tsv', + '01.filter.check_metadata.tsv', + '01.filter.check_genes.log', + '01.filter.check_metadata.log', + '02.cdhit.all.fa', + '02.cdhit.clusters.tsv', + '02.cdhit.gene.fa', + '02.cdhit.gene.varonly.fa', + '02.cdhit.noncoding.fa', + '02.cdhit.noncoding.varonly.fa', + ] + + for filename in test_files: + expected = os.path.join(expected_outdir, filename) + got = os.path.join(tmp_out, filename) + self.assertTrue(filecmp.cmp(expected, got, shallow=False)) + + shutil.rmtree(tmp_out) + + + + From 31bcf7d15aee6d4e574f18fb1d303c38ff65f7ce Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 5 Aug 2016 13:03:26 +0000 Subject: [PATCH 3/3] Add --all_coding option to prepareref --- ariba/tasks/prepareref.py | 3 ++- scripts/ariba | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ariba/tasks/prepareref.py b/ariba/tasks/prepareref.py index 7a7591ab..ef52684a 100644 --- a/ariba/tasks/prepareref.py +++ b/ariba/tasks/prepareref.py @@ -12,8 +12,9 @@ def run(options): preparer = ref_preparer.RefPreparer( options.fasta_files, - options.tsv_files, extern_progs, + metadata_tsv_files=options.tsv_files, + all_coding=options.all_coding, version_report_lines=version_report_lines, min_gene_length=options.min_gene_length, max_gene_length=options.max_gene_length, diff --git a/scripts/ariba b/scripts/ariba index 43c37a3e..bb6627e3 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -61,11 +61,13 @@ subparser_prepareref = subparsers.add_parser( help='Prepare reference data for input to "run"', usage='ariba prepareref [options] ', description='Prepare reference data for running the pipeline with "ariba run"', - epilog='REQUIRED: -f and -m must each be used at least once', + epilog='REQUIRED: -f/--fasta, and also either -m/--metadata or --all_coding must be used', ) input_group = subparser_prepareref.add_argument_group('input files options') input_group.add_argument('-f', '--fasta', action='append', dest='fasta_files', required=True, help='REQUIRED. Name of fasta file. Can be used more than once if your sequences are spread over more than on file', metavar='FILENAME') -input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', required=True, help='REQUIRED. Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file', metavar='FILENAME') +meta_group = input_group.add_mutually_exclusive_group(required=True) +meta_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', help='Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file. Incompatible with --all_coding', metavar='FILENAME') +meta_group.add_argument('--all_coding', choices=['yes', 'no'], help='Use this if you only have a fasta of presence absence sequences as input, and no metadata. Use "yes" if all sequences are coding, or "no" if they are all non-coding. Incompatible with -m/--metadata') cdhit_group = subparser_prepareref.add_argument_group('cd-hit options') cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')