diff --git a/ariba/ref_preparer.py b/ariba/ref_preparer.py index 9fee5fd3..b34c6699 100644 --- a/ariba/ref_preparer.py +++ b/ariba/ref_preparer.py @@ -1,6 +1,7 @@ import sys import os import pickle +import pyfastaq from ariba import reference_data class Error (Exception): pass @@ -9,8 +10,9 @@ class Error (Exception): pass class RefPreparer: def __init__(self, fasta_files, - metadata_tsv_files, extern_progs, + metadata_tsv_files=None, + all_coding=None, version_report_lines=None, min_gene_length=6, max_gene_length=10000, @@ -30,7 +32,8 @@ def __init__(self, self.version_report_lines = version_report_lines self.fasta_files = fasta_files - self.metadata_tsv_files = metadata_tsv_files + self.metadata_tsv_files = [] if metadata_tsv_files is None else metadata_tsv_files + self.all_coding = all_coding self.min_gene_length = min_gene_length self.max_gene_length = max_gene_length self.genetic_code = genetic_code @@ -42,6 +45,15 @@ def __init__(self, self.verbose = verbose + @classmethod + def _fasta_to_metadata(cls, infile, out_fh, all_coding): + seq_reader = pyfastaq.sequences.file_reader(infile) + coding = '1' if all_coding else '0' + + for seq in seq_reader: + print(seq.id, coding, 0, '.', '.', '.', sep='\t', file=out_fh) + + def _write_info_file(self, outfile): with open(outfile, 'w') as fout: for filename in self.fasta_files: @@ -126,6 +138,18 @@ def run(self, outdir): print(file=f) print(*self.version_report_lines, sep='\n', file=f) + if self.all_coding is not None: + assert len(self.metadata_tsv_files) == 0 + assert self.all_coding in {'yes', 'no'} + self.metadata_tsv_files = [os.path.join(outdir, '00.auto_metadata.tsv')] + f_out = pyfastaq.utils.open_file_write(self.metadata_tsv_files[0]) + for fasta_file in self.fasta_files: + RefPreparer._fasta_to_metadata(fasta_file, f_out, self.all_coding=='yes') + pyfastaq.utils.close(f_out) + else: + assert self.all_coding is None + assert len(self.metadata_tsv_files) > 0 + self._write_info_file(os.path.join(outdir, '00.info.txt')) self.refdata = reference_data.ReferenceData( diff --git a/ariba/tasks/prepareref.py b/ariba/tasks/prepareref.py index 7a7591ab..ef52684a 100644 --- a/ariba/tasks/prepareref.py +++ b/ariba/tasks/prepareref.py @@ -12,8 +12,9 @@ def run(options): preparer = ref_preparer.RefPreparer( options.fasta_files, - options.tsv_files, extern_progs, + metadata_tsv_files=options.tsv_files, + all_coding=options.all_coding, version_report_lines=version_report_lines, min_gene_length=options.min_gene_length, max_gene_length=options.max_gene_length, diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv new file mode 100644 index 00000000..ac5cf27e --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.coding.tsv @@ -0,0 +1,3 @@ +seq1 1 0 . . . +seq2 1 0 . . . +seq3 1 0 . . . diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa new file mode 100644 index 00000000..be49b864 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.fa @@ -0,0 +1,6 @@ +>seq1 +CACTACAT +>seq2 +AAAA +>seq3 +GGGGG diff --git a/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv new file mode 100644 index 00000000..c1c770b4 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_fasta_to_metadata.noncoding.tsv @@ -0,0 +1,3 @@ +seq1 0 0 . . . +seq2 0 0 . . . +seq3 0 0 . . . diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv new file mode 100644 index 00000000..d2dc5ffd --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.auto_metadata.tsv @@ -0,0 +1,9 @@ +gene1 0 0 . . . +gene2 0 0 . . . +gene3 0 0 . . . +gene4.var_only 0 0 . . . +noncoding1 0 0 . . . +noncoding2.var_only 0 0 . . . +noncoding3.var_only 0 0 . . . +noncoding4.var_only 0 0 . . . +cannot_make_into_a_gene 0 0 . . . diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt new file mode 100644 index 00000000..1b2cd69d --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.info.txt @@ -0,0 +1,5 @@ +input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.1.fa +input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.2.fa +input fasta file: /home/ubuntu/sanger-pathogens/ariba/ariba/tests/data/ref_preparer_test_run.in.3.fa +input tsv file: tmp.ref_preparer_test_run/00.auto_metadata.tsv +genetic_code 1 diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt new file mode 100644 index 00000000..718ac6f2 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/00.version_info.txt @@ -0,0 +1,5 @@ +ARIBA run with this command: +python3 -m unittest prepareref ariba.tests.ref_preparer_test.TestRefPreparer.test_run_all_noncoding +from this directory: /home/ubuntu/sanger-pathogens/ariba + + diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_genes.log new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.log new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv new file mode 100644 index 00000000..626e9a3d --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/01.filter.check_metadata.tsv @@ -0,0 +1,9 @@ +cannot_make_into_a_gene 0 0 . . . +gene1 0 0 . . . +gene2 0 0 . . . +gene3 0 0 . . . +gene4.var_only 0 0 . . . +noncoding1 0 0 . . . +noncoding2.var_only 0 0 . . . +noncoding3.var_only 0 0 . . . +noncoding4.var_only 0 0 . . . diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa new file mode 100644 index 00000000..339405a0 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.all.fa @@ -0,0 +1,18 @@ +>cannot_make_into_a_gene +AAAAAAAAAAAAAAAA +>gene1 +ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene2 +ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene3 +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>gene4.var_only +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>noncoding1 +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding2.var_only +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding3.var_only +CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA +>noncoding4.var_only +CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle new file mode 100644 index 00000000..684397fe Binary files /dev/null and b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.pickle differ diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv new file mode 100644 index 00000000..31dbee3b --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.clusters.tsv @@ -0,0 +1,5 @@ +cluster_1 gene1 gene2 +cluster_2 cannot_make_into_a_gene +gene4+ gene3 gene4.var_only +noncoding- noncoding1 noncoding2.var_only noncoding3.var_only +noncoding4 noncoding4.var_only diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.fa new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.varonly.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.gene.varonly.fa new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa new file mode 100644 index 00000000..339405a0 --- /dev/null +++ b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.fa @@ -0,0 +1,18 @@ +>cannot_make_into_a_gene +AAAAAAAAAAAAAAAA +>gene1 +ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene2 +ATGGATCGTGAAGCGATGACCCATGAAGCGACCGAACGCTAA +>gene3 +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>gene4.var_only +ATGACCGAAAGCAGCGAACGCGCGTGCACCTAA +>noncoding1 +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding2.var_only +CTACTGATCATCTACTATCTGCATCGATGCCTGATCTA +>noncoding3.var_only +CTACTGATTATCTACTATCTGCATCGATGCCTGATCTA +>noncoding4.var_only +CAACCACATGCAGTCATGCAACCAACACTCTCATCTAA diff --git a/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa b/ariba/tests/data/ref_preparer_test_run_all_noncoding.out/02.cdhit.noncoding.varonly.fa new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/ref_preparer_test.py b/ariba/tests/ref_preparer_test.py index 604395e8..23b15741 100644 --- a/ariba/tests/ref_preparer_test.py +++ b/ariba/tests/ref_preparer_test.py @@ -9,6 +9,24 @@ class TestRefPreparer(unittest.TestCase): + def test_fasta_to_metadata(self): + '''test _fasta_to_metadata''' + infile = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.fa') + tmp_out = 'tmp.test_fasta_to_metadata.tsv' + expected_coding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.coding.tsv') + expected_noncoding = os.path.join(data_dir, 'ref_preparer_test_fasta_to_metadata.noncoding.tsv') + + with open(tmp_out, 'w') as f: + ref_preparer.RefPreparer._fasta_to_metadata(infile, f, True) + self.assertTrue(filecmp.cmp(expected_coding, tmp_out, shallow=False)) + + with open(tmp_out, 'w') as f: + ref_preparer.RefPreparer._fasta_to_metadata(infile, f, False) + self.assertTrue(filecmp.cmp(expected_noncoding, tmp_out, shallow=False)) + + os.unlink(tmp_out) + + def test_rename_clusters(self): '''test _rename_clusters''' clusters_in = { @@ -72,7 +90,7 @@ def test_run(self): ] extern_progs = external_progs.ExternalProgs() - refprep = ref_preparer.RefPreparer(fasta_in, tsv_in, extern_progs, genetic_code=1) + refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, metadata_tsv_files=tsv_in, genetic_code=1) tmp_out = 'tmp.ref_preparer_test_run' refprep.run(tmp_out) expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out') @@ -96,3 +114,41 @@ def test_run(self): shutil.rmtree(tmp_out) + + def test_run_all_noncoding(self): + '''test run with no metadata input, all sequences are noncoding''' + fasta_in = [ + os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'), + os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'), + os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'), + ] + + extern_progs = external_progs.ExternalProgs() + refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, all_coding='no', genetic_code=1) + tmp_out = 'tmp.ref_preparer_test_run' + refprep.run(tmp_out) + expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out') + + test_files = [ + '00.auto_metadata.tsv', + '01.filter.check_metadata.tsv', + '01.filter.check_genes.log', + '01.filter.check_metadata.log', + '02.cdhit.all.fa', + '02.cdhit.clusters.tsv', + '02.cdhit.gene.fa', + '02.cdhit.gene.varonly.fa', + '02.cdhit.noncoding.fa', + '02.cdhit.noncoding.varonly.fa', + ] + + for filename in test_files: + expected = os.path.join(expected_outdir, filename) + got = os.path.join(tmp_out, filename) + self.assertTrue(filecmp.cmp(expected, got, shallow=False)) + + shutil.rmtree(tmp_out) + + + + diff --git a/scripts/ariba b/scripts/ariba index 43c37a3e..bb6627e3 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -61,11 +61,13 @@ subparser_prepareref = subparsers.add_parser( help='Prepare reference data for input to "run"', usage='ariba prepareref [options] ', description='Prepare reference data for running the pipeline with "ariba run"', - epilog='REQUIRED: -f and -m must each be used at least once', + epilog='REQUIRED: -f/--fasta, and also either -m/--metadata or --all_coding must be used', ) input_group = subparser_prepareref.add_argument_group('input files options') input_group.add_argument('-f', '--fasta', action='append', dest='fasta_files', required=True, help='REQUIRED. Name of fasta file. Can be used more than once if your sequences are spread over more than on file', metavar='FILENAME') -input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', required=True, help='REQUIRED. Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file', metavar='FILENAME') +meta_group = input_group.add_mutually_exclusive_group(required=True) +meta_group.add_argument('-m', '--metadata', action='append', dest='tsv_files', help='Name of tsv file of metadata about the input sequences. Can be used more than once if your metadata is spread over more than one file. Incompatible with --all_coding', metavar='FILENAME') +meta_group.add_argument('--all_coding', choices=['yes', 'no'], help='Use this if you only have a fasta of presence absence sequences as input, and no metadata. Use "yes" if all sequences are coding, or "no" if they are all non-coding. Incompatible with -m/--metadata') cdhit_group = subparser_prepareref.add_argument_group('cd-hit options') cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')