diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
index 1ae8d393..dbc3c1a2 100644
--- a/ariba/ref_genes_getter.py
+++ b/ariba/ref_genes_getter.py
@@ -12,9 +12,12 @@ class Error (Exception): pass
 
 from ariba import common, card_record, vfdb_parser
 
+argannot_ref = '"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n'
+
+
 class RefGenesGetter:
     def __init__(self, ref_db, genetic_code=11):
-        allowed_ref_dbs = {'card', 'argannot', 'plasmidfinder', 'resfinder','vfdb'}
+        allowed_ref_dbs = {'card', 'argannot', 'plasmidfinder', 'resfinder','srst2_argannot', 'vfdb'}
         if ref_db not in allowed_ref_dbs:
             raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db)
         self.ref_db=ref_db
@@ -234,7 +237,7 @@ def _get_from_argannot(self, outprefix):
         print('You can use them with ARIBA like this:')
         print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
         print('If you use this downloaded data, please cite:')
-        print('"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n')
+        print(argannot_ref)
 
 
     def _get_from_plasmidfinder(self, outprefix):
@@ -289,6 +292,51 @@ def _get_from_plasmidfinder(self, outprefix):
         print('"PlasmidFinder and pMLST: in silico detection and typing of plasmids", Carattoli et al 2014, PMID: 24777092\n')
 
 
+    def _get_from_srst2_argannot(self, outprefix):
+        '''Downloads the ARG-ANNOT sequences that are bundled with SRST2 and
+        converts them for use with ARIBA. Writes outprefix.original.fa (the
+        download), outprefix.fa and outprefix.tsv'''
+        srst2_version = '0.2.0'
+        srst2_url = 'https://github.com/katholt/srst2/raw/v' + srst2_version + '/data/ARGannot.r1.fasta'
+        srst2_fa = outprefix + '.original.fa'
+        command = 'wget -O ' + srst2_fa + ' ' + srst2_url
+        common.syscall(command, verbose=True)
+
+        final_fasta = outprefix + '.fa'
+        final_tsv = outprefix + '.tsv'
+
+        f_out_fa = pyfastaq.utils.open_file_write(final_fasta)
+        f_out_meta = pyfastaq.utils.open_file_write(final_tsv)
+
+        # try/finally so both output files are closed even if the download
+        # cannot be read or a sequence name cannot be parsed
+        try:
+            seq_reader = pyfastaq.sequences.file_reader(srst2_fa)
+            for seq in seq_reader:
+                original_id = seq.id
+                # The name is everything before the first whitespace. It is
+                # split on '__' into 4 fields; the second is the cluster name
+                id_fields = original_id.split(maxsplit=1)
+                name = id_fields[0] if id_fields else ''
+                name_fields = name.split('__')
+                if len(name_fields) != 4:
+                    raise Error('Error getting SRST2 ARG-ANNOT data. Unexpected sequence name: "' + original_id + '"')
+                seq.id = name_fields[1] + '.' + name
+                print(seq, file=f_out_fa)
+                print(seq.id, 1, 0, '.', '.', 'Original name: ' + original_id, sep='\t', file=f_out_meta)
+        finally:
+            pyfastaq.utils.close(f_out_fa)
+            pyfastaq.utils.close(f_out_meta)
+
+        print('Finished downloading and converting data. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
+        print('You can use them with ARIBA like this:')
+        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
+        print('If you use this downloaded data, please cite:')
+        print('"SRST2: Rapid genomic surveillance for public health and hospital microbiology labs",\nInouye et al 2014, Genome Medicine, PMID: 25422674\n')
+        print(argannot_ref)
+        print('and in your methods say that the ARG-ANNOT sequences were used from version', srst2_version, 'of SRST2.')
+
+
     def _get_from_vfdb(self, outprefix):
         outprefix = os.path.abspath(outprefix)
         tmpdir = outprefix + '.tmp.download'
diff --git a/ariba/summary.py b/ariba/summary.py
index f1b767fb..ecf7d304 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -380,7 +380,6 @@ def run(self):
 
         # sanity check same number of columns in headers and matrix
         lengths = {len(x) for x in matrix}
-        print(lengths, len(phandango_header), len(csv_header))
         assert len(lengths) == 1
         assert len(matrix[0]) == len(phandango_header) == len(csv_header)
 
diff --git a/scripts/ariba b/scripts/ariba
index 5140aed5..43c37a3e 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -42,7 +42,7 @@ subparser_flag.set_defaults(func=ariba.tasks.flag.run)
 
 
 #---------------------------- getref ------------------------------------
-allowed_dbs = ['argannot', 'card', 'plasmidfinder', 'resfinder','vfdb']
+allowed_dbs = ['argannot', 'card', 'plasmidfinder', 'resfinder', 'srst2_argannot', 'vfdb']
 subparser_getref = subparsers.add_parser(
     'getref',
     help='Download reference data',
@@ -138,7 +138,8 @@ assembly_group.add_argument('--assembly_cov', type=int, help='Target read covera
 assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs [%(default)s]', default=10, metavar='INT')
 
 other_group = subparser_run.add_argument_group('Other options')
-other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+#other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+other_group.add_argument('--threads', type=int, help=argparse.SUPPRESS, default=1, metavar='INT')
 other_group.add_argument('--assembled_threshold', type=float, help='If proportion of gene assembled (regardless of into how many contigs) is at least this value then the flag gene_assembled is set [%(default)s]', default=0.95, metavar='FLOAT (between 0 and 1)')
 other_group.add_argument('--gene_nt_extend', type=int, help='Max number of nucleotides to extend ends of gene matches to look for start/stop codons [%(default)s]', default=30, metavar='INT')
 other_group.add_argument('--unique_threshold', type=float, help='If proportion of bases in gene assembled more than once is <= this value, then the flag unique_contig is set [%(default)s]', default=0.03, metavar='FLOAT (between 0 and 1)')
@@ -179,7 +180,8 @@ subparser_test = subparsers.add_parser(
     description='Run ARIBA on a small made up built-in test dataset'
 )
 
-subparser_test.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+#subparser_test.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+subparser_test.add_argument('--threads', type=int, help=argparse.SUPPRESS, default=1, metavar='INT')
 subparser_test.add_argument('outdir', help='Name of output directory')
 subparser_test.set_defaults(func=ariba.tasks.test.run)
 