diff --git a/.travis.yml b/.travis.yml index 5aa20414..d7d11b10 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ addons: - libgfortran3 - libncurses5-dev python: -- '3.5' +- '3.6' sudo: false install: - source ./install_dependencies.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 736fc25e..ec0c98ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,9 @@ # Change Log +## [v2.14.0] author schultzm +** Features from schultzm pull request on fork `ncbi` +- Added feature to download NCBI Antimicrobial Resistance Gene Database +- Fixed bug in fermilite path + ## [v2.13.5](https://github.com/sanger-pathogens/ariba/tree/v2.13.5) (2019-03-26) diff --git a/MANIFEST.in b/MANIFEST.in index 395c07b3..e69de29b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +0,0 @@ -recursive-include third_party *.h diff --git a/README.md b/README.md index 17253907..dbd45cf3 100644 --- a/README.md +++ b/README.md @@ -39,15 +39,15 @@ The input is a FASTA file of reference sequences (can be a mix of genes and nonc ## Quick Start Get reference data, for instance from [CARD](https://card.mcmaster.ca/). See [getref](https://github.com/sanger-pathogens/ariba/wiki/Task%3A-getref) for a full list. - ariba getref card out.card + ariba getref ncbi out.ncbi Prepare reference data for ARIBA: - ariba prepareref -f out.card.fa -m out.card.tsv out.card.prepareref + ariba prepareref -f out.ncbi.fa -m out.ncbi.tsv out.ncbi.prepareref Run local assemblies and call variants: - ariba run out.card.prepareref reads1.fastq reads2.fastq out.run + ariba run out.ncbi.prepareref reads1.fastq reads2.fastq out.run Summarise data from several runs: @@ -91,6 +91,10 @@ If the tests all pass, install: python3 setup.py install +Alternatively, install directly from github using: + + pip3 install git+https://github.com/sanger-pathogens/ariba.git #--user + ### Docker ARIBA can be run in a Docker container. First install Docker, then install ARIBA: diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py index a94764e9..1bf95d85 100644 --- a/ariba/ref_genes_getter.py +++ b/ariba/ref_genes_getter.py @@ -20,6 +20,7 @@ class Error (Exception): pass 'vfdb_core', 'vfdb_full', 'virulencefinder', + 'ncbi',#added by schultzm } argannot_ref = '"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n' @@ -461,7 +462,7 @@ def _get_from_vfdb_common(self, outprefix, filename, info_text): @classmethod def _fix_virulencefinder_fasta_file(cls, infile, outfile): '''Some line breaks are missing in the FASTA files from - viruslence finder. Which means there are lines like this: + virulence finder. Which means there are lines like this: AAGATCCAATAACTGAAGATGTTGAACAAACAATTCATAATATTTATGGTCAATATGCTATTTTCGTTGA AGGTGTTGCGCATTTACCTGGACATCTCTCTCCATTATTAAAAAAATTACTACTTAAATCTTTATAA>coa:1:BA000018.3 ATGAAAAAGCAAATAATTTCGCTAGGCGCATTAGCAGTTGCATCTAGCTTATTTACATGGGATAACAAAG @@ -541,5 +542,124 @@ def _get_from_virulencefinder(self, outprefix): print('If you use this downloaded data, please cite:') print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen al 2014, PMID: 24574290\n') + + + def _get_from_ncbi(self, outprefix, test=None): ## author github:schultzm + """ + Download the NCBI-curated Bacterial Antimicrobial Resistance Reference Gene Database. + Uses BioPython to do the data collection and extraction. + Author: schultzm (github) May 31, 2019. + + >>> from Bio import Entrez + >>> import getpass + >>> import socket + >>> BIOPROJECT = "PRJNA313047" + >>> RETMAX = 100 + >>> import getpass + >>> import socket + >>> Entrez.email = getpass.getuser()+'@'+socket.getfqdn() + >>> search_results = Entrez.read(Entrez.esearch(db="nucleotide", + ... term=BIOPROJECT, + ... retmax=RETMAX, + ... usehistory="y", + ... idtype="acc")) + >>> webenv = search_results["WebEnv"] + >>> query_key = search_results["QueryKey"] + >>> target_accn = " NG_061627.1" + >>> records = Entrez.efetch(db="nucleotide", + ... rettype="gbwithparts", + ... retmode="text", + ... retstart=0, + ... retmax=RETMAX, + ... webenv=webenv, + ... query_key=query_key, + ... idtype="acc") + >>> from Bio.Alphabet import generic_dna + >>> from Bio import SeqIO + >>> from Bio.Seq import Seq + >>> from Bio.SeqRecord import SeqRecord + >>> for gb_record in SeqIO.parse(records, "genbank"): + ... if gb_record.id == 'NG_061627.1': + ... gb_record + SeqRecord(seq=Seq('TAATCCTTGGAAACCTTAGAAATTGATGGAGGATCTTAACAAGATCCTGACATA...GGC', IUPACAmbiguousDNA()), id='NG_061627.1', name='NG_061627', description='Klebsiella pneumoniae SCKLB88 mcr-8 gene for phosphoethanolamine--lipid A transferase MCR-8.2, complete CDS', dbxrefs=['BioProject:PRJNA313047']) + + """ + + outprefix = os.path.abspath(outprefix) + final_fasta = outprefix + '.fa' + final_tsv = outprefix + '.tsv' + + # Download the database as genbank using Bio.Entrez + from Bio import Entrez + import getpass + import socket + import sys + BIOPROJECT = "PRJNA313047" ## https://www.ncbi.nlm.nih.gov/bioproject/?term=PRJNA313047 + RETMAX=100000000 + Entrez.email = getpass.getuser()+'@'+socket.getfqdn() + # See section 9.15 Using the history and WebEnv in + # http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec:entrez-webenv + search_results = Entrez.read(Entrez.esearch(db="nucleotide", + term=BIOPROJECT, + retmax=RETMAX, + usehistory="y", + idtype="acc")) + acc_list = search_results["IdList"] + webenv = search_results["WebEnv"] + query_key = search_results["QueryKey"] + if test: + return acc_list + #up to here + if len(acc_list) > 0: + print(f"E-fetching {len(acc_list)} genbank records from BioProject {BIOPROJECT} and writing to. This may take a while.", file=sys.stderr) + records = Entrez.efetch(db="nucleotide", + rettype="gbwithparts", retmode="text", + retstart=0, retmax=RETMAX, + webenv=webenv, query_key=query_key, + idtype="acc") + #pull out the records as fasta from the genbank + from Bio.Alphabet import generic_dna + from Bio import SeqIO + from Bio.Seq import Seq + from Bio.SeqRecord import SeqRecord + + print(f"Parsing genbank records.") + with open(final_fasta, "w") as f_out_fa, \ + open(final_tsv, "w") as f_out_tsv: + for idx, gb_record in enumerate(SeqIO.parse(records, "genbank")): + print(f"'{gb_record.id}'") + n=0 + record_new=[] + for index, feature in enumerate(gb_record.features): + if feature.type == 'CDS': + n+=1 + gb_feature = gb_record.features[index] + id = None + try: + id = gb_feature.qualifiers["allele"] + except: + try: + try: + id = gb_feature.qualifiers["gene"] + except: + id = gb_feature.qualifiers["locus_tag"] + except KeyError: + print(f"gb_feature.qualifer not found", file=sys.stderr) + accession = gb_record.id + seq_out = Seq(str(gb_feature.extract(gb_record.seq)), generic_dna) + record_new.append(SeqRecord(seq_out, + id=f"{id[0]}.{accession}", + description="")) + if len(record_new) == 1: + print(f"Processing record {idx+1} of {len(acc_list)} (accession {accession})", file=sys.stderr) + f_out_fa.write(f"{record_new[0].format('fasta').rstrip()}\n") + f_out_tsv.write(f"{id[0]}.{accession}\t1\t0\t.\t.\t{gb_feature.qualifiers['product'][0]}\n") + if idx == len(acc_list)-1: + print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n') + print('You can use them with ARIBA like this:') + print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n') + + else: + print(f"Nothing to do. Exiting.") def run(self, outprefix): exec('self._get_from_' + self.ref_db + '(outprefix)') diff --git a/ariba/tests/__init__.py b/ariba/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ariba/tests/ncbi_getter_test.py b/ariba/tests/ncbi_getter_test.py new file mode 100644 index 00000000..542d0d67 --- /dev/null +++ b/ariba/tests/ncbi_getter_test.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +import unittest +import os +from ariba.ref_genes_getter import RefGenesGetter + +class TestNcbiGetter(unittest.TestCase): + def setUp(self): + self.ncbi_db = RefGenesGetter('ncbi')._get_from_ncbi('ncbi.test', 'test') + # self.ncbi_db = RefGenesGetter.run('ncbi') + + def test_ncbi(self): + ''' + Test that more than 4000 records have been found on NCBI AMR DB. + ''' + self.assertTrue(len(self.ncbi_db) > 4000) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/scripts/ariba b/scripts/ariba index 404c5d57..38f5dee0 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -62,7 +62,7 @@ subparser_getref = subparsers.add_parser( description='Download reference data from one of a few supported public resources', ) subparser_getref.add_argument('--debug', action='store_true', help='Do not delete temporary downloaded files') -subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Applies to: card, megares, plasmidfinder, resfinder, srst2_argannot, virulencefinder. For plasmid/res/virulencefinder: default is to get latest from bitbucket - supply git commit hash to get a specific version from bitbucket, or use "old " to get from old website. For srst2_argannot: default is latest version r2, use r1 to get the older version') +subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Applies to: card, megares, ncbi, plasmidfinder, resfinder, srst2_argannot, virulencefinder. For plasmid/res/virulencefinder: default is to get latest from bitbucket - supply git commit hash to get a specific version from bitbucket, or use "old " to get from old website. For srst2_argannot: default is latest version r2, use r1 to get the older version') subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name") subparser_getref.add_argument('outprefix', help='Prefix of output filenames') subparser_getref.set_defaults(func=ariba.tasks.getref.run) diff --git a/setup.py b/setup.py index d323aedd..2ffa6bb7 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ setup( ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod], name='ariba', - version='2.13.5', + version='2.14.0', description='ARIBA: Antibiotic Resistance Identification By Assembly', packages = find_packages(), package_data={'ariba': ['test_run_data/*', 'tb_data/*']}, @@ -73,6 +73,7 @@ 'pyfastaq >= 3.12.0', 'pysam >= 0.9.1', 'pymummer<=0.10.3', + 'matplotlib>=3.1.0', ], license='GPLv3', classifiers=[