sanger-pathogens · kpepper · Jun 4, 2019 · Sep 11, 2018 · Sep 11, 2018 · Sep 13, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -8,7 +8,7 @@ addons:
     - libgfortran3
     - libncurses5-dev
 python:
-- '3.5'
+- '3.6'
 sudo: false
 install:
 - source ./install_dependencies.sh

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,9 @@
 # Change Log
+## [v2.14.0] author schultzm
+** Features from schultzm pull request on fork `ncbi`  
+- Added feature to download NCBI Antimicrobial Resistance Gene Database  
+- Fixed bug in fermilite path  
+
 
 ## [v2.13.5](https://github.com/sanger-pathogens/ariba/tree/v2.13.5) (2019-03-26)
 

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +0,0 @@
-recursive-include third_party *.h

diff --git a/README.md b/README.md
@@ -39,15 +39,15 @@ The input is a FASTA file of reference sequences (can be a mix of genes and nonc
 ## Quick Start
 Get reference data, for instance from [CARD](https://card.mcmaster.ca/). See [getref](https://github.com/sanger-pathogens/ariba/wiki/Task%3A-getref) for a full list.
 
-    ariba getref card out.card
+    ariba getref ncbi out.ncbi
 
 Prepare reference data for ARIBA:
 
-    ariba prepareref -f out.card.fa -m out.card.tsv out.card.prepareref
+    ariba prepareref -f out.ncbi.fa -m out.ncbi.tsv out.ncbi.prepareref
 
 Run local assemblies and call variants:
 
-    ariba run out.card.prepareref reads1.fastq reads2.fastq out.run
+    ariba run out.ncbi.prepareref reads1.fastq reads2.fastq out.run
 
 Summarise data from several runs:
 
@@ -91,6 +91,10 @@ If the tests all pass, install:
 
     python3 setup.py install
 
+Alternatively, install directly from github using:
+
+    pip3 install git+https://github.com/sanger-pathogens/ariba.git #--user
+
 ### Docker
 ARIBA can be run in a Docker container. First install Docker, then install ARIBA:
 

diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
@@ -20,6 +20,7 @@ class Error (Exception): pass
     'vfdb_core',
     'vfdb_full',
     'virulencefinder',
+    'ncbi',#added by schultzm
 }
 
 argannot_ref = '"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n'
@@ -461,7 +462,7 @@ def _get_from_vfdb_common(self, outprefix, filename, info_text):
     @classmethod
     def _fix_virulencefinder_fasta_file(cls, infile, outfile):
         '''Some line breaks are missing in the FASTA files from
-        viruslence finder. Which means there are lines like this:
+        virulence finder. Which means there are lines like this:
         AAGATCCAATAACTGAAGATGTTGAACAAACAATTCATAATATTTATGGTCAATATGCTATTTTCGTTGA
         AGGTGTTGCGCATTTACCTGGACATCTCTCTCCATTATTAAAAAAATTACTACTTAAATCTTTATAA>coa:1:BA000018.3
         ATGAAAAAGCAAATAATTTCGCTAGGCGCATTAGCAGTTGCATCTAGCTTATTTACATGGGATAACAAAG
@@ -541,5 +542,124 @@ def _get_from_virulencefinder(self, outprefix):
         print('If you use this downloaded data, please cite:')
         print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen al 2014, PMID: 24574290\n')
 
+
+
+    def _get_from_ncbi(self, outprefix, test=None): ## author github:schultzm
+        """
+        Download the NCBI-curated Bacterial Antimicrobial Resistance Reference Gene Database.
+        Uses BioPython to do the data collection and extraction.
+        Author: schultzm (github) May 31, 2019.
+
+        >>> from Bio import Entrez
+        >>> import getpass
+        >>> import socket
+        >>> BIOPROJECT = "PRJNA313047"
+        >>> RETMAX = 100
+        >>> import getpass
+        >>> import socket
+        >>> Entrez.email = getpass.getuser()+'@'+socket.getfqdn()
+        >>> search_results = Entrez.read(Entrez.esearch(db="nucleotide",
+        ...                                             term=BIOPROJECT,
+        ...                                             retmax=RETMAX,
+        ...                                             usehistory="y",
+        ...                                             idtype="acc"))
+        >>> webenv = search_results["WebEnv"]
+        >>> query_key = search_results["QueryKey"]
+        >>> target_accn = "    NG_061627.1"
+        >>> records = Entrez.efetch(db="nucleotide",
+        ...                         rettype="gbwithparts",
+        ...                         retmode="text",
+        ...                         retstart=0,
+        ...                         retmax=RETMAX,
+        ...                         webenv=webenv,
+        ...                         query_key=query_key,
+        ...                         idtype="acc")
+        >>> from Bio.Alphabet import generic_dna
+        >>> from Bio import SeqIO
+        >>> from Bio.Seq import Seq
+        >>> from Bio.SeqRecord import SeqRecord
+        >>> for gb_record in SeqIO.parse(records, "genbank"):
+        ...     if gb_record.id == 'NG_061627.1':
+        ...         gb_record
+        SeqRecord(seq=Seq('TAATCCTTGGAAACCTTAGAAATTGATGGAGGATCTTAACAAGATCCTGACATA...GGC', IUPACAmbiguousDNA()), id='NG_061627.1', name='NG_061627', description='Klebsiella pneumoniae SCKLB88 mcr-8 gene for phosphoethanolamine--lipid A transferase MCR-8.2, complete CDS', dbxrefs=['BioProject:PRJNA313047'])
+
+        """
+
+        outprefix = os.path.abspath(outprefix)
+        final_fasta = outprefix + '.fa'
+        final_tsv = outprefix + '.tsv'
+
+        # Download the database as genbank using Bio.Entrez
+        from Bio import Entrez
+        import getpass
+        import socket
+        import sys
+        BIOPROJECT = "PRJNA313047" ## https://www.ncbi.nlm.nih.gov/bioproject/?term=PRJNA313047
+        RETMAX=100000000
+        Entrez.email = getpass.getuser()+'@'+socket.getfqdn()
+        # See section 9.15  Using the history and WebEnv in
+        # http://biopython.org/DIST/docs/tutorial/Tutorial.html#sec:entrez-webenv
+        search_results = Entrez.read(Entrez.esearch(db="nucleotide",
+                                                    term=BIOPROJECT,
+                                                    retmax=RETMAX,
+                                                    usehistory="y",
+                                                    idtype="acc"))
+        acc_list = search_results["IdList"]
+        webenv = search_results["WebEnv"]
+        query_key = search_results["QueryKey"]
+        if test:
+            return acc_list
+            #up to here
+        if len(acc_list) > 0:
+            print(f"E-fetching {len(acc_list)} genbank records from BioProject {BIOPROJECT} and writing to.  This may take a while.", file=sys.stderr)
+            records = Entrez.efetch(db="nucleotide",
+                                               rettype="gbwithparts", retmode="text",
+                                               retstart=0, retmax=RETMAX,
+                                               webenv=webenv, query_key=query_key,
+                                               idtype="acc")
+            #pull out the records as fasta from the genbank
+            from Bio.Alphabet import generic_dna
+            from Bio import SeqIO
+            from Bio.Seq import Seq
+            from Bio.SeqRecord import SeqRecord
+
+            print(f"Parsing genbank records.")
+            with open(final_fasta, "w") as f_out_fa, \
+                 open(final_tsv, "w") as f_out_tsv:
+                for idx, gb_record in enumerate(SeqIO.parse(records, "genbank")):
+                    print(f"'{gb_record.id}'")
+                    n=0
+                    record_new=[]
+                    for index, feature in enumerate(gb_record.features):
+                        if feature.type == 'CDS':
+                            n+=1
+                            gb_feature = gb_record.features[index]
+                            id = None
+                            try:
+                                id = gb_feature.qualifiers["allele"]
+                            except:
+                                try:
+                                    try:
+                                        id = gb_feature.qualifiers["gene"]
+                                    except:
+                                        id = gb_feature.qualifiers["locus_tag"]
+                                except KeyError:
+                                    print(f"gb_feature.qualifer not found", file=sys.stderr)
+                            accession = gb_record.id
+                            seq_out = Seq(str(gb_feature.extract(gb_record.seq)), generic_dna)
+                            record_new.append(SeqRecord(seq_out,
+                                         id=f"{id[0]}.{accession}",
+                                         description=""))
+                    if len(record_new) == 1:
+                        print(f"Processing record {idx+1} of {len(acc_list)} (accession {accession})", file=sys.stderr)
+                        f_out_fa.write(f"{record_new[0].format('fasta').rstrip()}\n")
+                        f_out_tsv.write(f"{id[0]}.{accession}\t1\t0\t.\t.\t{gb_feature.qualifiers['product'][0]}\n")
+                    if idx == len(acc_list)-1:
+                        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
+                        print('You can use them with ARIBA like this:')
+                        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
+
+        else:
+            print(f"Nothing to do. Exiting.")    
     def run(self, outprefix):
         exec('self._get_from_' + self.ref_db + '(outprefix)')
diff --git a/ariba/tests/__init__.py b/ariba/tests/__init__.py
diff --git a/ariba/tests/ncbi_getter_test.py b/ariba/tests/ncbi_getter_test.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+import unittest
+import os
+from ariba.ref_genes_getter import RefGenesGetter
+
+class TestNcbiGetter(unittest.TestCase):
+    def setUp(self):
+        self.ncbi_db = RefGenesGetter('ncbi')._get_from_ncbi('ncbi.test', 'test')
+        # self.ncbi_db = RefGenesGetter.run('ncbi')
+
+    def test_ncbi(self):
+        '''
+        Test that more than 4000 records have been found on NCBI AMR DB.
+        '''
+        self.assertTrue(len(self.ncbi_db) > 4000)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/scripts/ariba b/scripts/ariba
@@ -62,7 +62,7 @@ subparser_getref = subparsers.add_parser(
     description='Download reference data from one of a few supported public resources',
 )
 subparser_getref.add_argument('--debug', action='store_true', help='Do not delete temporary downloaded files')
-subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Applies to: card, megares, plasmidfinder, resfinder, srst2_argannot, virulencefinder. For plasmid/res/virulencefinder: default is to get latest from bitbucket - supply git commit hash to get a specific version from bitbucket, or use "old " to get from old website. For srst2_argannot: default is latest version r2, use r1 to get the older version')
+subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Applies to: card, megares, ncbi, plasmidfinder, resfinder, srst2_argannot, virulencefinder. For plasmid/res/virulencefinder: default is to get latest from bitbucket - supply git commit hash to get a specific version from bitbucket, or use "old " to get from old website. For srst2_argannot: default is latest version r2, use r1 to get the older version')
 subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name")
 subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
 subparser_getref.set_defaults(func=ariba.tasks.getref.run)

diff --git a/setup.py b/setup.py
@@ -55,7 +55,7 @@
 setup(
     ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
     name='ariba',
-    version='2.13.5',
+    version='2.14.0',
     description='ARIBA: Antibiotic Resistance Identification By Assembly',
     packages = find_packages(),
     package_data={'ariba': ['test_run_data/*', 'tb_data/*']},
@@ -73,6 +73,7 @@
         'pyfastaq >= 3.12.0',
         'pysam >= 0.9.1',
         'pymummer<=0.10.3',
+        'matplotlib>=3.1.0',
     ],
     license='GPLv3',
     classifiers=[