From b8d4d682f171ae31aa155e082abf748bae95de24 Mon Sep 17 00:00:00 2001
From: Martin Hunt
Date: Fri, 12 Aug 2016 13:37:51 +0100
Subject: [PATCH] Add option to choose download version

---
Review notes (commentary only; this section is ignored by git am):

 * Layout: this patch was recovered from a copy whose newlines and
   indentation had been collapsed. Blank context lines were placed so that
   every hunk's old/new line counts match its header. Indentation assumes
   4-space Python with method bodies at 8 spaces -- TODO confirm against
   blobs dbc3c1a2, d83e028c and 696a00a9 before applying.
 * _get_from_card: a non-numeric --version (e.g. "latest") reaches int()
   and raises ValueError, which is not converted to this module's Error.
 * _get_card_versions: tmp_file is only unlinked on the success path; the
   "Cannot continue" raise leaves it on disk.
 * _get_from_card: the bare "except:" kept as context also catches
   KeyboardInterrupt/SystemExit; "except OSError:" looks like the intent.
 * _get_card_versions uses re.compile but no import is added here;
   presumably ref_genes_getter.py already imports re -- verify.
 * --version is accepted for every db but only read on the card path (the
   help text says so); other databases ignore it silently.

 ariba/ref_genes_getter.py | 42 ++++++++++++++++++++++++++++++++++++++----
 ariba/tasks/getref.py     |  6 +++++-
 scripts/ariba             |  1 +
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
index dbc3c1a2..56f31c3d 100644
--- a/ariba/ref_genes_getter.py
+++ b/ariba/ref_genes_getter.py
@@ -16,7 +16,7 @@ class Error (Exception): pass
 
 
 class RefGenesGetter:
-    def __init__(self, ref_db, genetic_code=11):
+    def __init__(self, ref_db, genetic_code=11, version=None):
         allowed_ref_dbs = {'card', 'argannot', 'plasmidfinder', 'resfinder','srst2_argannot', 'vfdb'}
         if ref_db not in allowed_ref_dbs:
             raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db)
@@ -24,6 +24,7 @@ def __init__(self, ref_db, genetic_code=11):
         self.genetic_code = genetic_code
         self.max_download_attempts = 3
         self.sleep_time = 2
+        self.version = version
         pyfastaq.sequences.genetic_code = self.genetic_code
 
 
@@ -41,6 +42,30 @@ def _download_file(self, url, outfile):
         print(' done', flush=True)
 
 
+    def _get_card_versions(self, tmp_file):
+        print('Getting available CARD versions')
+        self._download_file('https://card.mcmaster.ca/download', tmp_file)
+        p = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.gz)"''')
+        versions = {}
+
+        with open(tmp_file) as f:
+            for line in f:
+                got = p.findall(line)
+                for match in got:
+                    key = tuple([int(x) for x in match[1].split('.')])
+                    versions[key] = 'https://card.mcmaster.ca' + match[0]
+
+        if len(versions) == 0:
+            raise Error('Error getting CARD versions. Cannot continue')
+
+        print('Found versions:')
+
+        for key, url in sorted(versions.items()):
+            print('.'.join([str(x) for x in key]), url, sep='\t')
+
+        os.unlink(tmp_file)
+        return versions
+
     def _get_from_card(self, outprefix):
         outprefix = os.path.abspath(outprefix)
 
@@ -53,8 +78,17 @@ def _get_from_card(self, outprefix):
         except:
             raise Error('Error mkdir/chdir ' + tmpdir)
 
-        card_version = '1.0.9'
-        card_tarball_url = 'https://card.mcmaster.ca/download/0/broadstreet-v' + card_version + '.tar.gz'
+        versions = self._get_card_versions('download.html')
+        if self.version is not None:
+            key = tuple([int(x) for x in self.version.split('.')])
+            if key not in versions:
+                raise Error('Error! Did not find requested version ' + self.version)
+        else:
+            key = sorted(list(versions.keys()))[-1]
+            self.version = '.'.join([str(x) for x in key])
+
+        print('Getting version', self.version)
+        card_tarball_url = versions[key]
         card_tarball = 'card.tar.gz'
         print('Working in temporary directory', tmpdir)
         print('Downloading data from card:', card_tarball_url, flush=True)
@@ -149,7 +183,7 @@ def _get_from_card(self, outprefix):
         print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
         print('If you use this downloaded data, please cite:')
         print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175')
-        print('and in your methods say that version', card_version, 'of the database was used')
+        print('and in your methods say that version', self.version, 'of the database was used')
 
 
     def _get_from_resfinder(self, outprefix):
diff --git a/ariba/tasks/getref.py b/ariba/tasks/getref.py
index d83e028c..a9292c14 100644
--- a/ariba/tasks/getref.py
+++ b/ariba/tasks/getref.py
@@ -3,6 +3,10 @@
 
 
 def run(options):
-    getter = ref_genes_getter.RefGenesGetter(options.db, genetic_code=options.genetic_code)
+    getter = ref_genes_getter.RefGenesGetter(
+        options.db,
+        genetic_code=options.genetic_code,
+        version=options.version
+    )
     getter.run(options.outprefix)
 
diff --git a/scripts/ariba b/scripts/ariba
index 696a00a9..d11a1ef0 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -50,6 +50,7 @@ subparser_getref = subparsers.add_parser(
     description='Download reference data from one of a few supported public resources',
 )
 subparser_getref.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+subparser_getref.add_argument('--version', help='Version of reference data to download. If not used, gets the latest version. Only applies to card')
 subparser_getref.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs, metavar="DB name")
 subparser_getref.add_argument('outprefix', help='Prefix of output filenames')
 subparser_getref.set_defaults(func=ariba.tasks.getref.run)