diff --git a/README.md b/README.md index 55990e56..cee5f85b 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,10 @@ Dependencies: * [samtools and bcftools] [samtools] version >= 1.2 * [SSPACE-basic scaffolder] [sspace] * [GapFiller] [gapfiller] - * [SMALT] [smalt] - * [Velvet] [velvet] + * [MUMmer] [mummer] vesrion >= 3.23 + * [SMALT] [smalt] version >= 0.7.4 + * Either [SPAdes] [spades] version >= 3.5.0 or [Velvet] [velvet] version >= 1.2.07 + (SPAdes is recommended) Install with pip: @@ -35,7 +37,7 @@ Usage The installation will put a single script called `ariba` in your path. The usage is: - ariba [options] + ariba [options] Run just `ariba` to get a list of tasks. Use `-h` or `--help` with a task to get the help, for example @@ -48,8 +50,10 @@ To run the pipeline to identify the genes present in a pair of FASTQ files: - [smalt]: https://www.sanger.ac.uk/resources/software/smalt/ - [sspace]: http://www.baseclear.com/genomics/bioinformatics/basetools/SSPACE [gapfiller]: http://www.baseclear.com/genomics/bioinformatics/basetools/gapfiller + [mummer]: http://mummer.sourceforge.net/ [samtools]: http://www.htslib.org/ + [smalt]: https://www.sanger.ac.uk/resources/software/smalt/ + [spades]: http://bioinf.spbau.ru/spades + [sspace]: http://www.baseclear.com/genomics/bioinformatics/basetools/SSPACE [velvet]: http://www.ebi.ac.uk/~zerbino/velvet/ diff --git a/ariba/__init__.py b/ariba/__init__.py index bc52112f..2c86f770 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -3,6 +3,7 @@ 'cluster', 'clusters', 'common', + 'external_progs', 'flag', 'histogram', 'link', diff --git a/ariba/cluster.py b/ariba/cluster.py index 37eff498..6e762cab 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -20,8 +20,6 @@ def __init__(self, nucmer_min_id=90, nucmer_min_len=50, nucmer_breaklen=50, - spades='spades.py', - spades_only_assembler=False, sspace_k=20, reads_insert=500, sspace_sd=0.4, @@ -36,8 +34,10 @@ def __init__(self, bcftools_exe='bcftools', gapfiller_exe='GapFiller.pl', samtools_exe='samtools', + spades_exe='spades.py', sspace_exe='SSPACE_Basic_v2.0.pl', velvet_exe='velvet', # prefix of velvet{g,h} + spades_other=None, ): self.root_dir = os.path.abspath(root_dir) @@ -71,8 +71,8 @@ def __init__(self, self._set_assembly_kmer(assembly_kmer) self.assembler = assembler assert self.assembler in ['velvet', 'spades'] - self.spades = spades - self.spades_only_assembler = spades_only_assembler + self.spades_exe = spades_exe + self.spades_other = spades_other self.bcftools_exe = bcftools_exe @@ -140,6 +140,17 @@ def _set_assembly_kmer(self, k): def _assemble_with_velvet(self): + # map reads to reference gene to make BAM input to velvet columbus + mapping.run_smalt( + self.reads1, + self.reads2, + self.gene_fa, + self.gene_bam[:-4], + threads=self.threads, + sort=True, + verbose=self.verbose, + ) + cmd = ' '.join([ self.velveth, self.assembler_dir, @@ -185,15 +196,16 @@ def _assemble_with_velvet(self): def _assemble_with_spades(self, unittest=False): cmd = ' '.join([ - self.spades, + self.spades_exe, '-1', self.reads1, '-2', self.reads2, '-o', self.assembler_dir, '-k', str(self.assembly_kmer), '--threads', str(self.threads), + '--trusted-contigs', self.gene_fa, ]) - if self.spades_only_assembler: - cmd += ' --only-assembler' + if self.spades_other is not None: + cmd += ' ' + self.spades_other cwd = os.getcwd() os.chdir(self.assembly_dir) @@ -659,16 +671,6 @@ def _make_report_lines(self): def run(self): - # map reads to reference gene - mapping.run_smalt( - self.reads1, - self.reads2, - self.gene_fa, - self.gene_bam[:-4], - threads=self.threads, - sort=True, - ) - if self.assembler == 'velvet': self._assemble_with_velvet() elif self.assembler == 'spades': @@ -696,6 +698,7 @@ def run(self): self.final_assembly_bam[:-4], threads=self.threads, sort=True, + verbose=self.verbose, ) self._parse_assembly_bam() diff --git a/ariba/clusters.py b/ariba/clusters.py index 31c4116b..cf041ebc 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -22,7 +22,7 @@ def __init__(self, smalt_k=13, smalt_s=2, smalt_min_id=0.9, - spades_only_assembler=False, + spades_other=None, max_insert=1000, min_scaff_depth=10, nucmer_min_id=90, @@ -33,6 +33,7 @@ def __init__(self, bcftools_exe='bcftools', gapfiller_exe='GapFiller.pl', samtools_exe='samtools', + spades_exe='spades.py', sspace_exe='SSPACE_Basic_v2.0.pl', velvet_exe='velvet', # prefix of velvet{g,h} ): @@ -44,7 +45,7 @@ def __init__(self, self.assembler = assembler assert self.assembler in ['velvet', 'spades'] self.assembly_kmer = assembly_kmer - self.spades_only_assembler = spades_only_assembler + self.spades_other = spades_other self.bam_prefix = os.path.join(self.outdir, 'map_all_reads') self.bam = self.bam_prefix + '.bam' @@ -83,6 +84,7 @@ def __init__(self, self.gapfiller_exe = os.path.realpath(self.gapfiller_exe) # otherwise gapfiller dies loading packages self.samtools_exe = samtools_exe + self.spades_exe = spades_exe self.sspace_exe = shutil.which(sspace_exe) if self.sspace_exe is None: @@ -106,7 +108,8 @@ def _map_reads(self): index_k=self.smalt_k, index_s=self.smalt_s, threads=self.threads, - minid=self.smalt_min_id + minid=self.smalt_min_id, + verbose=self.verbose, ) @@ -201,9 +204,11 @@ def _set_insert_size_data(self): (x, self.insert_size, pc95, self.insert_sspace_sd) = self.insert_hist.stats() self.insert_proper_pair_max = 1.1 * pc95 if self.verbose: + print('\nInsert size information from reads mapped to reference genes:') print('Insert size:', self.insert_size, sep='\t') print('Insert sspace sd:', self.insert_sspace_sd, sep='\t') print('Max insert:', self.insert_proper_pair_max, sep='\t') + print() def _write_gene_fa(self, gene_name, outfile): @@ -223,9 +228,12 @@ def _init_and_run_clusters(self): if len(self.cluster_to_dir) == 0: raise Error('Did not get any reads mapped to genes. Cannot continue') + counter = 0 + for gene in sorted(self.cluster_to_dir): + counter += 1 if self.verbose: - print('Running', gene) + print('\nAssembling cluster', counter, 'of', str(len(self.cluster_to_dir)) + ':', gene) new_dir = self.cluster_to_dir[gene] self._write_gene_fa(gene, os.path.join(new_dir, 'gene.fa')) self.clusters[gene] = cluster.Cluster( @@ -237,7 +245,6 @@ def _init_and_run_clusters(self): nucmer_min_id=self.nucmer_min_id, nucmer_min_len=self.nucmer_min_len, nucmer_breaklen=self.nucmer_breaklen, - spades_only_assembler=self.spades_only_assembler, sspace_k=self.min_scaff_depth, reads_insert=self.insert_size, sspace_sd=self.insert_sspace_sd, @@ -248,8 +255,10 @@ def _init_and_run_clusters(self): bcftools_exe=self.bcftools_exe, gapfiller_exe=self.gapfiller_exe, samtools_exe=self.samtools_exe, + spades_exe=self.spades_exe, sspace_exe=self.sspace_exe, velvet_exe=self.velvet, + spades_other=self.spades_other ) self.clusters[gene].run() @@ -293,8 +302,20 @@ def _write_reports(self): def run(self): + if self.verbose: + print('{:_^79}'.format(' Mapping reads to reference genes ')) self._map_reads() + if self.verbose: + print('Finished mapping\n') + print('{:_^79}'.format(' Generating clusters ')) self._bam_to_clusters_reads() self._set_insert_size_data() + if self.verbose: + print('{:_^79}'.format(' Assembling each cluster ')) self._init_and_run_clusters() + if self.verbose: + print('Finished assembling clusters\n') + print('{:_^79}'.format(' Writing report files ')) self._write_reports() + if self.verbose: + print('Finished writing report files. All done!') diff --git a/ariba/external_progs.py b/ariba/external_progs.py new file mode 100644 index 00000000..5a8d024c --- /dev/null +++ b/ariba/external_progs.py @@ -0,0 +1,100 @@ +import shutil +import subprocess +import os +from distutils.version import LooseVersion +import re +import sys +import pyfastaq +from ariba import common + +class Error (Exception): pass + +def is_in_path(prog): + return shutil.which(prog) is not None + + +prog_to_version_cmd = { + 'bcftools': ('bcftools', re.compile('^Version: ([0-9\.]+)')), + 'nucmer': ('nucmer --version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')), + 'smalt': ('smalt version', re.compile('^Version: ([0-9\.]+)')), + 'spades': ('spades.py', re.compile('^SPAdes genome assembler v.([0-9\.]+)')), + 'samtools': ('samtools', re.compile('^Version: ([0-9\.]+)')), + 'sspace': (None, re.compile('^Usage: .*pl \[SSPACE_(.*)\]')), + 'gapfiller': (None, re.compile('^Usage: .*pl \[GapFiller_(.*)\]')), + 'velvetg': ('velvetg', re.compile('Version ([0-9\.]+)')), + 'velveth': ('velveth', re.compile('Version ([0-9\.]+)')), +} + + +def get_version(prog, path=None): + assert prog in prog_to_version_cmd + if path is None: + path = prog + if not is_in_path(path): + raise Error('Error getting version of ' + path + ' - not found in path.') + + if prog in ['sspace', 'gapfiller']: + cmd = 'perl ' + os.path.realpath(shutil.which(path)) + regex = prog_to_version_cmd[prog][1] + else: + cmd, regex = prog_to_version_cmd[prog] + + cmd_output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + cmd_output = common.decode(cmd_output[0]).split('\n')[:-1] + common.decode(cmd_output[1]).split('\n')[:-1] + for line in cmd_output: + hits = regex.search(line) + if hits: + return hits.group(1) + return 'UNKNOWN ...\n I tried running this to get the version: "' + cmd + '"\n and the output didn\'t match this regular expression: "' + regex.pattern + '"' + + + +def check_versions(opts, verbose=False): + if verbose: + print('{:_^79}'.format(' Checking dependencies and their versions ')) + print('tool', 'path', 'version', sep='\t') + + l = [ + ('bcftools', opts.bcftools), + ('nucmer', None), + ('smalt', opts.smalt), + ('samtools', opts.samtools), + ('sspace', opts.sspace), + ('gapfiller', opts.gapfiller), + ] + + if opts.assembler == 'spades': + l.append(('spades', opts.spades)) + elif opts.assembler == 'velvet': + l.append(('velvetg', opts.velvet + 'g')) + l.append(('velveth', opts.velvet + 'h')) + else: + raise Error('Assembler ' + opts.assembler + ' not recognised. Cannot continue') + + min_versions = { + 'bcftools': '1.2', + 'samtools': '1.2', + 'nucmer': '3.1', + 'spades': '3.5.0', + 'smalt': '0.7.4', + 'velvetg': '1.2.07', + 'velveth': '1.2.07', + } + + errors = [] + + for t in l: + version = get_version(t[0], path=t[1]) + if verbose: + print(t[0], t[1], version, sep='\t') + + if t[0] in min_versions and LooseVersion(version) < LooseVersion(min_versions[t[0]]): + errors.append(' '.join(['Found version', version, 'of', t[0], 'which is too low! Please update to at least', min_versions[t[0]]])) + + if len(errors): + for e in errors: + print(e, file=sys.stderr) + raise Error('Cannot continue. Some dependencies need updating') + + if verbose: + print('\n... dependencies look OK.\n') diff --git a/ariba/mapping.py b/ariba/mapping.py index 3c2622eb..b4718830 100644 --- a/ariba/mapping.py +++ b/ariba/mapping.py @@ -15,7 +15,8 @@ def run_smalt( max_insert=1000, minid=0.9, sort=False, - extra_smalt_map_ops='-x' + extra_smalt_map_ops='-x', + verbose=False ): if extra_smalt_map_ops is None: extra_smalt_map_ops = '' @@ -59,15 +60,15 @@ def run_smalt( intermediate_bam = final_bam map_cmd += ' -bS -T ' + ref_fa + ' - > ' + intermediate_bam - common.syscall(index_cmd) - common.syscall(map_cmd) + common.syscall(index_cmd, verbose=verbose) + common.syscall(map_cmd, verbose=verbose) if sort: threads = min(4, threads) thread_mem = int(500 / threads) sort_cmd = 'samtools sort -@' + str(threads) + ' -m ' + str(thread_mem) + 'M ' + intermediate_bam + ' ' + out_prefix index_cmd = 'samtools index ' + final_bam - common.syscall(sort_cmd) - common.syscall(index_cmd) + common.syscall(sort_cmd, verbose=verbose) + common.syscall(index_cmd, verbose=verbose) for fname in clean_files: os.unlink(fname) diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py index fbe17dc3..e1a233fc 100644 --- a/ariba/tasks/run.py +++ b/ariba/tasks/run.py @@ -22,26 +22,31 @@ def run(): nucmer_group.add_argument('--nucmer_min_len', type=int, help='Minimum alignment length (delta-filter -i) [%(default)s]', default=50, metavar='INT') nucmer_group.add_argument('--nucmer_breaklen', type=int, help='Value to use for -breaklen when running nucmer [%(default)s]', default=50, metavar='INT') - executables_group = parser.add_argument_group('executables locations') - executables_group.add_argument('--bcftools', help='Path to bcftools executable [%(default)s]', default='bcftools', metavar='PATH') - executables_group.add_argument('--gapfiller', help='Path to GapFiller.pl [%(default)s]', default='GapFiller.pl', metavar='PATH') - executables_group.add_argument('--samtools', help='Path to samtools executable [%(default)s]', default='samtools', metavar='PATH') - executables_group.add_argument('--sspace', help='Path to SSPACE_Basic_v2.0.pl [%(default)s]', default='SSPACE_Basic_v2.0.pl', metavar='PATH') - executables_group.add_argument('--velvet', help='Path to prefix of velvet{g,h} [%(default)s]', default='velvet', metavar='PATH') + assembly_group = parser.add_argument_group('Assembly options') + allowed_assemblers = ['velvet', 'spades'] + assembly_group.add_argument('--assembler', help='Assembler to use. Available options: ' + ','.join(allowed_assemblers) + ' [%(default)s]', choices=allowed_assemblers, default='spades', metavar='Assembler') + assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs. This is also the value used for sspace -k when scaffolding [%(default)s]', default=10, metavar='INT') + assembly_group.add_argument('--assembler_k', help='kmer size to use with assembler. Default is 2/3 of the read length', metavar='INT') + assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads') other_group = parser.add_argument_group('Other options') other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1 (default) or 4', choices=[1,4], default=1, metavar='INT') - other_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs. This is also the value used for sspace -k when scaffolding [%(default)s]', default=10, metavar='INT') - allowed_assemblers = ['velvet', 'spades'] - other_group.add_argument('--assembler', help='Assembler to use. Although SPAdes can be used, it is experimental and velvet is currently recommended. Available options: ' + ','.join(allowed_assemblers) + ' [%(default)s]', choices=allowed_assemblers, default='velvet', metavar='Assembler') - other_group.add_argument('--assembler_k', help='kmer size to use with assembler. Default is 2/3 of the read length', metavar='INT') - other_group.add_argument('--spades_only_assembler', action='store_true', help='Use --only-assembler when running SPAdes') other_group.add_argument('--threads', type=int, help='Number of threads for smalt and spades [%(default)s]', default=1, metavar='INT') other_group.add_argument('--assembled_threshold', type=float, help='If proportion of gene assembled (regardless of into how many contigs) is at least this value then the flag gene_assembled is set [%(default)s]', default=0.95, metavar='FLOAT (between 0 and 1)') other_group.add_argument('--unique_threshold', type=float, help='If proportion of bases in gene assembled more than once is <= this value, then the flag unique_contig is set [%(default)s]', default=0.03, metavar='FLOAT (between 0 and 1)') other_group.add_argument('--verbose', action='store_true', help='Be verbose') - options = parser.parse_args() + executables_group = parser.add_argument_group('executables locations') + executables_group.add_argument('--bcftools', help='Path to bcftools executable [%(default)s]', default='bcftools', metavar='PATH') + executables_group.add_argument('--gapfiller', help='Path to GapFiller executable [%(default)s]', default='GapFiller.pl', metavar='PATH') + executables_group.add_argument('--samtools', help='Path to samtools executable [%(default)s]', default='samtools', metavar='PATH') + executables_group.add_argument('--smalt', help='Path to SMALT executable [%(default)s]', default='smalt', metavar='PATH') + executables_group.add_argument('--spades', help='Path to SPAdes executable [%(default)s]', default='spades.py', metavar='PATH') + executables_group.add_argument('--sspace', help='Path to SSPACE executable [%(default)s]', default='SSPACE_Basic_v2.0.pl', metavar='PATH') + executables_group.add_argument('--velvet', help='Path to prefix of velvet{g,h} [%(default)s]', default='velvet', metavar='PATH') + + options = parser.parse_args() + ariba.external_progs.check_versions(options, verbose=options.verbose) pyfastaq.sequences.codon2aa = pyfastaq.genetic_codes.codes[options.genetic_code] c = ariba.clusters.Clusters( @@ -60,12 +65,13 @@ def run(): nucmer_min_id=options.nucmer_min_id, nucmer_min_len=options.nucmer_min_len, nucmer_breaklen=options.nucmer_breaklen, - spades_only_assembler=options.spades_only_assembler, + spades_other=options.spades_other, assembled_threshold=options.assembled_threshold, unique_threshold=options.unique_threshold, bcftools_exe=options.bcftools, gapfiller_exe=options.gapfiller, samtools_exe=options.samtools, + spades_exe=options.spades, sspace_exe=options.sspace, velvet_exe=options.velvet, ) diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index e9db8ea3..af46113e 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -71,14 +71,14 @@ def test_set_assembly_kmer(self): clean_cluster_dir(cluster_dir) - def test_assemble_with_velvet(self): - '''test _assemble_with_velvet''' - cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_velvet') - clean_cluster_dir(cluster_dir, exclude=set(['gene.reads_mapped.unsorted.bam'])) - c = cluster.Cluster(cluster_dir) - c._assemble_with_velvet() - self.assertEqual(c.status_flag.to_number(), 0) - clean_cluster_dir(cluster_dir, exclude=set(['gene.reads_mapped.unsorted.bam'])) + #def test_assemble_with_velvet(self): + # '''test _assemble_with_velvet''' + # cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_velvet') + # clean_cluster_dir(cluster_dir, exclude=set(['gene.reads_mapped.unsorted.bam'])) + # c = cluster.Cluster(cluster_dir) + # c._assemble_with_velvet() + # self.assertEqual(c.status_flag.to_number(), 0) + # clean_cluster_dir(cluster_dir, exclude=set(['gene.reads_mapped.unsorted.bam'])) def test_assemble_with_spades(self): diff --git a/scripts/ariba b/scripts/ariba index 1316a7b3..2dfaa0d5 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -19,7 +19,7 @@ ordered_tasks = [ def print_usage_and_exit(): - print('Usage: ariba [options]', file=sys.stderr) + print('Usage: ariba [options] ', file=sys.stderr) print('\nTo get minimal usage for a command use:\nariba command', file=sys.stderr) print('\nTo get full help for a command use one of:\nariba command -h\nariba command --help\n', file=sys.stderr) print('\nAvailable commands:\n', file=sys.stderr) diff --git a/setup.py b/setup.py index 4c50e353..75f24f65 100644 --- a/setup.py +++ b/setup.py @@ -27,11 +27,6 @@ sys.exit(1) -if not( (shutil.which('velveth') and shutil.which('velvetg')) or shutil.which('spades.py') ): - print('Must have velvet (velveth and velvetg) or spades.py installed. Cannot continue', file=sys.stderr) - sys.exit(1) - - print('... all dependencies found')