Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use spades #3

Merged
merged 10 commits into from
Feb 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ Dependencies:
* [samtools and bcftools] [samtools] version >= 1.2
* [SSPACE-basic scaffolder] [sspace]
* [GapFiller] [gapfiller]
* [SMALT] [smalt]
* [Velvet] [velvet]
* [MUMmer] [mummer] vesrion >= 3.23
* [SMALT] [smalt] version >= 0.7.4
* Either [SPAdes] [spades] version >= 3.5.0 or [Velvet] [velvet] version >= 1.2.07
(SPAdes is recommended)

Install with pip:

Expand All @@ -35,7 +37,7 @@ Usage
The installation will put a single script called `ariba` in your path.
The usage is:

ariba <command> [options]
ariba <command> [options] <required arguments>

Run just `ariba` to get a list of tasks. Use `-h` or `--help`
with a task to get the help, for example
Expand All @@ -48,8 +50,10 @@ To run the pipeline to identify the genes present in a pair of FASTQ files:



[smalt]: https://www.sanger.ac.uk/resources/software/smalt/
[sspace]: http://www.baseclear.com/genomics/bioinformatics/basetools/SSPACE
[gapfiller]: http://www.baseclear.com/genomics/bioinformatics/basetools/gapfiller
[mummer]: http://mummer.sourceforge.net/
[samtools]: http://www.htslib.org/
[smalt]: https://www.sanger.ac.uk/resources/software/smalt/
[spades]: http://bioinf.spbau.ru/spades
[sspace]: http://www.baseclear.com/genomics/bioinformatics/basetools/SSPACE
[velvet]: http://www.ebi.ac.uk/~zerbino/velvet/
1 change: 1 addition & 0 deletions ariba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
'cluster',
'clusters',
'common',
'external_progs',
'flag',
'histogram',
'link',
Expand Down
37 changes: 20 additions & 17 deletions ariba/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ def __init__(self,
nucmer_min_id=90,
nucmer_min_len=50,
nucmer_breaklen=50,
spades='spades.py',
spades_only_assembler=False,
sspace_k=20,
reads_insert=500,
sspace_sd=0.4,
Expand All @@ -36,8 +34,10 @@ def __init__(self,
bcftools_exe='bcftools',
gapfiller_exe='GapFiller.pl',
samtools_exe='samtools',
spades_exe='spades.py',
sspace_exe='SSPACE_Basic_v2.0.pl',
velvet_exe='velvet', # prefix of velvet{g,h}
spades_other=None,
):

self.root_dir = os.path.abspath(root_dir)
Expand Down Expand Up @@ -71,8 +71,8 @@ def __init__(self,
self._set_assembly_kmer(assembly_kmer)
self.assembler = assembler
assert self.assembler in ['velvet', 'spades']
self.spades = spades
self.spades_only_assembler = spades_only_assembler
self.spades_exe = spades_exe
self.spades_other = spades_other

self.bcftools_exe = bcftools_exe

Expand Down Expand Up @@ -140,6 +140,17 @@ def _set_assembly_kmer(self, k):


def _assemble_with_velvet(self):
# map reads to reference gene to make BAM input to velvet columbus
mapping.run_smalt(
self.reads1,
self.reads2,
self.gene_fa,
self.gene_bam[:-4],
threads=self.threads,
sort=True,
verbose=self.verbose,
)

cmd = ' '.join([
self.velveth,
self.assembler_dir,
Expand Down Expand Up @@ -185,15 +196,16 @@ def _assemble_with_velvet(self):

def _assemble_with_spades(self, unittest=False):
cmd = ' '.join([
self.spades,
self.spades_exe,
'-1', self.reads1,
'-2', self.reads2,
'-o', self.assembler_dir,
'-k', str(self.assembly_kmer),
'--threads', str(self.threads),
'--trusted-contigs', self.gene_fa,
])
if self.spades_only_assembler:
cmd += ' --only-assembler'
if self.spades_other is not None:
cmd += ' ' + self.spades_other

cwd = os.getcwd()
os.chdir(self.assembly_dir)
Expand Down Expand Up @@ -659,16 +671,6 @@ def _make_report_lines(self):


def run(self):
# map reads to reference gene
mapping.run_smalt(
self.reads1,
self.reads2,
self.gene_fa,
self.gene_bam[:-4],
threads=self.threads,
sort=True,
)

if self.assembler == 'velvet':
self._assemble_with_velvet()
elif self.assembler == 'spades':
Expand Down Expand Up @@ -696,6 +698,7 @@ def run(self):
self.final_assembly_bam[:-4],
threads=self.threads,
sort=True,
verbose=self.verbose,
)
self._parse_assembly_bam()

Expand Down
31 changes: 26 additions & 5 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self,
smalt_k=13,
smalt_s=2,
smalt_min_id=0.9,
spades_only_assembler=False,
spades_other=None,
max_insert=1000,
min_scaff_depth=10,
nucmer_min_id=90,
Expand All @@ -33,6 +33,7 @@ def __init__(self,
bcftools_exe='bcftools',
gapfiller_exe='GapFiller.pl',
samtools_exe='samtools',
spades_exe='spades.py',
sspace_exe='SSPACE_Basic_v2.0.pl',
velvet_exe='velvet', # prefix of velvet{g,h}
):
Expand All @@ -44,7 +45,7 @@ def __init__(self,
self.assembler = assembler
assert self.assembler in ['velvet', 'spades']
self.assembly_kmer = assembly_kmer
self.spades_only_assembler = spades_only_assembler
self.spades_other = spades_other

self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
self.bam = self.bam_prefix + '.bam'
Expand Down Expand Up @@ -83,6 +84,7 @@ def __init__(self,
self.gapfiller_exe = os.path.realpath(self.gapfiller_exe) # otherwise gapfiller dies loading packages

self.samtools_exe = samtools_exe
self.spades_exe = spades_exe

self.sspace_exe = shutil.which(sspace_exe)
if self.sspace_exe is None:
Expand All @@ -106,7 +108,8 @@ def _map_reads(self):
index_k=self.smalt_k,
index_s=self.smalt_s,
threads=self.threads,
minid=self.smalt_min_id
minid=self.smalt_min_id,
verbose=self.verbose,
)


Expand Down Expand Up @@ -201,9 +204,11 @@ def _set_insert_size_data(self):
(x, self.insert_size, pc95, self.insert_sspace_sd) = self.insert_hist.stats()
self.insert_proper_pair_max = 1.1 * pc95
if self.verbose:
print('\nInsert size information from reads mapped to reference genes:')
print('Insert size:', self.insert_size, sep='\t')
print('Insert sspace sd:', self.insert_sspace_sd, sep='\t')
print('Max insert:', self.insert_proper_pair_max, sep='\t')
print()


def _write_gene_fa(self, gene_name, outfile):
Expand All @@ -223,9 +228,12 @@ def _init_and_run_clusters(self):
if len(self.cluster_to_dir) == 0:
raise Error('Did not get any reads mapped to genes. Cannot continue')

counter = 0

for gene in sorted(self.cluster_to_dir):
counter += 1
if self.verbose:
print('Running', gene)
print('\nAssembling cluster', counter, 'of', str(len(self.cluster_to_dir)) + ':', gene)
new_dir = self.cluster_to_dir[gene]
self._write_gene_fa(gene, os.path.join(new_dir, 'gene.fa'))
self.clusters[gene] = cluster.Cluster(
Expand All @@ -237,7 +245,6 @@ def _init_and_run_clusters(self):
nucmer_min_id=self.nucmer_min_id,
nucmer_min_len=self.nucmer_min_len,
nucmer_breaklen=self.nucmer_breaklen,
spades_only_assembler=self.spades_only_assembler,
sspace_k=self.min_scaff_depth,
reads_insert=self.insert_size,
sspace_sd=self.insert_sspace_sd,
Expand All @@ -248,8 +255,10 @@ def _init_and_run_clusters(self):
bcftools_exe=self.bcftools_exe,
gapfiller_exe=self.gapfiller_exe,
samtools_exe=self.samtools_exe,
spades_exe=self.spades_exe,
sspace_exe=self.sspace_exe,
velvet_exe=self.velvet,
spades_other=self.spades_other
)

self.clusters[gene].run()
Expand Down Expand Up @@ -293,8 +302,20 @@ def _write_reports(self):


def run(self):
if self.verbose:
print('{:_^79}'.format(' Mapping reads to reference genes '))
self._map_reads()
if self.verbose:
print('Finished mapping\n')
print('{:_^79}'.format(' Generating clusters '))
self._bam_to_clusters_reads()
self._set_insert_size_data()
if self.verbose:
print('{:_^79}'.format(' Assembling each cluster '))
self._init_and_run_clusters()
if self.verbose:
print('Finished assembling clusters\n')
print('{:_^79}'.format(' Writing report files '))
self._write_reports()
if self.verbose:
print('Finished writing report files. All done!')
100 changes: 100 additions & 0 deletions ariba/external_progs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import shutil
import subprocess
import os
from distutils.version import LooseVersion
import re
import sys
import pyfastaq
from ariba import common

class Error (Exception): pass

def is_in_path(prog):
return shutil.which(prog) is not None


prog_to_version_cmd = {
'bcftools': ('bcftools', re.compile('^Version: ([0-9\.]+)')),
'nucmer': ('nucmer --version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
'smalt': ('smalt version', re.compile('^Version: ([0-9\.]+)')),
'spades': ('spades.py', re.compile('^SPAdes genome assembler v.([0-9\.]+)')),
'samtools': ('samtools', re.compile('^Version: ([0-9\.]+)')),
'sspace': (None, re.compile('^Usage: .*pl \[SSPACE_(.*)\]')),
'gapfiller': (None, re.compile('^Usage: .*pl \[GapFiller_(.*)\]')),
'velvetg': ('velvetg', re.compile('Version ([0-9\.]+)')),
'velveth': ('velveth', re.compile('Version ([0-9\.]+)')),
}


def get_version(prog, path=None):
assert prog in prog_to_version_cmd
if path is None:
path = prog
if not is_in_path(path):
raise Error('Error getting version of ' + path + ' - not found in path.')

if prog in ['sspace', 'gapfiller']:
cmd = 'perl ' + os.path.realpath(shutil.which(path))
regex = prog_to_version_cmd[prog][1]
else:
cmd, regex = prog_to_version_cmd[prog]

cmd_output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
cmd_output = common.decode(cmd_output[0]).split('\n')[:-1] + common.decode(cmd_output[1]).split('\n')[:-1]
for line in cmd_output:
hits = regex.search(line)
if hits:
return hits.group(1)
return 'UNKNOWN ...\n I tried running this to get the version: "' + cmd + '"\n and the output didn\'t match this regular expression: "' + regex.pattern + '"'



def check_versions(opts, verbose=False):
if verbose:
print('{:_^79}'.format(' Checking dependencies and their versions '))
print('tool', 'path', 'version', sep='\t')

l = [
('bcftools', opts.bcftools),
('nucmer', None),
('smalt', opts.smalt),
('samtools', opts.samtools),
('sspace', opts.sspace),
('gapfiller', opts.gapfiller),
]

if opts.assembler == 'spades':
l.append(('spades', opts.spades))
elif opts.assembler == 'velvet':
l.append(('velvetg', opts.velvet + 'g'))
l.append(('velveth', opts.velvet + 'h'))
else:
raise Error('Assembler ' + opts.assembler + ' not recognised. Cannot continue')

min_versions = {
'bcftools': '1.2',
'samtools': '1.2',
'nucmer': '3.1',
'spades': '3.5.0',
'smalt': '0.7.4',
'velvetg': '1.2.07',
'velveth': '1.2.07',
}

errors = []

for t in l:
version = get_version(t[0], path=t[1])
if verbose:
print(t[0], t[1], version, sep='\t')

if t[0] in min_versions and LooseVersion(version) < LooseVersion(min_versions[t[0]]):
errors.append(' '.join(['Found version', version, 'of', t[0], 'which is too low! Please update to at least', min_versions[t[0]]]))

if len(errors):
for e in errors:
print(e, file=sys.stderr)
raise Error('Cannot continue. Some dependencies need updating')

if verbose:
print('\n... dependencies look OK.\n')
11 changes: 6 additions & 5 deletions ariba/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def run_smalt(
max_insert=1000,
minid=0.9,
sort=False,
extra_smalt_map_ops='-x'
extra_smalt_map_ops='-x',
verbose=False
):
if extra_smalt_map_ops is None:
extra_smalt_map_ops = ''
Expand Down Expand Up @@ -59,15 +60,15 @@ def run_smalt(
intermediate_bam = final_bam

map_cmd += ' -bS -T ' + ref_fa + ' - > ' + intermediate_bam
common.syscall(index_cmd)
common.syscall(map_cmd)
common.syscall(index_cmd, verbose=verbose)
common.syscall(map_cmd, verbose=verbose)

if sort:
threads = min(4, threads)
thread_mem = int(500 / threads)
sort_cmd = 'samtools sort -@' + str(threads) + ' -m ' + str(thread_mem) + 'M ' + intermediate_bam + ' ' + out_prefix
index_cmd = 'samtools index ' + final_bam
common.syscall(sort_cmd)
common.syscall(index_cmd)
common.syscall(sort_cmd, verbose=verbose)
common.syscall(index_cmd, verbose=verbose)
for fname in clean_files:
os.unlink(fname)
Loading