Skip to content

Commit

Permalink
Merge pull request #102 from martinghunt/minimap_remove_noise
Browse files Browse the repository at this point in the history
Minimap remove noise
  • Loading branch information
martinghunt authored Jul 19, 2016
2 parents bbf7805 + a94ac4e commit 29242b7
Show file tree
Hide file tree
Showing 41 changed files with 1,493 additions and 137 deletions.
2 changes: 2 additions & 0 deletions ariba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
'histogram',
'link',
'mapping',
'mash',
'read_filter',
'read_store',
'refdata_query',
'reference_data',
Expand Down
19 changes: 15 additions & 4 deletions ariba/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import shutil
import pyfastaq
import pymummer
from ariba import common, mapping, bam_parse, external_progs
from ariba import common, faidx, mapping, bam_parse, external_progs, mash

class Error (Exception): pass

Expand All @@ -12,14 +12,14 @@ def __init__(self,
reads1,
reads2,
ref_fasta,
ref_fastas,
working_dir,
final_assembly_fa,
final_assembly_bam,
log_fh,
scaff_name_prefix='scaffold',
kmer=0,
assembler='spades',
bowtie2_preset='very-sensitive-local',
max_insert=1000,
min_scaff_depth=10,
min_scaff_length=50,
Expand All @@ -36,15 +36,16 @@ def __init__(self,
self.reads1 = os.path.abspath(reads1)
self.reads2 = os.path.abspath(reads2)
self.ref_fasta = os.path.abspath(ref_fasta)
self.ref_fastas = os.path.abspath(ref_fastas)
self.working_dir = os.path.abspath(working_dir)
self.final_assembly_fa = os.path.abspath(final_assembly_fa)
self.final_assembly_bam = os.path.abspath(final_assembly_bam)
self.log_fh = log_fh
self.scaff_name_prefix = scaff_name_prefix

self.ref_seq_name = None
self.assembly_kmer = self._get_assembly_kmer(kmer, reads1, reads2)
self.assembler = assembler
self.bowtie2_preset = bowtie2_preset
self.max_insert = max_insert
self.min_scaff_depth = min_scaff_depth
self.min_scaff_length = min_scaff_length
Expand Down Expand Up @@ -74,6 +75,7 @@ def __init__(self,
self.gapfill_dir = os.path.join(self.working_dir, 'Gapfill')
self.gapfilled_scaffolds = os.path.join(self.working_dir, 'scaffolds.gapfilled.fa')
self.gapfilled_length_filtered = os.path.join(self.working_dir, 'scaffolds.gapfilled.length_filtered.fa')
self.mash_dist_file = os.path.join(self.working_dir, 'mash_dist_to_ref_seqs')


@staticmethod
Expand Down Expand Up @@ -344,6 +346,16 @@ def run(self):
self.log_fh = None
return

masher = mash.Masher(self.ref_fastas, self.gapfilled_length_filtered, self.log_fh, self.extern_progs)
self.ref_seq_name = masher.run(self.mash_dist_file)
if self.ref_seq_name is None:
print('Could not determine closest reference sequence', file=self.log_fh)
self.log_fh = None
return

faidx.write_fa_subset({self.ref_seq_name}, self.ref_fastas, self.ref_fasta, samtools_exe=self.extern_progs.exe('samtools'), verbose=True, verbose_filehandle=self.log_fh)
print('Closest reference sequence according to mash: ', self.ref_seq_name, file=self.log_fh)

contigs_both_strands = self._fix_contig_orientation(self.gapfilled_length_filtered, self.ref_fasta, self.final_assembly_fa, min_id=self.nucmer_min_id, min_length=self.nucmer_min_len, breaklen=self.nucmer_breaklen)
self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)
Expand All @@ -357,7 +369,6 @@ def run(self):
sort=True,
samtools=self.extern_progs.exe('samtools'),
bowtie2=self.extern_progs.exe('bowtie2'),
bowtie2_preset=self.bowtie2_preset,
verbose=True,
verbose_filehandle=self.log_fh
)
Expand Down
123 changes: 56 additions & 67 deletions ariba/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import shutil
import sys
import pyfastaq
from ariba import assembly, assembly_compare, assembly_variants, bam_parse, best_seq_chooser, external_progs, flag, mapping, report, samtools_variants
from ariba import assembly, assembly_compare, assembly_variants, bam_parse, external_progs, flag, mapping, mash, read_filter, report, samtools_variants

class Error (Exception): pass

Expand All @@ -17,8 +17,8 @@ def __init__(self,
root_dir,
name,
refdata,
total_reads,
total_reads_bases,
total_reads=None,
total_reads_bases=None,
fail_file=None,
read_store=None,
reference_names=None,
Expand All @@ -42,7 +42,6 @@ def __init__(self,
assembled_threshold=0.95,
unique_threshold=0.03,
max_gene_nt_extend=30,
bowtie2_preset='very-sensitive-local',
spades_other_options=None,
clean=True,
extern_progs=None,
Expand Down Expand Up @@ -90,8 +89,6 @@ def __init__(self,
self.bcf_min_dv_over_dp = bcf_min_dv_over_dp
self.bcf_min_qual = bcf_min_qual

self.bowtie2_preset = bowtie2_preset

self.threads = threads
self.assembled_threshold = assembled_threshold
self.unique_threshold = unique_threshold
Expand Down Expand Up @@ -155,25 +152,36 @@ def _receive_signal(self, signum, stack):

def _input_files_exist(self):
assert self.read_store is None
if not (os.path.exists(self.all_reads1) and os.path.exists(self.all_reads2)):
raise Error('Error making cluster. Reads files not found')
if not os.path.exists(self.references_fa):
raise Error('Error making cluster. References fasta not found')


def _set_up_input_files(self):
if self.logfile is None:
self.logfile = os.path.join(self.root_dir, 'log.txt')

if os.path.exists(self.root_dir):
self._input_files_exist()
seqreader = pyfastaq.sequences.file_reader(self.references_fa)
self.reference_names = set([x.id for x in seqreader])
self.log_fh = pyfastaq.utils.open_file_write(self.logfile)
else:
assert self.read_store is not None
assert self.reference_names is not None
try:
os.mkdir(self.root_dir)
except:
raise Error('Error making directory ' + self.root_dir)

self.refdata.write_seqs_to_fasta(self.references_fa, self.reference_names)
self.log_fh = pyfastaq.utils.open_file_write(self.logfile)
self.read_store.get_reads(self.name, self.all_reads1, self.all_reads2)
rfilter = read_filter.ReadFilter(self.read_store, self.references_fa, self.name, self.log_fh, self.extern_progs)
self.total_reads, self.total_reads_bases = rfilter.run(self.all_reads1, self.all_reads2)
self.refdata.write_seqs_to_fasta(self.references_fa, self.reference_names)

self.longest_ref_length = max([len(self.refdata.sequence(name)) for name in self.reference_names])


def _clean_file(self, filename):
if self.clean:
Expand Down Expand Up @@ -210,9 +218,7 @@ def _clean(self):


@staticmethod
def _number_of_reads_for_assembly(reference_fa, insert_size, total_bases, total_reads, coverage):
file_reader = pyfastaq.sequences.file_reader(reference_fa)
ref_length = sum([len(x) for x in file_reader])
def _number_of_reads_for_assembly(ref_length, insert_size, total_bases, total_reads, coverage):
assert ref_length > 0
ref_length += 2 * insert_size
mean_read_length = total_bases / total_reads
Expand Down Expand Up @@ -266,11 +272,6 @@ def run(self):
if not os.path.exists(fname):
raise Error('File ' + fname + ' not found. Cannot continue')

if self.logfile is None:
self.logfile = os.path.join(self.root_dir, 'log.txt')

self.log_fh = pyfastaq.utils.open_file_write(self.logfile)

original_dir = os.getcwd()
os.chdir(self.root_dir)

Expand All @@ -295,61 +296,46 @@ def run(self):

def _run(self):
print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True)

print('Choosing best reference sequence:', file=self.log_fh, flush=True)
seq_chooser = best_seq_chooser.BestSeqChooser(
self.all_reads1,
self.all_reads2,
self.references_fa,
self.log_fh,
samtools_exe=self.extern_progs.exe('samtools'),
bowtie2_exe=self.extern_progs.exe('bowtie2'),
bowtie2_preset=self.bowtie2_preset,
threads=1,
wanted_reads = self._number_of_reads_for_assembly(self.longest_ref_length, self.reads_insert, self.total_reads_bases, self.total_reads, self.assembly_coverage)
made_reads = self._make_reads_for_assembly(wanted_reads, self.total_reads, self.all_reads1, self.all_reads2, self.reads_for_assembly1, self.reads_for_assembly2, random_seed=self.random_seed)
print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
print('Assembling reads:', file=self.log_fh, flush=True)

self.assembly = assembly.Assembly(
self.reads_for_assembly1,
self.reads_for_assembly2,
self.reference_fa,
self.references_fa,
self.assembly_dir,
self.final_assembly_fa,
self.final_assembly_bam,
self.log_fh,
scaff_name_prefix=self.name,
kmer=self.assembly_kmer,
assembler=self.assembler,
spades_other_options=self.spades_other_options,
sspace_k=self.sspace_k,
sspace_sd=self.sspace_sd,
reads_insert=self.reads_insert,
extern_progs=self.extern_progs,
clean=self.clean
)
self.ref_sequence = seq_chooser.best_seq(self.reference_fa)
self._clean_file(self.references_fa)
self._clean_file(self.references_fa + '.fai')

if self.ref_sequence is None:
self.status_flag.add('ref_seq_choose_fail')
self.assembled_ok = False
else:
wanted_reads = self._number_of_reads_for_assembly(self.reference_fa, self.reads_insert, self.total_reads_bases, self.total_reads, self.assembly_coverage)
made_reads = self._make_reads_for_assembly(wanted_reads, self.total_reads, self.all_reads1, self.all_reads2, self.reads_for_assembly1, self.reads_for_assembly2, random_seed=self.random_seed)
print('\nUsing', made_reads, 'from a total of', self.total_reads, 'for assembly.', file=self.log_fh, flush=True)
print('Assembling reads:', file=self.log_fh, flush=True)
self.assembly.run()
self.assembled_ok = self.assembly.assembled_ok
self._clean_file(self.reads_for_assembly1)
self._clean_file(self.reads_for_assembly2)
if self.clean:
print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
shutil.rmtree(self.assembly_dir)


if self.assembled_ok and self.assembly.ref_seq_name is not None:
self.ref_sequence = self.refdata.sequence(self.assembly.ref_seq_name)
is_gene, is_variant_only = self.refdata.sequence_type(self.ref_sequence.id)
self.is_gene = '1' if is_gene == 'p' else '0'
self.is_variant_only = '1' if is_variant_only else '0'
self.assembly = assembly.Assembly(
self.reads_for_assembly1,
self.reads_for_assembly2,
self.reference_fa,
self.assembly_dir,
self.final_assembly_fa,
self.final_assembly_bam,
self.log_fh,
scaff_name_prefix=self.ref_sequence.id,
kmer=self.assembly_kmer,
assembler=self.assembler,
spades_other_options=self.spades_other_options,
sspace_k=self.sspace_k,
sspace_sd=self.sspace_sd,
reads_insert=self.reads_insert,
extern_progs=self.extern_progs,
clean=self.clean
)

self.assembly.run()
self.assembled_ok = self.assembly.assembled_ok
self._clean_file(self.reads_for_assembly1)
self._clean_file(self.reads_for_assembly2)
if self.clean:
print('Deleting Assembly directory', self.assembly_dir, file=self.log_fh, flush=True)
shutil.rmtree(self.assembly_dir)

if self.assembled_ok:
print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh, flush=True)

mapping.run_bowtie2(
Expand All @@ -361,7 +347,7 @@ def _run(self):
sort=True,
samtools=self.extern_progs.exe('samtools'),
bowtie2=self.extern_progs.exe('bowtie2'),
bowtie2_preset=self.bowtie2_preset,
bowtie2_preset='very-sensitive-local',
verbose=True,
verbose_filehandle=self.log_fh
)
Expand Down Expand Up @@ -426,9 +412,12 @@ def _run(self):

if self.samtools_vars.variants_in_coords(self.assembly_compare.assembly_match_coords(), self.samtools_vars.vcf_file):
self.status_flag.add('variants_suggest_collapsed_repeat')
else:
elif not self.assembled_ok:
print('\nAssembly failed\n', file=self.log_fh, flush=True)
self.status_flag.add('assembly_fail')
elif self.assembly.ref_seq_name is None:
print('\nCould not get closest reference sequence\n', file=self.log_fh, flush=True)
self.status_flag.add('ref_seq_choose_fail')


print('\nMaking report lines', file=self.log_fh, flush=True)
Expand Down
8 changes: 1 addition & 7 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ def __init__(self,
assembled_threshold=0.95,
unique_threshold=0.03,
max_gene_nt_extend=30,
bowtie2_preset='very-sensitive-local',
clean=True,
tmp_dir=None,
):
Expand Down Expand Up @@ -108,7 +107,6 @@ def __init__(self,
self.verbose = verbose

self.max_insert = max_insert
self.bowtie2_preset = bowtie2_preset

self.insert_hist_bin = 10
self.insert_hist = histogram.Histogram(self.insert_hist_bin)
Expand All @@ -132,7 +130,6 @@ def __init__(self,
self.pool = None
self.fails_dir = os.path.join(self.outdir ,'.fails')
self.clusters_all_ran_ok = True
self.inital_mapping_tool = 'bowtie2'

for d in [self.outdir, self.logs_dir, self.fails_dir]:
try:
Expand Down Expand Up @@ -275,7 +272,7 @@ def _map_and_cluster_reads(self):
os.unlink(reads_file_for_read_store)

if self.verbose:
print('Found', self.proper_pairs, 'proper read pairs')
print('Found', self.proper_pairs, 'proper read pairs from minimap')
print('Total clusters to perform local assemblies:', len(self.cluster_to_dir), flush=True)


Expand Down Expand Up @@ -381,8 +378,6 @@ def _init_and_run_clusters(self):
new_dir,
cluster_name,
self.refdata,
self.cluster_read_counts[cluster_name],
self.cluster_base_counts[cluster_name],
fail_file=os.path.join(self.fails_dir, cluster_name),
read_store=self.read_store,
reference_names=self.cluster_ids[cluster_name],
Expand All @@ -406,7 +401,6 @@ def _init_and_run_clusters(self):
assembled_threshold=self.assembled_threshold,
unique_threshold=self.unique_threshold,
max_gene_nt_extend=self.max_gene_nt_extend,
bowtie2_preset=self.bowtie2_preset,
spades_other_options=self.spades_other,
clean=self.clean,
extern_progs=self.extern_progs,
Expand Down
8 changes: 7 additions & 1 deletion ariba/external_progs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ class Error (Exception): pass
'bcftools': 'bcftools',
'bowtie2': 'bowtie2',
'cdhit': 'cd-hit-est',
'cdhit2d': 'cd-hit-est-2d',
'gapfiller': 'GapFiller.pl',
'mash': 'mash',
'nucmer' : 'nucmer',
'samtools': 'samtools',
'spades': 'spades.py',
Expand All @@ -29,7 +31,9 @@ class Error (Exception): pass
'bcftools': ('', re.compile('^Version: ([0-9\.]+)')),
'bowtie2': ('--version', re.compile('.*bowtie2.*version (.*)$')),
'cdhit': ('', re.compile('CD-HIT version ([0-9\.]+) \(')),
'cdhit2d': ('', re.compile('CD-HIT version ([0-9\.]+) \(')),
'gapfiller': ('', re.compile('^Usage: .*pl \[GapFiller_(.*)\]')),
'mash': ('', re.compile('^Mash version (.*)$')),
'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
'samtools': ('', re.compile('^Version: ([0-9\.]+)')),
'spades': ('', re.compile('^SPAdes genome assembler v\.?([0-9\.]+)')),
Expand All @@ -41,7 +45,9 @@ class Error (Exception): pass
min_versions = {
'bcftools': '1.2',
'bowtie2': '2.1.0',
'cd-hit': '4.6',
'cdhit': '4.6',
'cdhit2d': '4.6',
'mash': '1.1',
'nucmer': '3.1',
'samtools': '1.2',
'spades': '3.5.0',
Expand Down
Loading

0 comments on commit 29242b7

Please sign in to comment.