
New task prepareref #56

Merged · 33 commits · Apr 20, 2016

Commits
c73c975
Initial ref_preparer template code
Apr 19, 2016
eae8785
Tests pass _get_ref_files()
Apr 19, 2016
ee2de9e
Print filename when verbose
Apr 19, 2016
2e52d1c
Add run() method
Apr 19, 2016
6bcaa69
Add initial (partially working) version of prepareref
Apr 19, 2016
fafaa4e
Run cdhit; tidy up option groups
Apr 19, 2016
af7bda1
Do not get cluster ids - they are written to file anyway
Apr 19, 2016
9165536
write info.txt file
Apr 19, 2016
18e0986
Add method _load_reference_data_info_file
Apr 20, 2016
568b71c
Add method _load_reference_data_from_dir
martinghunt Apr 20, 2016
ddb5247
Split bowtie2-build into separate function
martinghunt Apr 20, 2016
5466bf1
Run bowtie2-build, get cd-hit-est from external_progs
martinghunt Apr 20, 2016
21b445a
pickle cluster ids dict
martinghunt Apr 20, 2016
a30e4b0
load pickled cluster ids dict
martinghunt Apr 20, 2016
4f5a717
Move function sam_to_fastq from clusters to mapping module
martinghunt Apr 20, 2016
d07a5c9
Move sam_pair_to_insert from Clusters to mapping
martinghunt Apr 20, 2016
74edff9
Samtools faidx the cluster representatives fasta
martinghunt Apr 20, 2016
36ce4d4
Fix write_info_file test
martinghunt Apr 20, 2016
3643558
Force threads to be 1
martinghunt Apr 20, 2016
86468ae
Clusters takes ref directory instead of object as input
martinghunt Apr 20, 2016
f4b4a6c
Fix instructions to use prepareref
martinghunt Apr 20, 2016
a7136c2
Name output fasta to match expected by prepareref
martinghunt Apr 20, 2016
b32751b
Warn about duplicate sequence names and just keep the longest
martinghunt Apr 20, 2016
1c4a2ac
Prefix sequence names with filename
martinghunt Apr 20, 2016
9cd56af
Bug fix oops emptied cluster_ids dict
martinghunt Apr 20, 2016
52d12b5
Bug fix getting reference files
martinghunt Apr 20, 2016
998fdc7
ref loading and cdhit now moved to prepareref
martinghunt Apr 20, 2016
37ca606
test runs prepareref as well
martinghunt Apr 20, 2016
80a391b
Add samtools_variants
martinghunt Apr 20, 2016
f7a1242
Remove unused imports
martinghunt Apr 20, 2016
0352543
Remove unused imports
martinghunt Apr 20, 2016
964822c
Remove task refcheck
martinghunt Apr 20, 2016
e9c2d48
Remove unused import argparse
martinghunt Apr 20, 2016
2 changes: 2 additions & 0 deletions ariba/__init__.py
@@ -17,9 +17,11 @@
'mapping',
'reference_data',
'ref_genes_getter',
'ref_preparer',
'report',
'report_filter',
'scaffold_graph',
'samtools_variants',
'sequence_metadata',
'sequence_variant',
'summary',
1 change: 1 addition & 0 deletions ariba/assembly.py
@@ -94,6 +94,7 @@ def _assemble_with_spades(self, unittest=False):
'-2', self.reads2,
'-o', self.assembler_dir,
'-k', str(self.assembly_kmer),
'--threads 1', # otherwise defaults to 16!
'--untrusted-contigs', self.ref_fasta,
])
if self.spades_other_options is not None:
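The `--threads 1` addition pins each SPAdes run to a single thread, since clusters are already assembled in parallel processes. A minimal sketch (not part of the PR) of why a flag and its value can share one list element, assuming the option list is later joined with spaces into a single shell command, as the surrounding code suggests; all file names here are illustrative:

```python
# Hedged sketch: if the SPAdes option list is space-joined into one shell
# command string, '--threads 1' works fine as a single element.
cmd = ' '.join([
    'spades.py',
    '-1', 'reads_1.fq',
    '-2', 'reads_2.fq',
    '-o', 'spades_out',
    '-k', '21',
    '--threads 1',  # pin to one thread; SPAdes otherwise defaults to 16
    '--untrusted-contigs', 'ref.fa',
])
print(cmd)
```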
143 changes: 54 additions & 89 deletions ariba/clusters.py
@@ -1,13 +1,14 @@
import os
import copy
import pickle
import itertools
import sys
import shutil
import openpyxl
import multiprocessing
import pysam
import pyfastaq
from ariba import cluster, common, mapping, histogram, report, report_filter
from ariba import cluster, common, mapping, histogram, report, report_filter, reference_data

class Error (Exception): pass

@@ -23,7 +24,7 @@ def _run_cluster(obj, verbose):

class Clusters:
def __init__(self,
refdata,
refdata_dir,
reads_1,
reads_2,
outdir,
@@ -42,12 +43,10 @@ def __init__(self,
assembled_threshold=0.95,
unique_threshold=0.03,
bowtie2_preset='very-sensitive-local',
cdhit_seq_identity_threshold=0.9,
cdhit_length_diff_cutoff=0.9,
run_cd_hit=True,
clean=1,
):
self.refdata = refdata
self.refdata_dir = os.path.abspath(refdata_dir)
self.refdata, self.cluster_ids = self._load_reference_data_from_dir(refdata_dir)
self.reads_1 = os.path.abspath(reads_1)
self.reads_2 = os.path.abspath(reads_2)
self.outdir = os.path.abspath(outdir)
@@ -61,11 +60,9 @@ def __init__(self,
self.assembly_coverage = assembly_coverage
self.spades_other = spades_other

self.refdata_files_prefix = os.path.join(self.outdir, 'refdata')
self.cdhit_files_prefix = os.path.join(self.outdir, 'cdhit')
self.cdhit_files_prefix = os.path.join(self.refdata_dir, 'cdhit')
self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa'
self.cluster_ids = {}
self.bam_prefix = self.cdhit_cluster_representatives_fa + '.map_reads'
self.bam_prefix = os.path.join(self.outdir, 'map_reads_to_cluster_reps')
self.bam = self.bam_prefix + '.bam'
self.report_file_all_tsv = os.path.join(self.outdir, 'report.all.tsv')
self.report_file_all_xls = os.path.join(self.outdir, 'report.all.xls')
@@ -96,28 +93,58 @@ def __init__(self,
self.cluster_read_counts = {} # gene name -> number of reads
self.cluster_base_counts = {} # gene name -> number of bases

self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold
self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff
self.run_cd_hit = run_cd_hit

for d in [self.outdir, self.clusters_outdir]:
try:
os.mkdir(d)
except:
raise Error('Error mkdir ' + d)


def _run_cdhit(self):
self.cluster_ids = self.refdata.cluster_with_cdhit(
self.refdata_files_prefix + '.01.check_variants',
self.cdhit_files_prefix,
seq_identity_threshold=self.cdhit_seq_identity_threshold,
threads=self.threads,
length_diff_cutoff=self.cdhit_length_diff_cutoff,
nocluster=not self.run_cd_hit,
verbose=self.verbose,
@classmethod
def _load_reference_data_info_file(cls, filename):
data = {
'genetic_code': None
}

with open(filename) as f:
for line in f:
key, val = line.rstrip().split('\t')
if key in data:
data[key] = val

if None in data.values():
missing_values = [x for x in data if data[x] is None]
raise Error('Error reading reference info file ' + filename + '. These values not found: ' + ','.join(missing_values))

data['genetic_code'] = int(data['genetic_code'])
return data


@staticmethod
def _load_reference_data_from_dir(indir):
if not os.path.exists(indir):
raise Error('Error loading reference data. Input directory ' + indir + ' not found. Cannot continue')

variants_only_fa = os.path.join(indir, 'refcheck.01.check_variants.variants_only.fa')
presence_absence_fa = os.path.join(indir, 'refcheck.01.check_variants.presence_absence.fa')
non_coding_fa = os.path.join(indir, 'refcheck.01.check_variants.non_coding.fa')
metadata_tsv = os.path.join(indir, 'refcheck.01.check_variants.tsv')
info_file = os.path.join(indir, 'info.txt')
clusters_file = os.path.join(indir, 'cdhit.clusters.pickle')
params = Clusters._load_reference_data_info_file(info_file)
refdata = reference_data.ReferenceData(
presence_absence_fa=presence_absence_fa if os.path.exists(presence_absence_fa) else None,
variants_only_fa=variants_only_fa if os.path.exists(variants_only_fa) else None,
non_coding_fa=non_coding_fa if os.path.exists(non_coding_fa) else None,
metadata_tsv=metadata_tsv if os.path.exists(metadata_tsv) else None,
genetic_code=params['genetic_code'],
)

with open(clusters_file, 'rb') as f:
cluster_ids = pickle.load(f)

return refdata, cluster_ids


def _map_reads_to_clustered_genes(self):
mapping.run_bowtie2(
@@ -134,40 +161,6 @@ def _map_reads_to_clustered_genes(self):
)


def _sam_to_fastq(self, s):
name = s.qname
if s.is_read1:
name += '/1'
elif s.is_read2:
name += '/2'
else:
raise Error('Read ' + name + ' must be first or second of pair according to flag. Cannot continue')

seq = pyfastaq.sequences.Fastq(name, common.decode(s.seq), common.decode(s.qual))
if s.is_reverse:
seq.revcomp()

return seq


def _sam_pair_to_insert(self, s1, s2):
if s1.is_unmapped or s2.is_unmapped or (s1.tid != s2.tid) or (s1.is_reverse == s2.is_reverse):
return None

# If here, reads are both mapped to the same ref, and in opposite orientations
if s1.is_reverse:
end = s1.reference_end - 1
start = s2.reference_start
else:
end = s2.reference_end - 1
start = s1.reference_start

if start < end:
return end - start + 1
else:
return None


def _bam_to_clusters_reads(self):
'''Sets up Cluster directories (one for each gene that has reads that mapped to it), writes reads fwd and rev files. Also gathers histogram data of insert size'''
filehandles_1 = {} # gene name -> filehandle of fwd_reads
@@ -186,12 +179,12 @@ def _bam_to_clusters_reads(self):
if not sam1.is_unmapped:
ref_seqs.add(sam_reader.getrname(sam1.tid))

read1 = self._sam_to_fastq(sam1)
read2 = self._sam_to_fastq(s)
read1 = mapping.sam_to_fastq(sam1)
read2 = mapping.sam_to_fastq(s)
if read1.id.endswith('/2'):
read1, read2 = read2, read1

insert = self._sam_pair_to_insert(s, sam1)
insert = mapping.sam_pair_to_insert(s, sam1)
if insert is not None:
self.insert_hist.add(insert)

@@ -349,31 +342,13 @@ def _clean(self):
print(' ... not deleting anything because --clean 0 used')
return

to_delete= [
self.bam,
self.cdhit_cluster_representatives_fa,
self.cdhit_cluster_representatives_fa + '.fai',
self.cdhit_files_prefix + '.non_coding.cdhit',
self.cdhit_files_prefix + '.presence_absence.cdhit',
self.cdhit_files_prefix + '.variants_only.cdhit',
]
to_delete= [self.bam]

if self.clean == 2:
if self.verbose:
print(' rm -r', self.clusters_outdir)
shutil.rmtree(self.clusters_outdir)

to_delete.extend([
self.cdhit_files_prefix + '.clusters.tsv',
self.refdata_files_prefix + '.00.check_fasta_presence_absence.log',
self.refdata_files_prefix + '.00.check_fasta_variants_only.log',
self.refdata_files_prefix + '.01.check_variants.log',
self.refdata_files_prefix + '.01.check_variants.non_coding.fa',
self.refdata_files_prefix + '.01.check_variants.presence_absence.fa',
self.refdata_files_prefix + '.01.check_variants.tsv',
self.refdata_files_prefix + '.01.check_variants.variants_only.fa',
])

for filename in to_delete:
if os.path.exists(filename):
if self.verbose:
@@ -400,16 +375,6 @@ def run(self):
self.write_versions_file(cwd)

if self.verbose:
print('{:_^79}'.format(' Checking reference data '), flush=True)
self.refdata.sanity_check(self.refdata_files_prefix)

if self.verbose:
print()
print('{:_^79}'.format(' Running cd-hit '), flush=True)
self._run_cdhit()

if self.verbose:
print('Finished cd-hit\n')
print('{:_^79}'.format(' Mapping reads to clustered genes '), flush=True)
self._map_reads_to_clustered_genes()

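For orientation, a minimal sketch of the prepareref output directory that the new `Clusters._load_reference_data_from_dir()` reads. The file names come from the diff above; the genetic code value and the shape of the pickled cluster dict are illustrative assumptions:

```python
import os
import pickle

# Hypothetical prepareref output directory; the file names below are the
# ones _load_reference_data_from_dir() looks for.
refdata_dir = 'prepareref_out'
os.makedirs(refdata_dir, exist_ok=True)

# info.txt is parsed as tab-separated key/value pairs; genetic_code is
# required and cast to int. The value 11 is an assumed example.
with open(os.path.join(refdata_dir, 'info.txt'), 'w') as f:
    print('genetic_code', 11, sep='\t', file=f)

# cdhit.clusters.pickle holds the pickled cluster ids dict; mapping each
# cluster representative to its member sequence names is an assumed shape.
with open(os.path.join(refdata_dir, 'cdhit.clusters.pickle'), 'wb') as f:
    pickle.dump({'cluster.1': {'seq1', 'seq2'}}, f)

# Plus whichever of these prepareref wrote:
#   refcheck.01.check_variants.presence_absence.fa
#   refcheck.01.check_variants.variants_only.fa
#   refcheck.01.check_variants.non_coding.fa
#   refcheck.01.check_variants.tsv
```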
76 changes: 68 additions & 8 deletions ariba/mapping.py
@@ -1,11 +1,33 @@
import os
import sys
import pysam
import pyfastaq
from ariba import common

class Error (Exception): pass


def bowtie2_index(ref_fa, outprefix, bowtie2='bowtie2', verbose=False, verbose_filehandle=sys.stdout):
expected_files = [outprefix + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
file_missing = False
for filename in expected_files:
if not os.path.exists(filename):
file_missing = True
break

if not file_missing:
return

cmd = ' '.join([
bowtie2 + '-build',
'-q',
ref_fa,
outprefix
])

common.syscall(cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)


def run_bowtie2(
reads_fwd,
reads_rev,
@@ -20,16 +42,15 @@ def run_bowtie2(
verbose=False,
verbose_filehandle=sys.stdout,
remove_both_unmapped=False,
clean_index=True,
):

map_index = out_prefix + '.map_index'
clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
index_cmd = ' '.join([
bowtie2 + '-build',
'-q',
ref_fa,
map_index
])

if clean_index:
clean_files = [map_index + '.' + x + '.bt2' for x in ['1', '2', '3', '4', 'rev.1', 'rev.2']]
else:
clean_files = []

final_bam = out_prefix + '.bam'
if sort:
@@ -60,7 +81,7 @@

map_cmd = ' '.join(map_cmd)

common.syscall(index_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
bowtie2_index(ref_fa, map_index, bowtie2=bowtie2, verbose=verbose, verbose_filehandle=verbose_filehandle)
common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)

if sort:
@@ -96,3 +117,42 @@ def get_total_alignment_score(bam):
pass
return total


def sam_to_fastq(sam):
'''Given a pysam alignment, returns the sequence as a Fastq object.
Reverse complements as required and adds the suffix /1 or /2 as appropriate from the flag'''
name = sam.qname
if sam.is_read1:
name += '/1'
elif sam.is_read2:
name += '/2'
else:
raise Error('Read ' + name + ' must be first or second of pair according to flag. Cannot continue')

seq = pyfastaq.sequences.Fastq(name, common.decode(sam.seq), common.decode(sam.qual))
if sam.is_reverse:
seq.revcomp()

return seq


def sam_pair_to_insert(s1, s2):
'''Returns insert size from pair of sam records, as long as their orientation is "innies".
Otherwise returns None.'''
if s1.is_unmapped or s2.is_unmapped or (s1.tid != s2.tid) or (s1.is_reverse == s2.is_reverse):
return None

# If here, reads are both mapped to the same ref, and in opposite orientations
if s1.is_reverse:
end = s1.reference_end - 1
start = s2.reference_start
else:
end = s2.reference_end - 1
start = s1.reference_start

if start < end:
return end - start + 1
else:
return None


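A quick duck-typed check (not part of the PR) of the moved `sam_pair_to_insert()`; the `SimpleNamespace` objects stand in for pysam alignments and expose only the attributes the function reads:

```python
from types import SimpleNamespace
from ariba import mapping

# Proper "innies" pair: forward read maps first, reverse read maps
# downstream on the same reference (tid 0).
fwd = SimpleNamespace(is_unmapped=False, tid=0, is_reverse=False,
                      reference_start=100, reference_end=175)
rev = SimpleNamespace(is_unmapped=False, tid=0, is_reverse=True,
                      reference_start=280, reference_end=350)

# reference_end is half-open, so the rightmost base is 349 and the insert
# spans 349 - 100 + 1 = 250 bases.
print(mapping.sam_pair_to_insert(fwd, rev))  # 250
print(mapping.sam_pair_to_insert(fwd, fwd))  # None: same orientation
```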