Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various improvements #34

Merged
merged 7 commits into from
Sep 10, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions ariba/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def __init__(self,
self.final_assembly_bam = os.path.join(self.root_dir, 'assembly.reads_mapped.bam')
self.final_assembly_read_depths = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
self.final_assembly_vcf = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.vcf')
self.final_assembled_genes_fa = os.path.join(self.root_dir, 'assembly.genes.fa')
self.final_assembly = {}
self.mummer_variants = {}
self.variant_depths = {}
Expand Down Expand Up @@ -525,6 +526,38 @@ def _nucmer_hits_to_gene_cov_per_contig(self):
return cov


@staticmethod
def _nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, outfile):
f = pyfastaq.utils.open_file_write(outfile)

for contig in sorted(nucmer_hits):
for hit in nucmer_hits[contig]:
qry_coords = hit.qry_coords()
fa = assembly[hit.qry_name].subseq(qry_coords.start, qry_coords.end + 1)
if hit.on_same_strand():
strand = '+'
else:
fa.revcomp()
strand = '-'
ref_coords = hit.ref_coords()
fa.id = '.'.join([
ref_gene.id,
str(ref_coords.start + 1),
str(ref_coords.end + 1),
contig,
str(qry_coords.start + 1),
str(qry_coords.end + 1),
strand
])

if hit.hit_length_ref == hit.ref_length:
fa.id += '.complete'

print(fa, file=f)

pyfastaq.utils.close(f)


def _whole_gene_covered_by_nucmer_hits(self):
covered = self._nucmer_hits_to_ref_coords()
pyfastaq.intervals.merge_overlapping_in_list(covered)
Expand Down Expand Up @@ -1046,6 +1079,7 @@ def run(self):
self._update_flag_from_nucmer_file()
self._make_assembly_vcf()
self._get_vcf_variant_counts()
self._nucmer_hits_to_assembled_gene_sequences(self.nucmer_hits, self.gene, self.final_assembly, self.final_assembled_genes_fa)

self._make_report_lines()
self._clean()
43 changes: 35 additions & 8 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def __init__(self,
run_cd_hit=True,
clean=1,
):
self.db_fasta = os.path.abspath(db_fasta)
self.reads_1 = os.path.abspath(reads_1)
self.reads_2 = os.path.abspath(reads_2)
self.outdir = os.path.abspath(outdir)
Expand All @@ -57,12 +56,13 @@ def __init__(self,
self.assembly_kmer = assembly_kmer
self.spades_other = spades_other

self.db_fasta_clustered = os.path.join(self.outdir, 'genes.clustered.fa')
self.db_fasta_clustered = os.path.join(self.outdir, 'input_genes.clustered.fa')
self.cluster_ids = {}
self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
self.bam = self.bam_prefix + '.bam'
self.report_file_tsv = os.path.join(self.outdir, 'report.tsv')
self.report_file_xls = os.path.join(self.outdir, 'report.xls')
self.catted_assembled_genes_fasta = os.path.join(self.outdir, 'assembled_genes.fa')
self.threads = threads
self.verbose = verbose

Expand Down Expand Up @@ -120,6 +120,10 @@ def __init__(self,
except:
raise Error('Error mkdir ' + d)

self.db_fasta = os.path.join(self.outdir, 'input_genes.not_clustered.fa')
pyfastaq.tasks.to_fasta(db_fasta, self.db_fasta, check_unique=True)


def _run_cdhit(self):
r = cdhit.Runner(
self.db_fasta,
Expand Down Expand Up @@ -357,12 +361,27 @@ def _write_reports(self):
workbook.save(self.report_file_xls)


def _write_catted_assembled_genes_fasta(self):
f = pyfastaq.utils.open_file_write(self.catted_assembled_genes_fasta)

for gene in sorted(self.clusters):
cluster_fasta = self.clusters[gene].final_assembled_genes_fa
if os.path.exists(cluster_fasta):
file_reader = pyfastaq.sequences.file_reader(cluster_fasta)
for seq in file_reader:
print(seq, file=f)

pyfastaq.utils.close(f)


def _clean(self):
to_clean = [
[
],
[
self.bam
self.bam,
self.db_fasta,
self.db_fasta + '.fai',
],
[
self.db_fasta_clustered,
Expand Down Expand Up @@ -400,14 +419,22 @@ def run(self):
print('Finished mapping\n')
print('{:_^79}'.format(' Generating clusters '), flush=True)
self._bam_to_clusters_reads()
self._set_insert_size_data()
if self.verbose:
print('{:_^79}'.format(' Assembling each cluster '), flush=True)
self._init_and_run_clusters()
if len(self.cluster_to_dir) > 0:
self._set_insert_size_data()
if self.verbose:
print('{:_^79}'.format(' Assembling each cluster '), flush=True)
self._init_and_run_clusters()
if self.verbose:
print('Finished assembling clusters\n')
else:
if self.verbose:
print('No reads mapped. Skipping all assemblies', flush=True)
print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr)

if self.verbose:
print('Finished assembling clusters\n')
print('{:_^79}'.format(' Writing report files '), flush=True)
self._write_reports()
self._write_catted_assembled_genes_fasta()
if self.verbose:
print('Finished writing report files. Cleaning files', flush=True)
self._clean()
Expand Down
2 changes: 1 addition & 1 deletion ariba/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys
import subprocess

version = '0.4.1'
version = '0.5.0'

def syscall(cmd, allow_fail=False, verbose=False):
if verbose:
Expand Down
100 changes: 54 additions & 46 deletions ariba/refcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,78 +5,86 @@ class Error (Exception): pass


class Checker:
def __init__(self, infile, min_length=1, max_length=10000):
def __init__(self, infile, min_length=1, max_length=10000, outprefix=None):
self.infile = os.path.abspath(infile)
if not os.path.exists(self.infile):
raise Error('File not found: "' + self.infile + '". Cannot continue')

self.min_length = min_length
self.max_length = max_length
self.outprefix = outprefix


def check(self, error_code_on_exit=None):
def run(self):
file_reader = pyfastaq.sequences.file_reader(self.infile)

for seq in file_reader:
if not seq.looks_like_gene():
return False, 'Not a gene', seq
elif len(seq) < self.min_length:
return False, 'Too short', seq
elif len(seq) > self.max_length:
return False, 'Too long', seq

return True, None, None


def fix(self, outprefix):
file_reader = pyfastaq.sequences.file_reader(self.infile)
old2new_out = outprefix + '.rename'
fasta_out = outprefix + '.fa'
bad_seqs_out = outprefix + '.removed.fa'
log_out = outprefix + '.log'
names = {}
old2new_out_fh = pyfastaq.utils.open_file_write(old2new_out)
fasta_out_fh = pyfastaq.utils.open_file_write(fasta_out)
bad_seqs_out_fh = pyfastaq.utils.open_file_write(bad_seqs_out)
log_out_fh = pyfastaq.utils.open_file_write(log_out)

if self.outprefix is not None:
old2new_out = self.outprefix + '.rename'
fasta_out = self.outprefix + '.fa'
bad_seqs_out = self.outprefix + '.removed.fa'
log_out = self.outprefix + '.log'
old2new_out_fh = pyfastaq.utils.open_file_write(old2new_out)
fasta_out_fh = pyfastaq.utils.open_file_write(fasta_out)
bad_seqs_out_fh = pyfastaq.utils.open_file_write(bad_seqs_out)
log_out_fh = pyfastaq.utils.open_file_write(log_out)

for seq in file_reader:
seq.seq = seq.seq.upper()
if len(seq) < self.min_length:
print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh)
print(seq, file=bad_seqs_out_fh)
continue
if self.outprefix is None:
return False, 'Too short', seq
else:
print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh)
print(seq, file=bad_seqs_out_fh)
continue
elif len(seq) > self.max_length:
print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh)
print(seq, file=bad_seqs_out_fh)
continue

if self.outprefix is None:
return False, 'Too long', seq
else:
print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh)
print(seq, file=bad_seqs_out_fh)
continue

if not seq.looks_like_gene():
seq.revcomp()
if seq.looks_like_gene():
print(seq.id, 'Reverse complemented', sep='\t', file=log_out_fh)
if self.outprefix is None:
return False, 'Not a gene', seq
else:
print(seq.id, 'Does not look like a gene. Skipping', sep='\t', file=log_out_fh)
seq.revcomp()
print(seq, file=bad_seqs_out_fh)
continue

if seq.looks_like_gene():
print(seq.id, 'Reverse complemented', sep='\t', file=log_out_fh)
else:
print(seq.id, 'Does not look like a gene. Skipping', sep='\t', file=log_out_fh)
seq.revcomp()
print(seq, file=bad_seqs_out_fh)
continue

original_id = seq.id
# replace unwanted characters with underscores
to_replace = ' '
seq.id = seq.id.translate(str.maketrans(to_replace, '_' * len(to_replace)))

if self.outprefix is None and original_id != seq.id:
seq.id = original_id
return False, 'Name has spaces', seq

if seq.id in names:
names[seq.id] += 1
seq.id += '.' + str(names[seq.id])
if self.outprefix is None:
return False, 'Duplicate name', seq
else:
names[seq.id] += 1
seq.id += '.' + str(names[seq.id])
else:
names[seq.id] = 1

print(original_id, seq.id, sep='\t', file=old2new_out_fh)
print(seq, file=fasta_out_fh)
if self.outprefix is not None:
print(original_id, seq.id, sep='\t', file=old2new_out_fh)
print(seq, file=fasta_out_fh)

pyfastaq.utils.close(fasta_out_fh)
pyfastaq.utils.close(bad_seqs_out_fh)
pyfastaq.utils.close(log_out_fh)
pyfastaq.utils.close(old2new_out_fh)
if self.outprefix is not None:
pyfastaq.utils.close(fasta_out_fh)
pyfastaq.utils.close(bad_seqs_out_fh)
pyfastaq.utils.close(log_out_fh)
pyfastaq.utils.close(old2new_out_fh)

return True, None, None
20 changes: 10 additions & 10 deletions ariba/tasks/refcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ def run():
checker = ariba.refcheck.Checker(
options.infile,
min_length=options.min_length,
max_length=options.max_length
max_length=options.max_length,
outprefix=options.outprefix
)

if options.outprefix:
checker.fix(options.outprefix)
else:
ok, reason, seq = checker.check()
if not ok:
print('The following sequence not OK, for the reason:', reason)
print(seq)
sys.exit(1)

ok, reason, seq = checker.run()

if options.outprefix is None and not ok:
print('The following sequence not OK, for the reason:', reason)
print(seq)
sys.exit(1)

43 changes: 36 additions & 7 deletions ariba/tests/cluster_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def clean_cluster_dir(d, exclude=None):
def file2lines(filename):
f = pyfastaq.utils.open_file_read(filename)
lines = f.readlines()
pyfastaq.utils.close(f)
pyfastaq.utils.close(f)
return lines


Expand Down Expand Up @@ -303,8 +303,8 @@ def test_nucmer_hits_to_percent_identity(self):
]
}
expected = {'scaff1': round((90*10 + 100*34) / (10+34), 2), 'scaff2': 42.42}
c._nucmer_hits_to_percent_identity()
self.assertEqual(expected, c.percent_identities)
c._nucmer_hits_to_percent_identity()
self.assertEqual(expected, c.percent_identities)


def test_nucmer_hits_to_scaff_coords(self):
Expand Down Expand Up @@ -396,12 +396,41 @@ def test_nucmer_hits_to_gene_cov_per_contig(self):
pymummer.alignment.Alignment('\t'.join(hits[2])),
]
}

expected = {'contig1': 85, 'contig2': 11}
self.assertEqual(expected, c._nucmer_hits_to_gene_cov_per_contig())
clean_cluster_dir(cluster_dir)


def test_nucmer_hits_to_assembled_gene_sequences(self):
'''test _nucmer_hits_to_assembled_gene_sequences'''
ref_gene = pyfastaq.sequences.Fasta('ref_gene', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG')
assembly = {
'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'),
'contig2': pyfastaq.sequences.Fasta('contig2', 'AGTGATATCCTGCGATCTATAATTTTTTTCGCGGGATCTTGAACGCGACGATGTTCGATAATTCAATGCAAAGGAGCGACCCGCAAGTACACAGGACTGCAAA')
}

hits = [
['1', '147', '61', '207', '147', '147', '100.00', '147', '239', '1', '1', 'ref_gene', 'contig1'],
['18', '120', '103', '1', '103', '103', '100.00', '147', '103', '1', '-1', 'ref_gene', 'contig2']
]
nucmer_hits = {
'contig1': [
pymummer.alignment.Alignment('\t'.join(hits[0])),
],
'contig2': [
pymummer.alignment.Alignment('\t'.join(hits[1])),
]
}

assembly_fasta = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.assembly.fa')
tmp_outfile = 'tmp.test_nucmer_hits_to_assembled_gene_sequences.out.fa'
expected_outfile = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa')
cluster.Cluster._nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, tmp_outfile)
self.assertTrue(filecmp.cmp(tmp_outfile, expected_outfile, shallow=False))
os.unlink(tmp_outfile)


def test_whole_gene_covered_by_nucmer_hits(self):
'''test _whole_gene_covered_by_nucmer_hits'''
cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
Expand Down Expand Up @@ -637,7 +666,7 @@ def test_get_assembly_read_depths(self):
( ('ref1', 3), ('C', 'A,G', 42, '21,11,10') ),
( ('ref1', 4), ('C', 'AC', 41, '0,42') )
]

for t in tests:
self.assertEqual(c._get_assembly_read_depths(t[0][0], t[0][1]), t[1])

Expand All @@ -652,7 +681,7 @@ def test_get_samtools_variant_positions(self):
('16__cat_2_M35190.scaffold.1', 179),
('16__cat_2_M35190.scaffold.1', 263),
('16__cat_2_M35190.scaffold.6', 93)
]
]
self.assertEqual(expected, c._get_samtools_variant_positions())


Expand Down Expand Up @@ -764,4 +793,4 @@ def test_make_report_lines_assembly_fail(self):
]
self.assertEqual(expected, c.report_lines)
clean_cluster_dir(cluster_dir)

Loading