From b3d00f884fabaed4fddfd269a0c9af040c42061a Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 12:09:20 +0000 Subject: [PATCH 01/25] Start writing ReadStore, new method _sort_file --- ariba/read_store.py | 22 +++++++++++++++++++ ariba/tests/data/read_store_test_sort_file.in | 10 +++++++++ .../tests/data/read_store_test_sort_file.out | 10 +++++++++ ariba/tests/read_store_test.py | 21 ++++++++++++++++++ 4 files changed, 63 insertions(+) create mode 100644 ariba/read_store.py create mode 100644 ariba/tests/data/read_store_test_sort_file.in create mode 100644 ariba/tests/data/read_store_test_sort_file.out create mode 100644 ariba/tests/read_store_test.py diff --git a/ariba/read_store.py b/ariba/read_store.py new file mode 100644 index 00000000..5cafebda --- /dev/null +++ b/ariba/read_store.py @@ -0,0 +1,22 @@ +import pysam +import pyfastaq +import os +from ariba import common + +class Error (Exception): pass + +class ReadStore: + def __init__(self, infile, outfile, log_fh=None): + self.infile = os.path.abspath(infile) + self.outfile = os.path.abspath(outfile) + self.log_fh = log_fh + + if not os.path.exists(self.infile): + raise Error('File not found ' + self.infile + '. Cannot continue') + + + @staticmethod + def _sort_file(infile, outfile, log_fh=None): + cmd = 'sort -k1,1 -k 2,2n ' + infile + ' > ' + outfile + verbose = log_fh is not None + common.syscall(cmd, verbose=verbose, verbose_filehandle=log_fh) diff --git a/ariba/tests/data/read_store_test_sort_file.in b/ariba/tests/data/read_store_test_sort_file.in new file mode 100644 index 00000000..497d6298 --- /dev/null +++ b/ariba/tests/data/read_store_test_sort_file.in @@ -0,0 +1,10 @@ +cluster2 1 ACGT ABCD +cluster3 2 ACGT ABCD +cluster2 2 ACGT ABCD +cluster2 3 ACGT ABCD +cluster2 4 ACGT ABCD +cluster2 11 ACGT ABCD +cluster2 12 ACGT ABCD +cluster1 2 ACGT ABCD +cluster1 1 ACGT ABCD +cluster3 1 ACGT ABCD diff --git a/ariba/tests/data/read_store_test_sort_file.out b/ariba/tests/data/read_store_test_sort_file.out new file mode 100644 index 00000000..e1b0556d --- /dev/null +++ b/ariba/tests/data/read_store_test_sort_file.out @@ -0,0 +1,10 @@ +cluster1 1 ACGT ABCD +cluster1 2 ACGT ABCD +cluster2 1 ACGT ABCD +cluster2 2 ACGT ABCD +cluster2 3 ACGT ABCD +cluster2 4 ACGT ABCD +cluster2 11 ACGT ABCD +cluster2 12 ACGT ABCD +cluster3 1 ACGT ABCD +cluster3 2 ACGT ABCD diff --git a/ariba/tests/read_store_test.py b/ariba/tests/read_store_test.py new file mode 100644 index 00000000..faadb01a --- /dev/null +++ b/ariba/tests/read_store_test.py @@ -0,0 +1,21 @@ +import unittest +import sys +import os +import shutil +import filecmp +from ariba import read_store + +modules_dir = os.path.dirname(os.path.abspath(read_store.__file__)) +data_dir = os.path.join(modules_dir, 'tests', 'data') + + +class TestReadStore(unittest.TestCase): + def test_sort_file(self): + '''test _sort_file''' + infile = os.path.join(data_dir, 'read_store_test_sort_file.in') + expected = os.path.join(data_dir, 'read_store_test_sort_file.out') + tmpfile = 'tmp.read_store_test_sort_file.out' + read_store.ReadStore._sort_file(infile, tmpfile) + self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False)) + os.unlink(tmpfile) + From c1128a590562b98e34435019f5a540944bf9806d Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 12:25:58 +0000 Subject: [PATCH 02/25] Add read_store --- ariba/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ariba/__init__.py b/ariba/__init__.py index 4996dd52..12a0027a 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py 
@@ -23,6 +23,7 @@ 'histogram', 'link', 'mapping', + 'read_store', 'reference_data', 'ref_genes_getter', 'ref_preparer', From a73a986bbb14fc2af94cb639602a99537ca2972f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 12:26:12 +0000 Subject: [PATCH 03/25] New method _compress_and_index_file --- ariba/read_store.py | 9 +++++++ ...read_store_test_compress_and_index_file.in | 10 +++++++ ariba/tests/read_store_test.py | 26 +++++++++++++++++++ 3 files changed, 45 insertions(+) create mode 100644 ariba/tests/data/read_store_test_compress_and_index_file.in diff --git a/ariba/read_store.py b/ariba/read_store.py index 5cafebda..3a064d53 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -1,6 +1,7 @@ import pysam import pyfastaq import os +import pysam from ariba import common class Error (Exception): pass @@ -20,3 +21,11 @@ def _sort_file(infile, outfile, log_fh=None): cmd = 'sort -k1,1 -k 2,2n ' + infile + ' > ' + outfile verbose = log_fh is not None common.syscall(cmd, verbose=verbose, verbose_filehandle=log_fh) + + + @staticmethod + def _compress_and_index_file(infile, log_fh=None): + if log_fh is not None: + print('Compressing file', infile, file=log_fh, flush=True) + pysam.tabix_compress(infile, infile + '.gz') + pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1) diff --git a/ariba/tests/data/read_store_test_compress_and_index_file.in b/ariba/tests/data/read_store_test_compress_and_index_file.in new file mode 100644 index 00000000..e1b0556d --- /dev/null +++ b/ariba/tests/data/read_store_test_compress_and_index_file.in @@ -0,0 +1,10 @@ +cluster1 1 ACGT ABCD +cluster1 2 ACGT ABCD +cluster2 1 ACGT ABCD +cluster2 2 ACGT ABCD +cluster2 3 ACGT ABCD +cluster2 4 ACGT ABCD +cluster2 11 ACGT ABCD +cluster2 12 ACGT ABCD +cluster3 1 ACGT ABCD +cluster3 2 ACGT ABCD diff --git a/ariba/tests/read_store_test.py b/ariba/tests/read_store_test.py index faadb01a..21e75ec3 100644 --- a/ariba/tests/read_store_test.py +++ b/ariba/tests/read_store_test.py @@ -3,12 +3,20 @@ import os import shutil import filecmp +import pyfastaq from ariba import read_store modules_dir = os.path.dirname(os.path.abspath(read_store.__file__)) data_dir = os.path.join(modules_dir, 'tests', 'data') +def file_to_list(infile): + f = pyfastaq.utils.open_file_read(infile) + lines = [x for x in f.readlines()] + pyfastaq.utils.close(f) + return lines + + class TestReadStore(unittest.TestCase): def test_sort_file(self): '''test _sort_file''' @@ -19,3 +27,21 @@ def test_sort_file(self): self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False)) os.unlink(tmpfile) + + def test_compress_and_index_file(self): + '''Test _compress_and_index_file''' + infile = os.path.join(data_dir, 'read_store_test_compress_and_index_file.in') + tmpfile = 'tmp.test_compress_and_index_file.in' + tmpfile_gz = 'tmp.test_compress_and_index_file.in.gz' + shutil.copyfile(infile, tmpfile) + read_store.ReadStore._compress_and_index_file(tmpfile) + self.assertTrue(os.path.exists(tmpfile_gz)) + expected_lines = file_to_list(infile) + got_lines = file_to_list(tmpfile_gz) + self.assertEqual(expected_lines, got_lines) + self.assertTrue(os.path.exists(tmpfile_gz + '.tbi')) + os.unlink(tmpfile) + os.unlink(tmpfile_gz) + os.unlink(tmpfile_gz + '.tbi') + + From 8fc3dcba9825977c59e246db5ab8574cd01ce7d7 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 12:28:59 +0000 Subject: [PATCH 04/25] sort and compress in __init__ --- ariba/read_store.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git 
a/ariba/read_store.py b/ariba/read_store.py index 3a064d53..2715deea 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -7,14 +7,19 @@ class Error (Exception): pass class ReadStore: - def __init__(self, infile, outfile, log_fh=None): + def __init__(self, infile, outprefix, log_fh=None): self.infile = os.path.abspath(infile) - self.outfile = os.path.abspath(outfile) + self.outprefix = os.path.abspath(outprefix) self.log_fh = log_fh + self.outfile = os.path.abspath(outprefix) + '.gz' if not os.path.exists(self.infile): raise Error('File not found ' + self.infile + '. Cannot continue') + self._sort_file(self.infile, self.outprefix, log_fh=self.log_fh) + self._compress_and_index_file(self.outprefix, log_fh=self.log_fh) + os.unlink(self.outprefix) + @staticmethod def _sort_file(infile, outfile, log_fh=None): From 5ae61dc1efff02923b3e4fa282af5e0cef8d324e Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 12:45:13 +0000 Subject: [PATCH 05/25] New method get_reads --- ariba/read_store.py | 17 +++++++++++++++++ ariba/tests/data/read_store_test_get_reads.in | 10 ++++++++++ .../data/read_store_test_get_reads.reads_1.fq | 12 ++++++++++++ .../data/read_store_test_get_reads.reads_2.fq | 12 ++++++++++++ ariba/tests/read_store_test.py | 17 +++++++++++++++++ 5 files changed, 68 insertions(+) create mode 100644 ariba/tests/data/read_store_test_get_reads.in create mode 100644 ariba/tests/data/read_store_test_get_reads.reads_1.fq create mode 100644 ariba/tests/data/read_store_test_get_reads.reads_2.fq diff --git a/ariba/read_store.py b/ariba/read_store.py index 2715deea..4cb76ec8 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -34,3 +34,20 @@ def _compress_and_index_file(infile, log_fh=None): print('Compressing file', infile, file=log_fh, flush=True) pysam.tabix_compress(infile, infile + '.gz') pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1) + + + def get_reads(self, cluster_name, out1, out2): + tabix_file = pysam.TabixFile(self.outfile) + f_out1 = pyfastaq.utils.open_file_write(out1) + f_out2 = pyfastaq.utils.open_file_write(out2) + + for line in tabix_file.fetch(reference=cluster_name): + cluster, number, seq, qual = line.rstrip().split() + number = int(number) + if number % 2 == 0: + print(str(number - 1) + '/2', seq, '+', qual, sep='\n', file=f_out2) + else: + print(str(number) + '/1', seq, '+', qual, sep='\n', file=f_out1) + + pyfastaq.utils.close(f_out1) + pyfastaq.utils.close(f_out2) diff --git a/ariba/tests/data/read_store_test_get_reads.in b/ariba/tests/data/read_store_test_get_reads.in new file mode 100644 index 00000000..6cfcecb4 --- /dev/null +++ b/ariba/tests/data/read_store_test_get_reads.in @@ -0,0 +1,10 @@ +cluster1 1 AAAA ABCD +cluster1 2 CCCC IIII +cluster2 1 AAAA HIHI +cluster2 2 CCCC GGGG +cluster2 3 GGGG DEFG +cluster2 4 TTTT GFED +cluster2 11 ACGT CDEF +cluster2 12 TGCA CDEF +cluster3 1 AGGG ABCD +cluster3 2 AGGG ABCD diff --git a/ariba/tests/data/read_store_test_get_reads.reads_1.fq b/ariba/tests/data/read_store_test_get_reads.reads_1.fq new file mode 100644 index 00000000..7ceb45de --- /dev/null +++ b/ariba/tests/data/read_store_test_get_reads.reads_1.fq @@ -0,0 +1,12 @@ +1/1 +AAAA ++ +HIHI +3/1 +GGGG ++ +DEFG +11/1 +ACGT ++ +CDEF diff --git a/ariba/tests/data/read_store_test_get_reads.reads_2.fq b/ariba/tests/data/read_store_test_get_reads.reads_2.fq new file mode 100644 index 00000000..73e7389c --- /dev/null +++ b/ariba/tests/data/read_store_test_get_reads.reads_2.fq @@ -0,0 +1,12 @@ +1/2 +CCCC ++ +GGGG +3/2 +TTTT ++ +GFED 
+11/2 +TGCA ++ +CDEF diff --git a/ariba/tests/read_store_test.py b/ariba/tests/read_store_test.py index 21e75ec3..0e932619 100644 --- a/ariba/tests/read_store_test.py +++ b/ariba/tests/read_store_test.py @@ -45,3 +45,20 @@ def test_compress_and_index_file(self): os.unlink(tmpfile_gz + '.tbi') + def test_get_reads(self): + '''Test get_reads''' + infile = os.path.join(data_dir, 'read_store_test_get_reads.in') + expected1 = os.path.join(data_dir, 'read_store_test_get_reads.reads_1.fq') + expected2 = os.path.join(data_dir, 'read_store_test_get_reads.reads_2.fq') + outprefix = 'tmp.read_store_test_get_reads' + reads1 = outprefix + '.reads_1.fq' + reads2 = outprefix + '.reads_2.fq' + rstore = read_store.ReadStore(infile, outprefix) + rstore.get_reads('cluster2', reads1, reads2) + self.assertTrue(filecmp.cmp(expected1, reads1)) + self.assertTrue(filecmp.cmp(expected2, reads2)) + os.unlink(outprefix + '.gz') + os.unlink(outprefix + '.gz.tbi') + os.unlink(reads1) + os.unlink(reads2) + From 1dc47459aa518dba64cde4878ea106527db18cc7 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 12:45:55 +0000 Subject: [PATCH 06/25] remove one import pysam (was there twice) --- ariba/read_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ariba/read_store.py b/ariba/read_store.py index 4cb76ec8..5574cd96 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -1,4 +1,3 @@ -import pysam import pyfastaq import os import pysam From b9808be362efe6e4f38ecd40cdd6e1021c804450 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 12:49:34 +0000 Subject: [PATCH 07/25] New method clean --- ariba/read_store.py | 5 +++++ ariba/tests/data/read_store_test_clean.in | 10 ++++++++++ ariba/tests/read_store_test.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 ariba/tests/data/read_store_test_clean.in diff --git a/ariba/read_store.py b/ariba/read_store.py index 5574cd96..5dd1a331 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -50,3 +50,8 @@ def get_reads(self, cluster_name, out1, out2): pyfastaq.utils.close(f_out1) pyfastaq.utils.close(f_out2) + + + def clean(self): + os.unlink(self.outfile) + os.unlink(self.outfile + '.tbi') diff --git a/ariba/tests/data/read_store_test_clean.in b/ariba/tests/data/read_store_test_clean.in new file mode 100644 index 00000000..6cfcecb4 --- /dev/null +++ b/ariba/tests/data/read_store_test_clean.in @@ -0,0 +1,10 @@ +cluster1 1 AAAA ABCD +cluster1 2 CCCC IIII +cluster2 1 AAAA HIHI +cluster2 2 CCCC GGGG +cluster2 3 GGGG DEFG +cluster2 4 TTTT GFED +cluster2 11 ACGT CDEF +cluster2 12 TGCA CDEF +cluster3 1 AGGG ABCD +cluster3 2 AGGG ABCD diff --git a/ariba/tests/read_store_test.py b/ariba/tests/read_store_test.py index 0e932619..842eed5e 100644 --- a/ariba/tests/read_store_test.py +++ b/ariba/tests/read_store_test.py @@ -62,3 +62,19 @@ def test_get_reads(self): os.unlink(reads1) os.unlink(reads2) + + def test_clean(self): + '''Test clean''' + infile = os.path.join(data_dir, 'read_store_test_clean.in') + outprefix = 'tmp.read_store_test_clean' + self.assertFalse(os.path.exists(outprefix)) + self.assertFalse(os.path.exists(outprefix + '.gz')) + self.assertFalse(os.path.exists(outprefix + '.gz.tbi')) + rstore = read_store.ReadStore(infile, outprefix) + self.assertFalse(os.path.exists(outprefix)) + self.assertTrue(os.path.exists(outprefix + '.gz')) + self.assertTrue(os.path.exists(outprefix + '.gz.tbi')) + rstore.clean() + self.assertFalse(os.path.exists(outprefix)) + self.assertFalse(os.path.exists(outprefix + '.gz')) + 
self.assertFalse(os.path.exists(outprefix + '.gz.tbi')) From 39a20f538e8283146a11468cec30b7f435838ebe Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 13:12:53 +0000 Subject: [PATCH 08/25] Make input files if root_dir does not exist --- ariba/cluster.py | 39 ++++++++++++++++++++++++++++--------- ariba/tests/cluster_test.py | 3 --- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/ariba/cluster.py b/ariba/cluster.py index e71e3f7d..bfe4b7e9 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -17,6 +17,8 @@ def __init__(self, refdata, total_reads, total_reads_bases, + read_store=None, + reference_names=None, logfile=None, assembly_coverage=50, assembly_kmer=21, @@ -42,13 +44,14 @@ def __init__(self, extern_progs=None, random_seed=42, ): - self.root_dir = os.path.abspath(root_dir) - if not os.path.exists(self.root_dir): - raise Error('Directory ' + self.root_dir + ' not found. Cannot continue') - - self.name = name + self.read_store = read_store self.refdata = refdata + self.name = name + self.reference_fa = os.path.join(self.root_dir, 'reference.fa') + self.reference_names = reference_names + self._set_up_input_files() + self.total_reads = total_reads self.total_reads_bases = total_reads_bases self.logfile = logfile @@ -60,12 +63,8 @@ def __init__(self, self.reads_insert = reads_insert self.spades_other_options = spades_other_options - self.all_reads1 = os.path.join(self.root_dir, 'reads_1.fq') - self.all_reads2 = os.path.join(self.root_dir, 'reads_2.fq') self.reads_for_assembly1 = os.path.join(self.root_dir, 'reads_for_assembly_1.fq') self.reads_for_assembly2 = os.path.join(self.root_dir, 'reads_for_assembly_2.fq') - self.reference_fa = os.path.join(self.root_dir, 'reference.fa') - self.references_fa = os.path.join(self.root_dir, 'references.fa') for fname in [self.all_reads1, self.all_reads2, self.references_fa]: if not os.path.exists(fname): @@ -124,6 +123,28 @@ def __init__(self, self.random_seed = random_seed + def _set_up_input_files(self): + self.all_reads1 = os.path.join(self.root_dir, 'reads_1.fq') + self.all_reads2 = os.path.join(self.root_dir, 'reads_2.fq') + self.references_fa = os.path.join(self.root_dir, 'references.fa') + + if os.path.exists(self.root_dir): + assert self.read_store is None + if not (os.path.exists(self.all_reads1) and os.path.exists(self.all_reads2)): + raise Error('Error making cluster. Reads files not found') + if not os.path.exists(self.references_fa): + raise Error('Error making cluster. 
References fasta not found') + else: + assert self.read_store is not None + assert self.reference_names is not None + try: + os.mkdir(self.root_dir) + except: + raise Error('Error making directory ' + seolf.root_dir) + self.read_store.get_reads(self.name, self.all_reads1, self.all_reads2) + self.refdata.write_seqs_to_fasta(self.references_fa, self.reference_names) + + def _clean_file(self, filename): if self.clean: print('Deleting file', filename, file=self.log_fh) diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index 76d1f43d..7ccdd61d 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -51,9 +51,6 @@ def test_init_fail_files_missing(self): c = cluster.Cluster(tmpdir, 'name', refdata=refdata, total_reads=42, total_reads_bases=4242) shutil.rmtree(tmpdir) - with self.assertRaises(cluster.Error): - c = cluster.Cluster('directorydoesnotexistshouldthrowerror', 'name', refdata=refdata, total_reads=42, total_reads_bases=4242) - def test_number_of_reads_for_assembly(self): '''Test _number_of_reads_for_assembly''' From b8fafa0a7608169faebd5c26f5e6623283ace8f3 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 14:42:05 +0000 Subject: [PATCH 09/25] Protect against overwriting infile --- ariba/read_store.py | 1 + .../clusters_test_bam_to_clusters.out.ref1.reads_1.fq | 8 -------- .../clusters_test_bam_to_clusters.out.ref1.reads_2.fq | 8 -------- .../clusters_test_bam_to_clusters.out.ref2.reads_1.fq | 4 ---- .../clusters_test_bam_to_clusters.out.ref2.reads_2.fq | 4 ---- 5 files changed, 1 insertion(+), 24 deletions(-) delete mode 100644 ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_1.fq delete mode 100644 ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_2.fq delete mode 100644 ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_1.fq delete mode 100644 ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_2.fq diff --git a/ariba/read_store.py b/ariba/read_store.py index 5dd1a331..0c314641 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -7,6 +7,7 @@ class Error (Exception): pass class ReadStore: def __init__(self, infile, outprefix, log_fh=None): + assert infile != outprefix self.infile = os.path.abspath(infile) self.outprefix = os.path.abspath(outprefix) self.log_fh = log_fh diff --git a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_1.fq b/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_1.fq deleted file mode 100644 index c1510f4a..00000000 --- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_1.fq +++ /dev/null @@ -1,8 +0,0 @@ -@read1/1 -GTATATGGTGGGTCGTCATGGAAGCAGTACCTATCAGCATAGCTGCACTACCCTACATGC -+ -IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII -@read2/1 -CGGCTTAACGGCACTTTTCCACGCAAGTGTTGCTCTGAAAAGTTGGGACTTATGTCTTCC -+ -IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_2.fq b/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_2.fq deleted file mode 100644 index 532fbc9c..00000000 --- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref1.reads_2.fq +++ /dev/null @@ -1,8 +0,0 @@ -@read1/2 -CTTCGAGTGCCCCAACACAAATTCGCATCCTCTAGGGGGGTTTTTCGTTTTGCGAGTATC -+ -IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII -@read2/2 -GGGCCCGTGGTAGTTAGACTAGAGGAATAGCTGAGAGTTGACATTTACGGTGGGAACAGC -+ -IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git 
a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_1.fq b/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_1.fq deleted file mode 100644 index f62449d9..00000000 --- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_1.fq +++ /dev/null @@ -1,4 +0,0 @@ -@read2/1 -CGGCTTAACGGCACTTTTCCACGCAAGTGTTGCTCTGAAAAGTTGGGACTTATGTCTTCC -+ -IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII diff --git a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_2.fq b/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_2.fq deleted file mode 100644 index 8ebade74..00000000 --- a/ariba/tests/data/clusters_test_bam_to_clusters.out.ref2.reads_2.fq +++ /dev/null @@ -1,4 +0,0 @@ -@read2/2 -GGGCCCGTGGTAGTTAGACTAGAGGAATAGCTGAGAGTTGACATTTACGGTGGGAACAGC -+ -IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII From a43c7ee8c53fac5e69c76895dd78e4d4efd02991 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 14:42:33 +0000 Subject: [PATCH 10/25] Make read store file instead of fastq files for each cluster --- ariba/clusters.py | 32 +++++++++++++------------------- ariba/tests/clusters_test.py | 21 ++++++++++----------- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index 338d41f2..ae3cc321 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -8,7 +8,7 @@ import multiprocessing import pysam import pyfastaq -from ariba import cluster, common, mapping, histogram, report, report_filter, reference_data +from ariba import cluster, common, mapping, histogram, read_store, report, report_filter, reference_data class Error (Exception): pass @@ -171,9 +171,10 @@ def _map_reads_to_clustered_genes(self): def _bam_to_clusters_reads(self): - '''Sets up Cluster directories (one for each gene that has reads that mapped to it), writes reads fwd and rev files. Also gathers histogram data of insert size''' - filehandles_1 = {} # gene name -> filehandle of fwd_reads - filehandles_2 = {} # gene name -> filehandle of rev_reads + '''Sets up ReadStore of reads for all the clusters. 
Also gathers histogram data of insert size''' + reads_file_for_read_store = os.path.join(self.outdir, 'reads') + f_out = pyfastaq.utils.open_file_write(reads_file_for_read_store) + sam_reader = pysam.Samfile(self.bam, "rb") sam1 = None self.proper_pairs = 0 @@ -201,36 +202,29 @@ def _bam_to_clusters_reads(self): for ref in ref_seqs: if ref not in self.cluster_to_dir: - assert ref not in filehandles_1 - assert ref not in filehandles_2 - new_dir = os.path.join(self.clusters_outdir, ref) - try: - os.mkdir(new_dir) - except: - raise Error('Error mkdir ' + new_dir) - self.cluster_to_dir[ref] = new_dir - filehandles_1[ref] = pyfastaq.utils.open_file_write(os.path.join(new_dir, 'reads_1.fq')) - filehandles_2[ref] = pyfastaq.utils.open_file_write(os.path.join(new_dir, 'reads_2.fq')) if self.verbose: print('New cluster with reads that hit:', ref, flush=True) - print(read1, file=filehandles_1[ref]) - print(read2, file=filehandles_2[ref]) self.cluster_read_counts[ref] = self.cluster_read_counts.get(ref, 0) + 2 self.cluster_base_counts[ref] = self.cluster_base_counts.get(ref, 0) + len(read1) + len(read2) + print(ref, self.cluster_read_counts[ref] - 1, read1.seq, read1.qual, sep='\t', file=f_out) + print(ref, self.cluster_read_counts[ref], read2.seq, read2.qual, sep='\t', file=f_out) sam1 = None - for ref in filehandles_1: - pyfastaq.utils.close(filehandles_1[ref]) - pyfastaq.utils.close(filehandles_2[ref]) + pyfastaq.utils.close(f_out) + + if len(self.cluster_read_counts): + self.read_store = read_store.ReadStore(reads_file_for_read_store, os.path.join(self.outdir, 'read_store')) + os.unlink(reads_file_for_read_store) if self.verbose: print('Found', self.proper_pairs, 'proper read pairs') print('Total clusters to perform local assemblies:', len(self.cluster_to_dir), flush=True) + def _set_insert_size_data(self): if len(self.insert_hist) == 0: return False diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py index 65338845..bcef378c 100644 --- a/ariba/tests/clusters_test.py +++ b/ariba/tests/clusters_test.py @@ -12,6 +12,13 @@ extern_progs = external_progs.ExternalProgs() +def file_to_list(infile): + f = pyfastaq.utils.open_file_read(infile) + lines = [x for x in f.readlines()] + pyfastaq.utils.close(f) + return lines + + class TestClusters(unittest.TestCase): def setUp(self): self.cluster_dir = 'tmp.Cluster' @@ -108,18 +115,10 @@ def test_bam_to_clusters_reads(self): os.path.join(data_dir, 'clusters_test_bam_to_clusters.out.ref2.reads_2.fq'), ] - got = [ - os.path.join(clusters_dir, 'Clusters/ref1/reads_1.fq'), - os.path.join(clusters_dir, 'Clusters/ref1/reads_2.fq'), - os.path.join(clusters_dir, 'Clusters/ref2/reads_1.fq'), - os.path.join(clusters_dir, 'Clusters/ref2/reads_2.fq'), - ] - - - for i in range(len(got)): - self.assertTrue(os.path.exists(got[i])) - self.assertTrue(filecmp.cmp(expected[i], got[i], shallow=False)) + got_reads_store_lines = file_to_list(os.path.join(clusters_dir, 'read_store.gz')) + expected_reads_store_lines = file_to_list(os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.read_store.gz')) + self.assertEqual(expected_reads_store_lines, got_reads_store_lines) self.assertEqual({780:1}, c.insert_hist.bins) self.assertEqual({'ref1': 4, 'ref2': 2}, c.cluster_read_counts) self.assertEqual({'ref1': 240, 'ref2': 120}, c.cluster_base_counts) From c58a855bd7bffebcdb7ea9bec5c70ab6c61617d7 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 14:54:28 +0000 Subject: [PATCH 11/25] Use read store in each cluster --- ariba/clusters.py | 16 
++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index ae3cc321..3920040b 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -217,7 +217,17 @@ def _bam_to_clusters_reads(self): pyfastaq.utils.close(f_out) if len(self.cluster_read_counts): - self.read_store = read_store.ReadStore(reads_file_for_read_store, os.path.join(self.outdir, 'read_store')) + if self.verbose: + filehandle = sys.stdout + else: + filehandle = None + + self.read_store = read_store.ReadStore( + reads_file_for_read_store, + os.path.join(self.outdir, 'read_store'), + log_fh=filehandle + ) + os.unlink(reads_file_for_read_store) if self.verbose: @@ -260,7 +270,7 @@ def _init_and_run_clusters(self): if self.verbose: print('Constructing cluster', seq_name + '.', counter, 'of', str(len(self.cluster_to_dir))) new_dir = self.cluster_to_dir[seq_name] - self.refdata.write_seqs_to_fasta(os.path.join(new_dir, 'references.fa'), self.cluster_ids[seq_type][seq_name]) + #self.refdata.write_seqs_to_fasta(os.path.join(new_dir, 'references.fa'), self.cluster_ids[seq_type][seq_name]) self.log_files.append(os.path.join(self.logs_dir, seq_name + '.log')) cluster_list.append(cluster.Cluster( @@ -269,6 +279,8 @@ def _init_and_run_clusters(self): self.refdata, self.cluster_read_counts[seq_name], self.cluster_base_counts[seq_name], + read_store=self.read_store, + reference_names=self.cluster_ids[seq_type][seq_name], logfile=self.log_files[-1], assembly_coverage=self.assembly_coverage, assembly_kmer=self.assembly_kmer, From 40ed0fca786be118b6927b051a6ecae60966bbc5 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 14:59:40 +0000 Subject: [PATCH 12/25] Write message when getting reads --- ariba/read_store.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ariba/read_store.py b/ariba/read_store.py index 0c314641..0c898fa5 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -37,6 +37,8 @@ def _compress_and_index_file(infile, log_fh=None): def get_reads(self, cluster_name, out1, out2): + if self.log_fh is not None: + print('Getting reads for', cluster_name, 'from', self.outfile, file=self.log_fh) tabix_file = pysam.TabixFile(self.outfile) f_out1 = pyfastaq.utils.open_file_write(out1) f_out2 = pyfastaq.utils.open_file_write(out2) @@ -51,6 +53,8 @@ def get_reads(self, cluster_name, out1, out2): pyfastaq.utils.close(f_out1) pyfastaq.utils.close(f_out2) + if self.log_fh is not None: + print('Finished getting reads for', cluster_name, 'from', self.outfile, file=self.log_fh) def clean(self): From 1032409b0b4b1c3b4640b07610293a70ee10f723 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 15:14:29 +0000 Subject: [PATCH 13/25] Add missing @ in fastq file name lines --- ariba/read_store.py | 5 ++--- ariba/tests/data/read_store_test_get_reads.reads_1.fq | 6 +++--- ariba/tests/data/read_store_test_get_reads.reads_2.fq | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ariba/read_store.py b/ariba/read_store.py index 0c898fa5..a73f390b 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -47,16 +47,15 @@ def get_reads(self, cluster_name, out1, out2): cluster, number, seq, qual = line.rstrip().split() number = int(number) if number % 2 == 0: - print(str(number - 1) + '/2', seq, '+', qual, sep='\n', file=f_out2) + print('@' + str(number - 1) + '/2', seq, '+', qual, sep='\n', file=f_out2) else: - print(str(number) + '/1', seq, '+', qual, sep='\n', file=f_out1) + print('@' + str(number) + '/1', seq, '+', qual, sep='\n', 
file=f_out1) pyfastaq.utils.close(f_out1) pyfastaq.utils.close(f_out2) if self.log_fh is not None: print('Finished getting reads for', cluster_name, 'from', self.outfile, file=self.log_fh) - def clean(self): os.unlink(self.outfile) os.unlink(self.outfile + '.tbi') diff --git a/ariba/tests/data/read_store_test_get_reads.reads_1.fq b/ariba/tests/data/read_store_test_get_reads.reads_1.fq index 7ceb45de..1f6a519f 100644 --- a/ariba/tests/data/read_store_test_get_reads.reads_1.fq +++ b/ariba/tests/data/read_store_test_get_reads.reads_1.fq @@ -1,12 +1,12 @@ -1/1 +@1/1 AAAA + HIHI -3/1 +@3/1 GGGG + DEFG -11/1 +@11/1 ACGT + CDEF diff --git a/ariba/tests/data/read_store_test_get_reads.reads_2.fq b/ariba/tests/data/read_store_test_get_reads.reads_2.fq index 73e7389c..2270fd18 100644 --- a/ariba/tests/data/read_store_test_get_reads.reads_2.fq +++ b/ariba/tests/data/read_store_test_get_reads.reads_2.fq @@ -1,12 +1,12 @@ -1/2 +@1/2 CCCC + GGGG -3/2 +@3/2 TTTT + GFED -11/2 +@11/2 TGCA + CDEF From c41b05c464cc2de22a8775fbcf8419d7610b7195 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 15:14:57 +0000 Subject: [PATCH 14/25] Add missing test file --- ...sters_test_bam_to_clusters_reads.read_store.gz | Bin 0 -> 213 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 ariba/tests/data/clusters_test_bam_to_clusters_reads.read_store.gz diff --git a/ariba/tests/data/clusters_test_bam_to_clusters_reads.read_store.gz b/ariba/tests/data/clusters_test_bam_to_clusters_reads.read_store.gz new file mode 100644 index 0000000000000000000000000000000000000000..02b0e0fa9c93236a6966b8cec562322be46c89d4 GIT binary patch literal 213 zcmb2|=3rp}f&Xj_PR>jWI~a~mI>>v(fP>|JjQPiC%4(Ht(#02G7*0P;pq+5i9m literal 0 HcmV?d00001 From 97d66e6702e782d908e41a028630d2ad5560fca6 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 15:15:12 +0000 Subject: [PATCH 15/25] Pass clean option --- ariba/cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ariba/cluster.py b/ariba/cluster.py index bfe4b7e9..1e048a85 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -275,7 +275,8 @@ def run(self): sspace_k=self.sspace_k, sspace_sd=self.sspace_sd, reads_insert=self.reads_insert, - extern_progs=self.extern_progs + extern_progs=self.extern_progs, + clean=self.clean ) self.assembly.run() From 7176e8072b2c25016b0c4f9bb2d50ba43257cb6c Mon Sep 17 00:00:00 2001 From: martinghunt Date: Fri, 29 Apr 2016 15:26:48 +0000 Subject: [PATCH 16/25] extract reads and references in run(), not __init__() --- ariba/cluster.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/ariba/cluster.py b/ariba/cluster.py index 1e048a85..b213c4e9 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -50,7 +50,12 @@ def __init__(self, self.name = name self.reference_fa = os.path.join(self.root_dir, 'reference.fa') self.reference_names = reference_names - self._set_up_input_files() + self.all_reads1 = os.path.join(self.root_dir, 'reads_1.fq') + self.all_reads2 = os.path.join(self.root_dir, 'reads_2.fq') + self.references_fa = os.path.join(self.root_dir, 'references.fa') + + if os.path.exists(self.root_dir): + self._input_files_exist() self.total_reads = total_reads self.total_reads_bases = total_reads_bases @@ -66,11 +71,6 @@ def __init__(self, self.reads_for_assembly1 = os.path.join(self.root_dir, 'reads_for_assembly_1.fq') self.reads_for_assembly2 = os.path.join(self.root_dir, 'reads_for_assembly_2.fq') - for fname in [self.all_reads1, 
self.all_reads2, self.references_fa]: - if not os.path.exists(fname): - raise Error('File ' + fname + ' not found. Cannot continue') - - self.ref_sequence = None self.max_insert = max_insert @@ -123,17 +123,17 @@ def __init__(self, self.random_seed = random_seed - def _set_up_input_files(self): - self.all_reads1 = os.path.join(self.root_dir, 'reads_1.fq') - self.all_reads2 = os.path.join(self.root_dir, 'reads_2.fq') - self.references_fa = os.path.join(self.root_dir, 'references.fa') + def _input_files_exist(self): + assert self.read_store is None + if not (os.path.exists(self.all_reads1) and os.path.exists(self.all_reads2)): + raise Error('Error making cluster. Reads files not found') + if not os.path.exists(self.references_fa): + raise Error('Error making cluster. References fasta not found') + + def _set_up_input_files(self): if os.path.exists(self.root_dir): - assert self.read_store is None - if not (os.path.exists(self.all_reads1) and os.path.exists(self.all_reads2)): - raise Error('Error making cluster. Reads files not found') - if not os.path.exists(self.references_fa): - raise Error('Error making cluster. References fasta not found') + self._input_files_exist() else: assert self.read_store is not None assert self.reference_names is not None @@ -232,6 +232,13 @@ def _make_reads_for_assembly(number_of_wanted_reads, total_reads, reads_in1, rea def run(self): if self.logfile is None: self.logfile = os.path.join(self.root_dir, 'log.txt') + + self._set_up_input_files() + + for fname in [self.all_reads1, self.all_reads2, self.references_fa]: + if not os.path.exists(fname): + raise Error('File ' + fname + ' not found. Cannot continue') + self.log_fh = pyfastaq.utils.open_file_write(self.logfile) print('{:_^79}'.format(' LOG FILE START ' + self.name + ' '), file=self.log_fh, flush=True) From a8420088db99fe45e322200998ef7eca112dd95e Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 08:30:01 +0000 Subject: [PATCH 17/25] Store assembled ref seqs in memory instead writing to file --- ariba/assembly_compare.py | 14 +++++++------- ariba/tests/assembly_compare_test.py | 15 ++++++++------- ...rite_assembled_reference_sequences.expected.fa | 7 ------- 3 files changed, 15 insertions(+), 21 deletions(-) delete mode 100644 ariba/tests/data/assembly_compare_write_assembled_reference_sequences.expected.fa diff --git a/ariba/assembly_compare.py b/ariba/assembly_compare.py index 9d03867c..54f3099b 100644 --- a/ariba/assembly_compare.py +++ b/ariba/assembly_compare.py @@ -137,13 +137,13 @@ def ref_cov_per_contig(nucmer_hits): @staticmethod - def _write_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly, outfile): + def _get_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly): '''nucmer_hits = hits made by self._parse_nucmer_coords_file. ref_gene = reference sequence (pyfastaq.sequences.Fasta object) assembly = dictionary of contig name -> contig. 
- Writes each piece of assembly that corresponds to the reference sequence - to a fasta file.''' - f = pyfastaq.utils.open_file_write(outfile) + Makes a set of Fasta objects of each piece of assembly that + corresponds to the reference sequeunce.''' + sequences = {} for contig in sorted(nucmer_hits): for hit in nucmer_hits[contig]: @@ -168,9 +168,9 @@ def _write_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly, ou if hit.hit_length_ref == hit.ref_length: fa.id += '.complete' - print(fa, file=f) + sequences[fa.id] = fa - pyfastaq.utils.close(f) + return sequences @staticmethod @@ -287,4 +287,4 @@ def run(self): self._run_nucmer() self.nucmer_hits = self._parse_nucmer_coords_file(self.nucmer_coords_file, self.ref_sequence.id) self.percent_identities = self._nucmer_hits_to_percent_identity(self.nucmer_hits) - self._write_assembled_reference_sequences(self.nucmer_hits, self.ref_sequence, self.assembly_sequences, self.assembled_ref_seqs_file) + self.assembled_reference_sequences = self._write_assembled_reference_sequences(self.nucmer_hits, self.ref_sequence, self.assembly_sequences, self.assembled_ref_seqs_file) diff --git a/ariba/tests/assembly_compare_test.py b/ariba/tests/assembly_compare_test.py index 7f88111b..671651fc 100644 --- a/ariba/tests/assembly_compare_test.py +++ b/ariba/tests/assembly_compare_test.py @@ -133,8 +133,8 @@ def test_ref_cov_per_contig(self): self.assertEqual(expected, got) - def test_write_assembled_reference_sequences(self): - '''test _write_assembled_reference_sequences''' + def test_get_assembled_reference_sequences(self): + '''test _get_assembled_reference_sequences''' ref_sequence = pyfastaq.sequences.Fasta('ref_seq', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG') assembly = { 'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'), @@ -154,11 +154,12 @@ def test_write_assembled_reference_sequences(self): ] } - tmp_outfile = 'tmp.test_nucmer_hits_to_assembled_gene_sequences.out.fa' - assembly_compare.AssemblyCompare._write_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly, tmp_outfile) - expected_outfile = os.path.join(data_dir, 'assembly_compare_write_assembled_reference_sequences.expected.fa') - self.assertTrue(filecmp.cmp(tmp_outfile, expected_outfile, shallow=False)) - os.unlink(tmp_outfile) + expected = {'ref_seq.1.147.contig1.61.207.+.complete': pyfastaq.sequences.Fasta('ref_seq.1.147.contig1.61.207.+.complete', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG'), + 'ref_seq.18.120.contig2.1.103.-': pyfastaq.sequences.Fasta('ref_seq.18.120.contig2.1.103.-', 'TTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT') + } + + got = assembly_compare.AssemblyCompare._get_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly) + self.assertEqual(expected, got) def test_whole_gene_covered_by_nucmer_hits(self): diff --git a/ariba/tests/data/assembly_compare_write_assembled_reference_sequences.expected.fa b/ariba/tests/data/assembly_compare_write_assembled_reference_sequences.expected.fa deleted file mode 100644 index 9fecb1a5..00000000 --- 
a/ariba/tests/data/assembly_compare_write_assembled_reference_sequences.expected.fa +++ /dev/null @@ -1,7 +0,0 @@ ->ref_seq.1.147.contig1.61.207.+.complete -ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAAT -TATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT -GCCAGTGGCATCTGTGTAAGCGCTTAG ->ref_seq.18.120.contig2.1.103.- -TTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCG -TTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT From cf088156a732f209027b74faee3acd231222881b Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 08:33:56 +0000 Subject: [PATCH 18/25] Fix function call to use new syntax --- ariba/assembly_compare.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ariba/assembly_compare.py b/ariba/assembly_compare.py index 54f3099b..6b9fe21f 100644 --- a/ariba/assembly_compare.py +++ b/ariba/assembly_compare.py @@ -34,7 +34,6 @@ def __init__(self, self.nucmer_coords_file = self.outprefix + '.nucmer.coords' self.nucmer_snps_file = self.nucmer_coords_file + '.snps' - self.assembled_ref_seqs_file = self.outprefix + '.assembled_refs.fasta' def _run_nucmer(self): @@ -287,4 +286,4 @@ def run(self): self._run_nucmer() self.nucmer_hits = self._parse_nucmer_coords_file(self.nucmer_coords_file, self.ref_sequence.id) self.percent_identities = self._nucmer_hits_to_percent_identity(self.nucmer_hits) - self.assembled_reference_sequences = self._write_assembled_reference_sequences(self.nucmer_hits, self.ref_sequence, self.assembly_sequences, self.assembled_ref_seqs_file) + self.assembled_reference_sequences = self._get_assembled_reference_sequences(self.nucmer_hits, self.ref_sequence, self.assembly_sequences) From 70f43db17140c033d05394bcbe9457f002fd9743 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 08:49:57 +0000 Subject: [PATCH 19/25] Get assembled seqs from memory, not files --- ariba/clusters.py | 9 ++++----- ariba/tests/clusters_test.py | 18 +++++++++++------- ...atted_assembled_genes_fasta.expected.out.fa | 10 +++++----- ...te_catted_assembled_genes_fasta.in.gene1.fa | 4 ---- ...te_catted_assembled_genes_fasta.in.gene2.fa | 2 -- 5 files changed, 20 insertions(+), 23 deletions(-) delete mode 100644 ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa delete mode 100644 ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa diff --git a/ariba/clusters.py b/ariba/clusters.py index 3920040b..3f232e4f 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -349,13 +349,12 @@ def _write_catted_assembled_seqs_fasta(self, outfile): for gene in sorted(self.clusters): try: - cluster_fasta = self.clusters[gene].assembly_compare.assembled_ref_seqs_file + seq_dict = self.clusters[gene].assembly_compare.assembled_reference_sequences except: continue - if os.path.exists(cluster_fasta): - file_reader = pyfastaq.sequences.file_reader(cluster_fasta) - for seq in file_reader: - print(seq, file=f) + + for seq_name in sorted(seq_dict): + print(seq_dict[seq_name], file=f) pyfastaq.utils.close(f) diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py index bcef378c..27c74954 100644 --- a/ariba/tests/clusters_test.py +++ b/ariba/tests/clusters_test.py @@ -171,18 +171,22 @@ def __init__(self, lines): def test_write_catted_assembled_seqs_fasta(self): '''test _write_catted_assembled_seqs_fasta''' + seq1 = pyfastaq.sequences.Fasta('seq1', 'ACGT') + seq2 = pyfastaq.sequences.Fasta('seq2', 'TTTT') + seq3 = pyfastaq.sequences.Fasta('seq3', 'AAAA') class FakeAssemblyCompare: - def 
__init__(self, filename): - self.assembled_ref_seqs_file = filename + def __init__(self, assembled_seqs): + if assembled_seqs is not None: + self.assembled_reference_sequences = {x.id: x for x in assembled_seqs} class FakeCluster: - def __init__(self, filename): - #self.final_assembled_genes_fa = filename - self.assembly_compare = FakeAssemblyCompare(filename) + def __init__(self, assembled_seqs): + self.assembly_compare = FakeAssemblyCompare(assembled_seqs) self.clusters.clusters = { - 'gene1': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa')), - 'gene2': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa')), + 'gene1': FakeCluster([seq1, seq2]), + 'gene2': FakeCluster([seq3]), + 'gene3': FakeCluster(None), } tmp_file = 'tmp.test_write_catted_assembled_seqs_fasta.fa' diff --git a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa index d340bc20..a6639817 100644 --- a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa +++ b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.expected.out.fa @@ -1,6 +1,6 @@ ->gene1.1 +>seq1 ACGT ->gene1.2 -CAT ->gene2 -GTGT +>seq2 +TTTT +>seq3 +AAAA diff --git a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa deleted file mode 100644 index 27aef244..00000000 --- a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa +++ /dev/null @@ -1,4 +0,0 @@ ->gene1.1 -ACGT ->gene1.2 -CAT diff --git a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa b/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa deleted file mode 100644 index 2697b9f2..00000000 --- a/ariba/tests/data/clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->gene2 -GTGT From 5d5868b840fde8cfa0a82d56aafccd004ca85a98 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 08:50:54 +0000 Subject: [PATCH 20/25] Typo in log message --- ariba/clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index 3f232e4f..ff32fa7d 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -362,7 +362,7 @@ def _write_catted_assembled_seqs_fasta(self, outfile): def _clean(self): if self.clean: if self.verbose: - print('Deleting clusters direcory', self.clusters_outdir) + print('Deleting clusters directory', self.clusters_outdir) shutil.rmtree(self.clusters_outdir) print('Deleting Logs directory', self.logs_dir) shutil.rmtree(self.logs_dir) From 727e61a2dbd5d73b86ee29ce654c9164341d47b9 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 08:54:12 +0000 Subject: [PATCH 21/25] Delete read store files --- ariba/clusters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ariba/clusters.py b/ariba/clusters.py index ff32fa7d..850827c2 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -366,6 +366,8 @@ def _clean(self): shutil.rmtree(self.clusters_outdir) print('Deleting Logs directory', self.logs_dir) shutil.rmtree(self.logs_dir) + print('Deleting reads store files', self.read_store.outfile + '[.tbi]') + self.read_store.clean() else: if self.verbose: print('Not deleting anything because --noclean used') From b59c19d1b74a970446d05e19ca1fefb4e7ff44bc Mon Sep 17 
00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 09:00:16 +0000 Subject: [PATCH 22/25] gzip assembled seqs fasta file --- ariba/clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index 850827c2..a4940f4e 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -76,7 +76,7 @@ def __init__(self, self.report_file_all_tsv = os.path.join(self.outdir, 'report.all.tsv') self.report_file_all_xls = os.path.join(self.outdir, 'report.all.xls') self.report_file_filtered_prefix = os.path.join(self.outdir, 'report') - self.catted_assembled_seqs_fasta = os.path.join(self.outdir, 'assembled_seqs.fa') + self.catted_assembled_seqs_fasta = os.path.join(self.outdir, 'assembled_seqs.fa.gz') self.threads = threads self.verbose = verbose From bf7d9bbb20882b1628a2fd5246a6e4351ceab0f6 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 09:08:22 +0000 Subject: [PATCH 23/25] Delete cluster dir asap --- ariba/clusters.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index a4940f4e..249bf0a1 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -13,12 +13,18 @@ class Error (Exception): pass -def _run_cluster(obj, verbose): +def _run_cluster(obj, verbose, clean): if verbose: print('Start running cluster', obj.name, 'in directory', obj.root_dir, flush=True) obj.run() if verbose: print('Finished running cluster', obj.name, 'in directory', obj.root_dir, flush=True) + + if clean: + if verbose: + print('Deleting cluster dir', obj.root_dir, flush=True) + shutil.rmtree(obj.root_dir) + return obj @@ -312,7 +318,7 @@ def _init_and_run_clusters(self): cluster_list = pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose))) else: for c in cluster_list: - _run_cluster(c, self.verbose) + _run_cluster(c, self.verbose, self.clean) self.clusters = {c.name: c for c in cluster_list} From d3bee7a32893f1397b2dedfae1549b6ac12b285f Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 09:15:39 +0000 Subject: [PATCH 24/25] Remove self.log_fh (breaks multithreading) --- ariba/read_store.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ariba/read_store.py b/ariba/read_store.py index a73f390b..d75bea83 100644 --- a/ariba/read_store.py +++ b/ariba/read_store.py @@ -10,14 +10,13 @@ def __init__(self, infile, outprefix, log_fh=None): assert infile != outprefix self.infile = os.path.abspath(infile) self.outprefix = os.path.abspath(outprefix) - self.log_fh = log_fh self.outfile = os.path.abspath(outprefix) + '.gz' if not os.path.exists(self.infile): raise Error('File not found ' + self.infile + '. 
Cannot continue') - self._sort_file(self.infile, self.outprefix, log_fh=self.log_fh) - self._compress_and_index_file(self.outprefix, log_fh=self.log_fh) + self._sort_file(self.infile, self.outprefix, log_fh) + self._compress_and_index_file(self.outprefix, log_fh) os.unlink(self.outprefix) @@ -36,9 +35,9 @@ def _compress_and_index_file(infile, log_fh=None): pysam.tabix_index(infile + '.gz', seq_col=0, start_col=1, end_col=1) - def get_reads(self, cluster_name, out1, out2): - if self.log_fh is not None: - print('Getting reads for', cluster_name, 'from', self.outfile, file=self.log_fh) + def get_reads(self, cluster_name, out1, out2, log_fh=None): + if log_fh is not None: + print('Getting reads for', cluster_name, 'from', self.outfile, file=log_fh) tabix_file = pysam.TabixFile(self.outfile) f_out1 = pyfastaq.utils.open_file_write(out1) f_out2 = pyfastaq.utils.open_file_write(out2) @@ -53,8 +52,8 @@ def get_reads(self, cluster_name, out1, out2): pyfastaq.utils.close(f_out1) pyfastaq.utils.close(f_out2) - if self.log_fh is not None: - print('Finished getting reads for', cluster_name, 'from', self.outfile, file=self.log_fh) + if log_fh is not None: + print('Finished getting reads for', cluster_name, 'from', self.outfile, file=log_fh) def clean(self): os.unlink(self.outfile) From 4d3ea9d11a5d399b52f292f8f3503c950443cfc2 Mon Sep 17 00:00:00 2001 From: martinghunt Date: Tue, 3 May 2016 09:16:04 +0000 Subject: [PATCH 25/25] Pass clean option when multithreading --- ariba/clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ariba/clusters.py b/ariba/clusters.py index 249bf0a1..15089900 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -315,7 +315,7 @@ def _init_and_run_clusters(self): if self.threads > 1: pool = multiprocessing.Pool(self.threads) - cluster_list = pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose))) + cluster_list = pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose), itertools.repeat(self.clean))) else: for c in cluster_list: _run_cluster(c, self.verbose, self.clean)
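
For reference, a minimal standalone sketch of the read-store pattern this series introduces: a tab-delimited table of "cluster  read_number  sequence  quality" lines is sorted, bgzip-compressed and tabix-indexed with pysam, and one cluster's reads are then fetched back by reference name. The pysam calls mirror those in read_store.py above; the file and function names below are illustrative only, not ARIBA API.

import subprocess
import pysam

def build_read_store(infile, outprefix):
    # Sort by cluster name, then numerically by read number (as in ReadStore._sort_file).
    subprocess.run('sort -k1,1 -k2,2n ' + infile + ' > ' + outprefix, shell=True, check=True)
    # bgzip-compress and index on column 0 (cluster name) and column 1 (read number),
    # as in ReadStore._compress_and_index_file.
    pysam.tabix_compress(outprefix, outprefix + '.gz')
    pysam.tabix_index(outprefix + '.gz', seq_col=0, start_col=1, end_col=1)
    return outprefix + '.gz'

def reads_for_cluster(store_gz, cluster_name):
    # Yield (read_number, seq, qual) for every read stored under cluster_name.
    tabix_file = pysam.TabixFile(store_gz)
    for line in tabix_file.fetch(reference=cluster_name):
        cluster, number, seq, qual = line.rstrip().split()
        yield int(number), seq, qual

Building the single indexed store once and pulling reads per cluster on demand is what allows the series to drop the per-cluster reads_1.fq/reads_2.fq files and directories that the earlier clusters.py code wrote.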