diff --git a/ariba/cluster.py b/ariba/cluster.py index 75dff87b..853a8dcb 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -127,6 +127,15 @@ def __init__(self, self.percent_identities = {} + def _get_read_counts(self): + count1 = pyfastaq.tasks.count_sequences(self.reads1) + count2 = pyfastaq.tasks.count_sequences(self.reads2) + if count1 == count2: + return count1 + count2 + else: + raise Error('Different number of fwd/rev reads in cluster ' + self.name + '! Cannot continue') + + def _get_total_alignment_score(self, gene_name): tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam') assert not os.path.exists(tmp_bam) @@ -744,12 +753,14 @@ def _get_vcf_variant_counts(self): def _make_report_lines(self): self.report_lines = [] cov_per_contig = self._nucmer_hits_to_gene_cov_per_contig() + total_reads = self._get_read_counts() if len(self.variants) == 0: for contig in self.percent_identities: self.report_lines.append([ self.gene.id, self.status_flag.to_number(), + total_reads, self.name, len(self.gene), cov_per_contig[contig], @@ -767,6 +778,7 @@ def _make_report_lines(self): self.report_lines.append([ self.gene.id, self.status_flag.to_number(), + total_reads, self.name, len(self.gene), cov_per_contig[contig], diff --git a/ariba/clusters.py b/ariba/clusters.py index 56fbd033..3d09660a 100644 --- a/ariba/clusters.py +++ b/ariba/clusters.py @@ -307,6 +307,7 @@ def _write_reports(self): columns = [ '#gene', 'flag', + 'reads', 'cluster', 'gene_len', 'assembled', diff --git a/ariba/summary.py b/ariba/summary.py index c15b935f..76ffaaf7 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -8,6 +8,7 @@ class Error (Exception): pass columns = [ 'gene', 'flag', + 'reads', 'cluster', 'gene_len', 'assembled', @@ -26,6 +27,7 @@ class Error (Exception): pass ] int_columns = [ + 'reads', 'gene_len', 'assembled', 'gene_start', diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py index acbf4492..4b6317dc 100644 --- a/ariba/tests/cluster_test.py +++ b/ariba/tests/cluster_test.py @@ -56,6 +56,25 @@ def test_init_fail_files_missing(self): clean_cluster_dir(d) + def test_get_read_counts(self): + '''test _get_read_counts pass''' + cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts') + clean_cluster_dir(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') + self.assertEqual(2, c._get_read_counts()) + clean_cluster_dir(cluster_dir) + + + def test_get_read_counts_fail(self): + '''test _get_read_counts fail''' + cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts_fail') + clean_cluster_dir(cluster_dir) + c = cluster.Cluster(cluster_dir, 'name') + with self.assertRaises(cluster.Error): + c._get_read_counts() + clean_cluster_dir(cluster_dir) + + def test_get_total_alignment_score(self): '''test _get_total_alignment_score''' cluster_dir = os.path.join(data_dir, 'cluster_test_get_total_alignment_score') @@ -629,6 +648,7 @@ def test_make_report_lines(self): expected = [[ 'gene', 42, + 2, 'cluster_name', 39, 10, diff --git a/ariba/tests/data/cluster_test_get_read_counts/genes.fa b/ariba/tests/data/cluster_test_get_read_counts/genes.fa new file mode 100644 index 00000000..042775d6 --- /dev/null +++ b/ariba/tests/data/cluster_test_get_read_counts/genes.fa @@ -0,0 +1,2 @@ +>name_of_gene +AAACCCGGGTTT diff --git a/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq b/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq new file mode 100644 index 00000000..6ff1e12f --- /dev/null +++ b/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq @@ -0,0 +1,4 @@ +@read1/1 +ACGTACGT ++ +IIIIIIII diff --git a/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq b/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq new file mode 100644 index 00000000..2eb387ff --- /dev/null +++ b/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq @@ -0,0 +1,4 @@ +@read1/2 +ACGTACGT ++ +IIIIIIII diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa b/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa new file mode 100644 index 00000000..042775d6 --- /dev/null +++ b/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa @@ -0,0 +1,2 @@ +>name_of_gene +AAACCCGGGTTT diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq new file mode 100644 index 00000000..46dd9a20 --- /dev/null +++ b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq @@ -0,0 +1,8 @@ +@read1/1 +ACGTACGT ++ +IIIIIIII +@read2/1 +TCATCATA ++ +:D:D:D:D diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq new file mode 100644 index 00000000..2eb387ff --- /dev/null +++ b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq @@ -0,0 +1,4 @@ +@read1/2 +ACGTACGT ++ +IIIIIIII diff --git a/ariba/tests/data/clusters_test_write_report.tsv b/ariba/tests/data/clusters_test_write_report.tsv index b4861113..4e4611da 100644 --- a/ariba/tests/data/clusters_test_write_report.tsv +++ b/ariba/tests/data/clusters_test_write_report.tsv @@ -1,3 +1,3 @@ -#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +#gene flag reads cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt gene1 line1 gene2 line2 diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv index 6ed46946..8bc77486 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv @@ -1,4 +1,4 @@ -#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt -gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . -gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . -gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . +#gene flag reads cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +gene1 27 42 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . +gene2 15 44 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . +gene2 15 46 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv index 61c260cc..9d3916da 100644 --- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv +++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv @@ -1,3 +1,3 @@ -#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt -gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . -gene3 27 3 750 750 98.93 . . . . . . gene3.scaffold.1 1047 . . . +#gene flag reads cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +gene1 27 142 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . +gene3 27 144 3 750 750 98.93 . . . . . . gene3.scaffold.1 1047 . . . diff --git a/ariba/tests/data/summary_test_load_file.in.tsv b/ariba/tests/data/summary_test_load_file.in.tsv index e81496ce..1abb334e 100644 --- a/ariba/tests/data/summary_test_load_file.in.tsv +++ b/ariba/tests/data/summary_test_load_file.in.tsv @@ -1,5 +1,5 @@ -#gene flag cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt -gene1 27 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . -gene2 15 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . -gene2 15 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . -gene3 187 3 750 750 98.93 SNP SYN . 318 318 C gene3.scaffold.1 1047 319 319 G +#gene flag reads cluster gene_len assembled pc_ident var_type var_effect new_aa gene_start gene_end gene_nt scaffold scaff_len scaff_start scaff_end scaff_nt +gene1 27 42 1 822 822 100.0 . . . . . . gene1.scaffold.1 1490 . . . +gene2 15 44 2 780 780 100.0 . . . . . . gene2.scaffold.2 1124 . . . +gene2 15 46 2 780 770 99.0 . . . . . . gene2.scaffold.3 1097 . . . +gene3 187 48 3 750 750 98.93 SNP SYN . 318 318 C gene3.scaffold.1 1047 319 319 G diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index 3394eee8..ee843722 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -21,11 +21,12 @@ def test_init(self): def test_line2dict(self): '''Test _line2dict''' - line = '\t'.join(['gene1', '187', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '66', '66', 'A', 'gene1.scaffold.1', '1047', '67', '67', 'C']) + line = '\t'.join(['gene1', '187', '42', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '66', '66', 'A', 'gene1.scaffold.1', '1047', '67', '67', 'C']) s = summary.Summary('out', filenames=['spam', 'eggs']) expected = { 'gene': 'gene1', 'flag': flag.Flag(187), + 'reads': 42, 'cluster': '3', 'gene_len': 750, 'assembled': 750, @@ -51,10 +52,10 @@ def test_load_file(self): infile = os.path.join(data_dir, 'summary_test_load_file.in.tsv') lines = [ - ['gene1', '27', '1', '822', '822', '100.0', '.', '.', '.', '.', '.', '.', 'gene1.scaffold.1', '1490', '.', '.', '.'], - ['gene2', '15', '2', '780', '780', '100.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.2', '1124', '.', '.', '.'], - ['gene2', '15', '2', '780', '770', '99.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.3', '1097', '.', '.', '.'], - ['gene3', '187', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '318', '318', 'C', 'gene3.scaffold.1', '1047', '319', '319', 'G'] + ['gene1', '27', '42', '1', '822', '822', '100.0', '.', '.', '.', '.', '.', '.', 'gene1.scaffold.1', '1490', '.', '.', '.'], + ['gene2', '15', '44', '2', '780', '780', '100.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.2', '1124', '.', '.', '.'], + ['gene2', '15', '46', '2', '780', '770', '99.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.3', '1097', '.', '.', '.'], + ['gene3', '187', '48', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '318', '318', 'C', 'gene3.scaffold.1', '1047', '319', '319', 'G'] ] dicts = [s._line2dict('\t'.join(x)) for x in lines] expected = {'gene1': [dicts[0]], 'gene2': dicts[1:3], 'gene3': [dicts[3]]}