diff --git a/README.md b/README.md index beb1ae1d..5e1ff0d0 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,8 @@ Once the dependencies are installed, install ARIBA using pip: ARIBA also depends on several Python packages, all of which are available via pip, so the above command will get those automatically if they -are not installed. The packages are dendropy >= 4.2.0, +are not installed. The packages are dendropy >= 4.2.0, matplotlib (no +minimum version required, but only tested on 2.0.0), pyfastaq >= 3.12.0, pysam >= 0.9.1, and pymummer >= 0.10.1. Alternatively, you can download the latest release from this github repository, diff --git a/ariba/__init__.py b/ariba/__init__.py index 1ca0437e..ca6fc157 100644 --- a/ariba/__init__.py +++ b/ariba/__init__.py @@ -25,6 +25,7 @@ 'mapping', 'megares_data_finder', 'megares_zip_parser', + 'mic_plotter', 'mlst_profile', 'mlst_reporter', 'pubmlst_getter', diff --git a/ariba/assembly_compare.py b/ariba/assembly_compare.py index a0eb6a00..bd8215df 100644 --- a/ariba/assembly_compare.py +++ b/ariba/assembly_compare.py @@ -135,6 +135,39 @@ def nucmer_hits_to_ref_coords(cls, nucmer_hits, contig=None): return coords + @classmethod + def nucmer_hits_to_ref_and_qry_coords(cls, nucmer_hits, contig=None): + '''Same as nucmer_hits_to_ref_coords, except removes containing hits first, + and returns ref and qry coords lists''' + if contig is None: + ctg_coords = {key: [] for key in nucmer_hits.keys()} + else: + ctg_coords = {contig: []} + + ref_coords = {} + + for key in ctg_coords: + hits = copy.copy(nucmer_hits[key]) + hits.sort(key=lambda x: len(x.ref_coords())) + + if len(hits) > 1: + i = 0 + while i < len(hits) - 1: + c1 = hits[i].ref_coords() + c2 = hits[i+1].ref_coords() + if c2.contains(c1): + hits.pop(i) + else: + i += 1 + + ref_coords[key] = [hit.ref_coords() for hit in hits] + ctg_coords[key] = [hit.qry_coords() for hit in hits] + pyfastaq.intervals.merge_overlapping_in_list(ref_coords[key]) + pyfastaq.intervals.merge_overlapping_in_list(ctg_coords[key]) + + return ctg_coords, ref_coords + + @staticmethod def ref_cov_per_contig(nucmer_hits): '''Input is hits made by self._parse_nucmer_coords_file. diff --git a/ariba/assembly_variants.py b/ariba/assembly_variants.py index 733334cc..fa4e2e21 100644 --- a/ariba/assembly_variants.py +++ b/ariba/assembly_variants.py @@ -260,7 +260,7 @@ def _get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nuc return variants - def get_variants(self, ref_sequence_name, nucmer_coords): + def get_variants(self, ref_sequence_name, allowed_ctg_coords, allowed_ref_coords, nucmer_matches=None): '''Nucmr coords = dict. Key=contig name. Value = list of intervals of ref coords that match the contig. Made by assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords Returns dictionary. Key=contig name. Value = list of variants. Each variant @@ -287,12 +287,27 @@ def get_variants(self, ref_sequence_name, nucmer_coords): known_non_wild_variants_in_ref = self.refdata.all_non_wild_type_variants(ref_sequence_name) - for contig in nucmer_coords: + for contig in allowed_ctg_coords: + if contig not in allowed_ref_coords: + continue + used_known_variants = set() variants[contig] = [] if contig in mummer_variants: for mummer_variant_list in mummer_variants[contig]: + ref_start = min([x.ref_start for x in mummer_variant_list]) + ref_end = max([x.ref_end for x in mummer_variant_list]) + ctg_start = min([x.qry_start for x in mummer_variant_list]) + ctg_end = min([x.qry_end for x in mummer_variant_list]) + ref_interval = intervals.Interval(ref_start, ref_end) + ctg_interval = intervals.Interval(ctg_start, ctg_end) + ref_ok = True in {x.intersects(ref_interval) for x in allowed_ref_coords[contig]} + qry_ok = True in {x.intersects(ctg_interval) for x in allowed_ctg_coords[contig]} + + if not (ref_ok and qry_ok): + continue + if seq_type == 'p': new_variant, used_variants = self._get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variant_list) else: @@ -306,10 +321,10 @@ def get_variants(self, ref_sequence_name, nucmer_coords): # for this contig, need to know all the ref sequence and coords it maps to. # Then report just the unused known variants, as the contig also has these variants if seq_type == 'p': - new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, nucmer_coords[contig]) + new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, allowed_ref_coords[contig]) else: - new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, nucmer_coords[contig]) + new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, allowed_ref_coords[contig]) if is_variant_only: new_variants = [x for x in new_variants if len(x[5]) > 0] diff --git a/ariba/cluster.py b/ariba/cluster.py index 4d41d60b..6285513b 100644 --- a/ariba/cluster.py +++ b/ariba/cluster.py @@ -382,9 +382,9 @@ def _run(self): self.assembly_compare.run() self.status_flag = self.assembly_compare.update_flag(self.status_flag) - nucmer_hits_to_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(self.assembly_compare.nucmer_hits) + allowed_ctg_pos, allowed_ref_pos = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(self.assembly_compare.nucmer_hits) assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file) - self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, nucmer_hits_to_ref) + self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, allowed_ctg_pos, allowed_ref_pos) for var_list in self.assembly_variants.values(): for var in var_list: diff --git a/ariba/ext/vcfcall_ariba.cpp b/ariba/ext/vcfcall_ariba.cpp index 3ead497e..c2dba617 100644 --- a/ariba/ext/vcfcall_ariba.cpp +++ b/ariba/ext/vcfcall_ariba.cpp @@ -133,7 +133,8 @@ void vectorSumAndMax(const std::vector& v, uint32_t& maxValue, uint32_ bool depthOk(const uint32_t& totalDepth, const uint32_t& testNum, const uint32_t& minSecondDepth, const float& maxAlleleFreq) { - return (testNum >= minSecondDepth && 1.0 * testNum / totalDepth <= maxAlleleFreq); + double depthFreq = 1.0 * testNum / totalDepth; + return (testNum >= minSecondDepth && depthFreq >= (1 - maxAlleleFreq) && depthFreq <= maxAlleleFreq); } diff --git a/ariba/mic_plotter.py b/ariba/mic_plotter.py new file mode 100644 index 00000000..ca5992f4 --- /dev/null +++ b/ariba/mic_plotter.py @@ -0,0 +1,651 @@ +import csv +import sys +import random +import re +import os +import itertools +import collections +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec +import matplotlib.cm as cmx +import math +import pyfastaq +from ariba import common, reference_data + +class Error (Exception): pass + +regex_string_to_float = re.compile(r'\s*(?P[<>]?)\s*(?P=?)\s*(?P[0-9.]+)\s*$') + +regex_position_from_var = re.compile(r'^[^0-9]*(?P[0-9]+)[^0-9]*$') + +class MicPlotter: + def __init__(self, + refdata_dir, + antibiotic, + mic_file, + summary_file, + outprefix, + use_hets='yes', + main_title=None, + plot_height=7, + plot_width=7, + log_y=2, + plot_types="points,violin", + jitter_width=0.1, + no_combinations=False, + hlines='', + point_size=4, + point_scale=1, + dot_size=100, + dot_outline=False, + dot_y_text_size=7, + panel_heights='9,2', + panel_widths='5,1', + colourmap='Accent', + number_of_colours=0, + colour_skip=None, + interrupted=False, + violin_width=0.75, + xkcd=False, + min_samples=1, + count_legend_x=-2, + out_format='pdf', + p_cutoff=0.05 + ): + refdata_fa = os.path.join(refdata_dir, '02.cdhit.all.fa') + refdata_tsv = os.path.join(refdata_dir, '01.filter.check_metadata.tsv') + self.refdata = reference_data.ReferenceData([refdata_fa], [refdata_tsv]) + self.antibiotic = antibiotic + self.mic_file = mic_file + self.summary_file = summary_file + self.outprefix = outprefix + + allowed_use_hets = {'yes', 'no', 'exclude'} + if not use_hets in allowed_use_hets: + raise Error('Error in use_hets option. Allowed options are: ' + str(allowed_use_hets) + '. Got: ' + use_hets) + self.use_hets = use_hets + + self.main_title = self.antibiotic if main_title is None else main_title + self.plot_height = plot_height + self.plot_width = plot_width + self.log_y = log_y + self.plot_types = set(plot_types.split(',')) + + allowed_plot_types = {'point', 'violin'} + if not self.plot_types.issubset(allowed_plot_types): + raise Error('Error in plot_types option. Allowed types are: ' + str(allowed_plot_types) + '. Got: ' + str(self.plot_types)) + + self.jitter_width = jitter_width + self.no_combinations = no_combinations + + try: + if len(hlines) == 0: + self.hlines = [] + else: + self.hlines = [float(x) for x in hlines.split(',')] + except: + raise Error('Error in hlines option. Needs to be a list of numbers separated by commas, or empty. Got this:\n' + hlines) + + self.point_size = point_size + self.point_scale = point_scale + self.dot_size = dot_size + self.dot_outline = dot_outline + self.dot_y_text_size = dot_y_text_size + + try: + self.panel_heights = [int(x) for x in panel_heights.split(',')] + except: + raise Error('Error in panel_heights option. Needs to be of the form integer1,integer2. Got this:\n' + panel_heights) + + try: + self.panel_widths = [int(x) for x in panel_widths.split(',')] + except: + raise Error('Error in panel_widths option. Needs to be of the form integer1,integer2. Got this:\n' + panel_widths) + + self.colourmap = colourmap + self.number_of_colours = number_of_colours + + if colour_skip is None: + self.colour_skip = None + else: + try: + self.colour_skip = [float(x) for x in colour_skip.split(',')] + except: + raise Error('Error in colour_skip option. Needs to be of the form a,b where 0 <= a < b <= 1. Got this:\n' + colour_skip) + + self.interrupted = interrupted + self.violin_width = violin_width + if xkcd: + plt.xkcd() + self.min_samples = min_samples + self.count_legend_x = count_legend_x + self.out_format = out_format + self.p_cutoff = p_cutoff + + + @classmethod + def _mic_string_to_float(cls, s): + regex_match = regex_string_to_float.match(s) + + if regex_match is None or regex_match.group('number') == '.': + if s.strip() in {'NA', 'na', '', '.'}: + return 'NA' + else: + return None + + try: + flt = float(regex_match.group('number')) + except: + return None + + if regex_match.group('equals') == '': + if regex_match.group('lt_or_gt') == '<': + return 0.5 * flt + elif regex_match.group('lt_or_gt') == '>': + return 2 * flt + + return flt + + + @classmethod + def _load_mic_file(cls, infile): + mic_data = {} + + with open(infile) as f: + reader = csv.DictReader(f, delimiter='\t') + if reader.fieldnames[0] != 'Sample': + raise Error('Error. Expected first column of MIC file "' + infile + '" to be "Sample"') + + for row in reader: + mic_data[row['Sample']] = {x: MicPlotter._mic_string_to_float(row[x]) for x in reader.fieldnames[1:]} + + return mic_data + + + @classmethod + def _load_summary_file(cls, infile): + data = {} + + with open(infile) as f: + reader = csv.DictReader(f, delimiter=',') + + if reader.fieldnames[0] != 'name': + raise Error('Error. Expected first column of summary file "' + infile + '" to be "name"') + + clusters = [x.split('.', maxsplit=1)[0] for x in reader.fieldnames[1:]] + + for row in reader: + data[row['name']] = {} + + for field in row: + if field == 'name': + continue + + cluster, col = field.split('.', maxsplit=1) + if cluster not in clusters: + raise Error('Cluster "' + cluster + '" not recognised. Cannot continue') + if cluster not in data[row['name']]: + data[row['name']][cluster] = {} + + try: + value = float(row[field]) + except: + value = row[field] + data[row['name']][cluster][col] = value + + return data + + + @classmethod + def _get_colours(cls, total_length, number_of_colours, colormap, skip=None): + if number_of_colours == 1: + return ["black"] * total_length + elif number_of_colours == 0: + cmap = cmx.get_cmap(colormap) + if skip is None: + vals = [1.0 * x / (total_length - 1) for x in range(total_length)] + else: + assert len(skip) == 2 and 0 <= skip[0] <= 1 and 0 <= skip[1] <= 1 + if skip[-1] == 1: + vals = [skip[0] * x / (total_length - 1) for x in range(total_length)] + elif skip[0] == 0: + vals = [skip[1] + (1 - skip[1]) * x / (total_length - 1) for x in range(total_length)] + else: + length = 1 - (skip[1] - skip[0]) + vals = [(length) * x / (total_length - 1) for x in range(total_length)] + vals = [x if x < skip[0] else x + (1-length) for x in vals] + + return [cmap(x) for x in vals] + else: + cmap = cmx.get_cmap(colormap) + colours = [] + for i in itertools.cycle(range(number_of_colours)): + colours.append(cmap(i)) + if len(colours) >= total_length: + break + return colours + + + @classmethod + def _get_top_plot_data(cls, summary_data, mic_data, antibiotic, use_hets, refdata=None, no_combinations=False, interrupted=False, outfile=None): + assert use_hets in {'yes', 'no', 'exclude'} + if outfile is not None: + f = pyfastaq.utils.open_file_write(outfile) + print('Sample\tMIC\tMutations', file=f) + + ignore_columns = {'assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var', 'MULTIPLE'} + all_mutations = set() + all_mutations_seen_combinations = set() + top_plot_data = {} # cluster combination -> list of y coords (MIC values) + + for sample in sorted(summary_data): + if sample not in mic_data: + raise Error('No MIC data found for sample "' + sample + '". Cannot continue') + if antibiotic not in mic_data[sample]: + raise Error('Antibiotic "' + antibiotic + '" not found. Cannot continue') + + if mic_data[sample][antibiotic] == 'NA': + continue + + mutations = set() + found_het_and_exclude = False + + for cluster in summary_data[sample]: + if 'assembled' in summary_data[sample][cluster] and summary_data[sample][cluster]['assembled'] == 'interrupted' and interrupted: + mutations.add(cluster + '.interrupted') + + if refdata is not None and 'match' in summary_data[sample][cluster] and summary_data[sample][cluster]['match'] == 'yes' and 'ref_seq' in summary_data[sample][cluster]: + ref_type, variant_only = refdata.sequence_type(summary_data[sample][cluster]['ref_seq']) + if not variant_only: + mutations.add(cluster + '.present') + + for column, value in summary_data[sample][cluster].items(): + if column in ignore_columns or column.endswith('.%'): + continue + + if value == 'yes' or (use_hets == 'yes' and value == 'het'): + mutations.add(cluster + '.' + column.strip()) + elif use_hets == 'exclude' and value == 'het': + found_het_and_exclude = True + break + + if found_het_and_exclude: + break + + if found_het_and_exclude: + continue + + + if len(mutations) == 0: + mutations.add('without_mutation') + + all_mutations.update(mutations) + mutations = list(mutations) + mutations.sort() + if no_combinations: + for mutation in mutations: + all_mutations_seen_combinations.add((mutation,)) + if mutation not in top_plot_data: + top_plot_data[mutation] = [] + top_plot_data[mutation].append(mic_data[sample][antibiotic]) + if outfile is not None: + print(sample, mic_data[sample][antibiotic], mutation, sep='\t', file=f) + else: + all_mutations_seen_combinations.add(tuple(mutations)) + mutations = '.'.join(mutations) + if mutations not in top_plot_data: + top_plot_data[mutations] = [] + top_plot_data[mutations].append(mic_data[sample][antibiotic]) + if outfile is not None: + print(sample, mic_data[sample][antibiotic], mutations, sep='\t', file=f) + + + if outfile is not None: + pyfastaq.utils.close(f) + + return top_plot_data, all_mutations, all_mutations_seen_combinations + + + @classmethod + def _filter_top_plot_data(cls, top_plot_data, all_mutations, seen_combinations, min_samples): + if min_samples == 1: + return top_plot_data, all_mutations, seen_combinations + + new_top_plot_data = {} + new_all_mutations = set() + new_seen_combinations = set() + + for mutation_tuple in seen_combinations: + mutation_string = '.'.join(mutation_tuple) + mics = top_plot_data[mutation_string] + + if len(mics) >= min_samples: + new_top_plot_data[mutation_string] = mics + new_seen_combinations.add(mutation_tuple) + new_all_mutations.update(mutation_tuple) + + return new_top_plot_data, new_all_mutations, new_seen_combinations + + + @classmethod + def _top_plot_y_ticks(cls, mic_data, antibiotic, log_y): + mic_values = set() + for sample in mic_data: + mic = mic_data[sample][antibiotic] + if mic not in [None, 'NA']: + mic_values.add(mic) + + max_mic = max(mic_values) + min_mic = min(mic_values) + new_mic_values = [] + i = 1 + while i < max_mic * 2: + new_mic_values.append(i) + i *= 2 + + i = 0.5 + while i > min_mic / 2: + new_mic_values.append(i) + i *= 0.5 + + new_mic_values.sort() + new_mic_values = [round(x, 4) for x in new_mic_values] + + if log_y > 0: + tick_positions = [math.log(x, log_y) for x in new_mic_values] + else: + tick_positions = new_mic_values + + return tick_positions, new_mic_values + + + @classmethod + def _top_plot_scatter_counts(cls, mutations, top_plot_data, colours, log_y): + x_coords = [] + y_coords = [] + sizes = [] + colour_list = [] + + for i, mutation in enumerate(mutations): + counts = collections.Counter(top_plot_data[mutation]) + for mic in sorted(counts): + x_coords.append(i + 1) + if log_y > 0: + y_coords.append(math.log(mic, log_y)) + else: + y_coords.append(mic) + sizes.append(counts[mic]) + colour_list.append(colours[i]) + + return x_coords, y_coords, sizes, colour_list + + + @classmethod + def _top_plot_scatter_data(cls, mutations, top_plot_data, colours, log_y, x_jitter): + x_coords = [] + y_coords = [] + colour_list = [] + + for i, mutation in enumerate(mutations): + for mic in top_plot_data[mutation]: + if len(top_plot_data[mutation]) > 1: + x_coords.append(i + 1 + random.uniform(-x_jitter, x_jitter)) + else: + x_coords.append(i + 1) + + if log_y > 0: + y_coords.append(math.log(mic, log_y)) + else: + y_coords.append(mic) + colour_list.append(colours[i]) + + return x_coords, y_coords, colour_list + + + @classmethod + def _top_plot_violin_data(cls, mutations, top_plot_data, log_y): + violin_data = [] + violin_pos = [] + + for i, mutation in enumerate(mutations): + if log_y > 0: + violin_data.append([math.log(x, log_y) for x in top_plot_data[mutation]]) + else: + violin_data.append(top_plot_data[mutation]) + violin_pos.append(i + 1) + + return violin_data, violin_pos + + + @classmethod + def _ordered_bottom_plot_rows(cls, mutations): + l = [] + infinity = float('inf') + + for x in mutations: + try: + cluster, variant = x.split('.', maxsplit=1) + except: + l.append((x, infinity, x)) + continue + + if '.' in variant: + try: + var_group, var = variant.split('.', maxsplit=1) + except: + var_group = None + var = variant + + variant = var + + regex_match = regex_position_from_var.match(variant) + if regex_match is not None and regex_match.group('coord') != '': + coord = int(regex_match.group('coord')) + else: + coord = infinity + + l.append((cluster, coord, x)) + + l.sort() + return [x[-1] for x in l] + + + @classmethod + def _ordered_columns(cls, mutations, top_plot_data): + # FIXME + return sorted(list(mutations)) + + + @classmethod + def _bottom_scatter_data(cls, bottom_plot_rows, columns, colours, outline=False): + x_coords = [] + y_coords = [] + colour_list = [] + + for i, row in enumerate(bottom_plot_rows): + for j, col in enumerate(columns): + if row in col: + x_coords.append(j + 1) + y_coords.append(len(bottom_plot_rows) - i) + colour_list.append(colours[j]) + elif outline: + x_coords.append(j + 1) + y_coords.append(len(bottom_plot_rows) - i) + colour_list.append("white") + + return x_coords, y_coords, colour_list + + + @classmethod + def _right_plot_data(cls, scatter_count_sizes, x_pos): + y_max = max(scatter_count_sizes) + if y_max > 100: + y_max = int(math.ceil(y_max / 100.0)) * 100 + sizes = [5, 50] + else: + y_max = int(math.ceil(y_max / 10.0)) * 10 + sizes = [5, 10] + + while sizes[-1] < y_max: + sizes.append(sizes[-1]*2) + + x_coords = [x_pos] * len(sizes) + y_coords = [x + 1 for x in range(len(sizes))] + y_coords.reverse() + return x_coords, y_coords, sizes + + + @classmethod + def _pairwise_compare(cls, violin_data, columns, outfile, p_cutoff, compare_test): + try: + import scipy.stats + except: + print('WARNING: skipping Mann Whitney tests because scipy.stats not found', file=sys.stderr) + return + + output = [] + + for i, list1 in enumerate(violin_data): + for j, list2 in enumerate(violin_data): + if j <= i or len(list1) < 2 or len(list2) < 2: + continue + + list1set = set(list1) + + if len(list1set) == 1 and list1set == set(list2): + statistic = 'NA' + pvalue = 1 + else: + if compare_test == 'mannwhitneyu': + statistic, pvalue = scipy.stats.mannwhitneyu(list1, list2, alternative='two-sided') + elif compare_test == 'ks_2samp': + statistic, pvalue = scipy.stats.ks_2samp(list1, list2) + else: + raise Error('Test "' + compare_test + '" not recognised. Cannot continue') + + effect_size = abs(scipy.stats.norm.ppf(pvalue) / math.sqrt(len(list1) + len(list2))) + significant = 'yes' if pvalue < p_cutoff else 'no' + output.append((columns[i], columns[j], len(list1), len(list2), pvalue, significant, effect_size)) + + output.sort(key=lambda x: x[4]) + + with open(outfile, 'w') as f: + print('Combination1', 'Combination2', 'Size1', 'Size2', 'p-value', 'significant', 'effect_size', 'corrected_p-value', 'corrected_significant', 'corrected_effect_size', sep='\t', file=f) + for x in output: + corrected_p = min(1, len(output) * x[4]) + corrected_significant = 'yes' if corrected_p < p_cutoff else 'no' + corrected_effect_size = scipy.stats.norm.ppf(corrected_p) / math.sqrt(x[2] + x[3]) + print('\t'.join([str(z) for z in x]), corrected_p, corrected_significant, corrected_effect_size, sep='\t', file=f) + + + def _make_plot(self, mic_data, top_plot_data, all_mutations, mut_combinations): + bottom_plot_rows = MicPlotter._ordered_bottom_plot_rows(all_mutations) + columns = MicPlotter._ordered_columns(mut_combinations, top_plot_data) + colours = MicPlotter._get_colours(len(columns), self.number_of_colours, self.colourmap, self.colour_skip) + bottom_scatter_x, bottom_scatter_y, bottom_colours = MicPlotter._bottom_scatter_data(bottom_plot_rows, columns, colours, outline=self.dot_outline) + columns = ['.'.join(x) for x in columns] + assert len(colours) == len(columns) + max_x = len(colours) + 1 + + scatter_count_x, scatter_count_y, scatter_count_sizes, scatter_count_colours = MicPlotter._top_plot_scatter_counts(columns, top_plot_data, colours, self.log_y) + scatter_data_x, scatter_data_y, scatter_data_colours = MicPlotter._top_plot_scatter_data(columns, top_plot_data, colours, self.log_y, self.jitter_width) + violin_data, violin_positions = MicPlotter._top_plot_violin_data(columns, top_plot_data, self.log_y) + MicPlotter._pairwise_compare(violin_data, columns, self.outprefix + '.mannwhitney.tsv', self.p_cutoff, 'mannwhitneyu') + MicPlotter._pairwise_compare(violin_data, columns, self.outprefix + '.ks_2sample.tsv', self.p_cutoff, 'ks_2samp') + + # -------------------- SET UP GRID & PLOTS ----------------- + fig=plt.figure(figsize=(self.plot_width, self.plot_height)) + if 'point' not in self.plot_types: + self.point_size = 42 + + if self.point_size == 0: + gs = gridspec.GridSpec(2, 2, height_ratios=self.panel_heights, width_ratios=self.panel_widths) + else: + gs = gridspec.GridSpec(2, 1, height_ratios=self.panel_heights) + + plots=[] + plots.append(plt.subplot(gs[0])) + plots.append(plt.subplot(gs[1])) + if self.point_size == 0: + plots.append(plt.subplot(gs[2])) + bottom_plot_index = 2 + else: + bottom_plot_index = 1 + + # ------------------------- TOP PLOT ----------------------- + for h in self.hlines: + if self.log_y > 0: + h = math.log(h, self.log_y) + plots[0].hlines(h, 0, max_x, linestyle='--', linewidth=1, color='black') + + + if 'violin' in self.plot_types: + violins = plots[0].violinplot(violin_data, violin_positions, widths=self.violin_width, showmeans=False, showextrema=False, showmedians=False) + for x, pc in enumerate(violins['bodies']): + pc.set_facecolor(colours[x]) + pc.set_edgecolor(colours[x]) + + scaled_count_sizes = [self.point_scale * x for x in scatter_count_sizes] + + if 'point' in self.plot_types: + if self.point_size == 0: + plots[0].scatter(scatter_count_x, scatter_count_y, s=scaled_count_sizes, c=scatter_count_colours, linewidth=0) + else: + plots[0].scatter(scatter_data_x, scatter_data_y, c=scatter_data_colours, s=self.point_size) + + if self.log_y > 0: + miny = min(scatter_count_y) - 0.5 + maxy = max(scatter_count_y) + 0.5 + else: + miny = 0 + maxy = 1.05 * max(scatter_count_y) + + plots[0].axis([0,max(bottom_scatter_x) + 1, miny, maxy]) + + y_tick_positions, y_tick_labels = MicPlotter._top_plot_y_ticks(mic_data, self.antibiotic, self.log_y) + plots[0].yaxis.set_ticks(y_tick_positions) + plots[0].set_yticklabels(y_tick_labels) + ylabel = r'$\log_' + str(int(self.log_y)) + '$(MIC) $\mu$g/mL' if self.log_y > 0 else r'MIC $\mu$g/mL' + plots[0].set_ylabel(ylabel) + plots[0].set_xticklabels([]) + plots[0].set_title(self.main_title, fontsize=18) + + # ------------------------- BOTTOM PLOT ----------------------- + edgecolor = "black" if self.dot_outline else bottom_colours + plots[bottom_plot_index].axis([0,max(bottom_scatter_x) + 1,0,max(bottom_scatter_y) + 1]) + plots[bottom_plot_index].scatter(bottom_scatter_x, bottom_scatter_y, marker='o', s=self.dot_size, c=bottom_colours, edgecolor=edgecolor, lw=1) + plots[bottom_plot_index].spines["top"].set_visible(False) + plots[bottom_plot_index].spines["right"].set_visible(False) + plots[bottom_plot_index].spines["bottom"].set_visible(False) + plots[bottom_plot_index].spines["left"].set_visible(False) + plots[bottom_plot_index].yaxis.set_tick_params(length=0) + plots[bottom_plot_index].xaxis.set_ticks([]) + plots[bottom_plot_index].set_xticklabels([]) + plots[bottom_plot_index].yaxis.set_ticks([(i+1) for i in range(len(bottom_plot_rows))]) + plots[bottom_plot_index].set_yticklabels(bottom_plot_rows[::-1], fontsize=self.dot_y_text_size) + + # ------------------------- RIGHT PLOT ------------------------- + if self.point_size == 0: + right_x_coord = 0.75 + right_x, right_y, right_sizes = MicPlotter._right_plot_data(scatter_count_sizes, right_x_coord) + right_scaled_sizes = [self.point_scale * x for x in right_sizes] + plots[1].scatter(right_x, right_y, s=right_scaled_sizes, c="black") + plots[1].axis('off') + plots[1].axis([0,4,-2*len(right_y),len(right_y)+1]) + for i, y in enumerate(right_y): + plots[1].annotate(right_sizes[i], [right_x_coord + 0.75, y-0.2]) + plots[1].annotate("Counts", [right_x_coord - 0.1, len(right_y) + 0.5]) + + plt.tight_layout(w_pad=self.count_legend_x) + plt.savefig(self.outprefix + '.' + self.out_format) + + + + + def run(self): + mic_data = MicPlotter._load_mic_file(self.mic_file) + summary_data = MicPlotter._load_summary_file(self.summary_file) + boxplot_tsv = self.outprefix + '.data.tsv' + top_plot_data, all_mutations, combinations = MicPlotter._get_top_plot_data(summary_data, mic_data, self.antibiotic, self.use_hets, refdata=self.refdata, no_combinations=self.no_combinations, interrupted=self.interrupted, outfile=boxplot_tsv) + top_plot_data, all_mutations, combinations = MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, combinations, self.min_samples) + self._make_plot(mic_data, top_plot_data, all_mutations, combinations) diff --git a/ariba/summary.py b/ariba/summary.py index de9f06f3..fa28d444 100644 --- a/ariba/summary.py +++ b/ariba/summary.py @@ -30,12 +30,12 @@ def __init__( raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue') if filenames is None: - self.filenames = [] + self.filenames = {} else: - self.filenames = filenames + self.filenames = {x: None for x in filenames} if fofn is not None: - self.filenames.extend(self._load_fofn(fofn)) + self.filenames.update(self._load_fofn(fofn)) self.cluster_columns = self._determine_cluster_cols(cluster_cols) self.filter_rows = filter_rows @@ -66,9 +66,21 @@ def _determine_cluster_cols(cols_string): return Summary._determine_cols(cols_string, allowed_cols, 'cluster columns') - def _load_fofn(self, fofn): + @classmethod + def _load_fofn(cls, fofn): + '''Returns dictionary of filename -> short name. Value is None + whenever short name is not provided''' + filenames = {} f = pyfastaq.utils.open_file_read(fofn) - filenames = [x.rstrip() for x in f.readlines()] + for line in f: + fields = line.rstrip().split() + if len(fields) == 1: + filenames[fields[0]] = None + elif len(fields) == 2: + filenames[fields[0]] = fields[1] + else: + raise Error('Error at the following line of file ' + fofn + '. Expected at most 2 fields.\n' + line) + pyfastaq.utils.close(f) return filenames @@ -159,8 +171,8 @@ def _to_matrix(cls, filenames, all_data, all_potential_columns, cluster_cols): summary_cols_set = set(['assembled', 'match', 'ref_seq', 'pct_id', 'known_var', 'novel_var']) summary_cols_in_order = [x for x in summary_cols_in_order if cluster_cols[x]] - for filename in filenames: - line = [filename] + for filename in sorted(filenames): + line = [filename if filenames[filename] is None else filenames[filename]] for cluster_name in sorted(all_potential_columns): group_cols = sorted(list(all_potential_columns[cluster_name]['groups'])) diff --git a/ariba/summary_cluster.py b/ariba/summary_cluster.py index f3f952c7..512948b1 100644 --- a/ariba/summary_cluster.py +++ b/ariba/summary_cluster.py @@ -1,3 +1,4 @@ +import sys from ariba import flag, report, summary_cluster_variant class Error (Exception): pass @@ -30,10 +31,15 @@ def __eq__(self, other): @classmethod - def line2dict(cls, line): + def line2dict(cls, line, filename=None): data = line.rstrip().split('\t') if len(data) != len(report.columns): - raise Error('Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line) + if filename is not None: + filename_message = 'Error reading ariba summary file "' + filename + '". ' + else: + filename_message = '' + raise Error(filename_message + 'Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line) + d = {report.columns[i]: data[i] for i in range(len(data))} try: d['flag'] = flag.Flag(int(d['flag']) ) diff --git a/ariba/summary_cluster_variant.py b/ariba/summary_cluster_variant.py index 62c231a0..709a2b63 100644 --- a/ariba/summary_cluster_variant.py +++ b/ariba/summary_cluster_variant.py @@ -1,3 +1,5 @@ +import re + class Error (Exception): pass class SummaryClusterVariant: @@ -32,21 +34,34 @@ def _has_nonsynonymous(cls, data_dict): @classmethod - def _depths_look_het(cls, depths): - sorted_depths = sorted(depths) - min_var_read_depth = 5 - min_second_var_read_depth = 2 - max_allele_freq = 0.90 - total_depth = sum(depths) - second_depth_ok = len(depths) == 1 or (len(depths) > 1 and sorted_depths[-2] >= min_second_var_read_depth) - max_depth_ok = total_depth >= min_var_read_depth and sorted_depths[-1] / total_depth <= max_allele_freq - return depths[0] < sorted_depths[-1] or (second_depth_ok and max_depth_ok) + def _filter_depths(cls, ref_base, depths): + if ref_base not in depths: + return {} + + min_freq = 0.1 + max_freq = 0.9 + new_depths = {} + total_depth = sum(depths.values()) + ref_depth = depths[ref_base] + return {x: depths[x] for x in depths if depths[x] >= ref_depth or min_freq <= depths[x] / total_depth <= max_freq} + + + #@classmethod + #def _depths_look_het(cls, depths): + # sorted_depths = sorted(depths) + # min_var_read_depth = 5 + # min_second_var_read_depth = 2 + # max_allele_freq = 0.90 + # total_depth = sum(depths) + # second_depth_ok = len(depths) == 1 or (len(depths) > 1 and sorted_depths[-2] >= min_second_var_read_depth) + # max_depth_ok = total_depth >= min_var_read_depth and sorted_depths[-1] / total_depth <= max_allele_freq + # return depths[0] < sorted_depths[-1] or (second_depth_ok and max_depth_ok) @classmethod def _get_is_het_and_percent(cls, data_dict): if data_dict['gene'] == '1' or not (data_dict['known_var'] == '1' or data_dict['ref_ctg_effect'] == 'SNP' or data_dict['var_type'] == 'HET') or data_dict['smtls_nts'] == '.' or ';' in data_dict['smtls_nts'] or data_dict['smtls_nts_depth'] == 'ND': - return False, None + return False, None, None else: nucleotides = data_dict['smtls_nts'].split(',') depths = data_dict['smtls_nts_depth'].split(',') @@ -72,21 +87,27 @@ def _get_is_het_and_percent(cls, data_dict): elif data_dict['known_var_change'] != '.': var_nucleotide = data_dict['known_var_change'][-1] else: - return False, None + return False, None, None if var_nucleotide == '.': - return False, None + return False, None, None total_depth = sum(depths) if max([len(x) for x in nucleotides]) == 1: var_depth = nuc_to_depth.get(var_nucleotide, 0) else: var_depth = sum([nuc_to_depth[x] for x in nuc_to_depth if x[0] == var_nucleotide]) - is_het = SummaryClusterVariant._depths_look_het(depths) + filtered_depths = SummaryClusterVariant._filter_depths(nucleotides[0], nuc_to_depth) + + if len(filtered_depths) > 0: + bases = ''.join(sorted(list(filtered_depths.keys()))) + return len(filtered_depths) > 1, round(100 * var_depth / total_depth, 1), bases + else: + return False, None, None return is_het, round(100 * var_depth / total_depth, 1) except: - return False, None + return False, None, None def _get_nonsynon_variant_data(self, data_dict): @@ -101,10 +122,13 @@ def _get_nonsynon_variant_data(self, data_dict): else: self.var_string = data_dict['ref_ctg_effect'] - self.is_het, self.het_percent = SummaryClusterVariant._get_is_het_and_percent(data_dict) + self.is_het, self.het_percent, var_bases = SummaryClusterVariant._get_is_het_and_percent(data_dict) + if var_bases is not None: + self.var_string = re.sub(r'[^0-9]', '', self.var_string) + var_bases + + self.has_nonsynon = SummaryClusterVariant._has_nonsynonymous(data_dict) and not (data_dict['var_type'] == 'HET' and not self.is_het) - if not SummaryClusterVariant._has_nonsynonymous(data_dict): - self.has_nonsynon = False + if not self.has_nonsynon: return self.has_nonsynon = True diff --git a/ariba/summary_sample.py b/ariba/summary_sample.py index bc1ea25f..11107284 100644 --- a/ariba/summary_sample.py +++ b/ariba/summary_sample.py @@ -27,7 +27,7 @@ def _load_file(filename, min_pc_id, only_clusters=None): raise Error('Error parsing the following line.\n' + line) continue - data_dict = summary_cluster.SummaryCluster.line2dict(line) + data_dict = summary_cluster.SummaryCluster.line2dict(line, filename=filename) cluster = data_dict['cluster'] if only_clusters is not None and cluster not in only_clusters: continue diff --git a/ariba/tasks/__init__.py b/ariba/tasks/__init__.py index 769af324..299f5181 100644 --- a/ariba/tasks/__init__.py +++ b/ariba/tasks/__init__.py @@ -2,6 +2,7 @@ 'aln2meta', 'flag', 'getref', + 'micplot', 'prepareref', 'pubmlstget', 'pubmlstspecies', diff --git a/ariba/tasks/micplot.py b/ariba/tasks/micplot.py new file mode 100644 index 00000000..28ad1440 --- /dev/null +++ b/ariba/tasks/micplot.py @@ -0,0 +1,39 @@ +import argparse +import ariba + +def run(options): + plotter = ariba.mic_plotter.MicPlotter( + options.prepareref_dir, + options.antibiotic, + options.mic_file, + options.summary_file, + options.outprefix, + use_hets=options.use_hets, + main_title=options.main_title, + plot_height=options.plot_height, + plot_width=options.plot_width, + log_y=options.log_y, + plot_types=options.plot_types, + jitter_width=options.jitter_width, + no_combinations=options.no_combinations, + hlines=options.hlines, + point_size=options.point_size, + point_scale=options.point_scale, + dot_size=options.dot_size, + dot_outline=options.dot_outline, + dot_y_text_size=options.dot_y_text_size, + panel_heights=options.panel_heights, + panel_widths=options.panel_widths, + colourmap=options.colourmap, + number_of_colours=options.number_of_colours, + colour_skip=options.colour_skip, + interrupted=options.interrupted, + violin_width=options.violin_width, + xkcd=options.xkcd, + min_samples=options.min_samples, + count_legend_x=options.count_legend_x, + out_format=options.out_format, + p_cutoff=options.p_cutoff + ) + + plotter.run() diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py index c91e9829..778facb6 100644 --- a/ariba/tasks/summary.py +++ b/ariba/tasks/summary.py @@ -69,7 +69,7 @@ def run(options): min_id=options.min_id, cluster_cols=options.cluster_cols, make_phandango_tree=(not options.no_tree), - only_clusters=None if options.only_cluster is None else {options.only_cluster}, + only_clusters=None if options.only_clusters is None else set(options.only_clusters.split(',')), show_var_groups=options.v_groups, show_known_vars=options.known_variants, show_novel_vars=options.novel_variants, diff --git a/ariba/tests/assembly_compare_test.py b/ariba/tests/assembly_compare_test.py index 7d4648e0..03da24bc 100644 --- a/ariba/tests/assembly_compare_test.py +++ b/ariba/tests/assembly_compare_test.py @@ -107,6 +107,37 @@ def test_nucmer_hits_to_ref_coords(self): self.assertEqual(expected, got) + def test_nucmer_hits_to_ref_and_qry_coords(self): + '''test _nucmer_hits_to_ref_and_qry_coords''' + hits = [ + ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['31', '52', '1', '22', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['11', '32', '1000', '1022', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'ref', 'contig1'], + ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'ref', 'contig2'], + ] + nucmer_hits = { + 'contig1': [ + pymummer.alignment.Alignment('\t'.join(hits[0])), + pymummer.alignment.Alignment('\t'.join(hits[1])), + pymummer.alignment.Alignment('\t'.join(hits[2])), + ], + 'contig2': [ + pymummer.alignment.Alignment('\t'.join(hits[3])), + ] + } + + got_ctg, got_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_and_qry_coords(nucmer_hits) + expected_ctg = { + 'contig1': [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)], + 'contig2': [pyfastaq.intervals.Interval(99, 109)] + } + expected_ref = { + 'contig1': [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)], + 'contig2': [pyfastaq.intervals.Interval(99, 109)] + } + + def test_ref_cov_per_contig(self): '''test ref_cov_per_contig''' hits = [ diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py index 898106b3..a149fd51 100644 --- a/ariba/tests/assembly_variants_test.py +++ b/ariba/tests/assembly_variants_test.py @@ -309,7 +309,12 @@ def test_get_variants_presence_absence(self): v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1')) v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1')) - nucmer_coords = { + ref_nucmer_coords = { + 'contig1': [pyfastaq.intervals.Interval(0, 30)], + 'contig2': [pyfastaq.intervals.Interval(10, 41)], + } + + ctg_nucmer_coords = { 'contig1': [pyfastaq.intervals.Interval(0, 30)], 'contig2': [pyfastaq.intervals.Interval(10, 41)], } @@ -328,7 +333,7 @@ def test_get_variants_presence_absence(self): } a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file) - got = a_variants.get_variants('presence_absence', nucmer_coords) + got = a_variants.get_variants('presence_absence', ctg_nucmer_coords, ref_nucmer_coords) self.assertEqual(expected, got) @@ -352,11 +357,15 @@ def test_get_variants_variants_only(self): v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1')) v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1')) - nucmer_coords = { + ctg_nucmer_coords = { 'contig1': [pyfastaq.intervals.Interval(0, 41)], 'contig2': [pyfastaq.intervals.Interval(10, 41)], } + ref_nucmer_coords = { + 'contig1': [pyfastaq.intervals.Interval(0, 41)], + 'contig2': [pyfastaq.intervals.Interval(10, 41)], + } expected = { 'contig1': [ (4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()), @@ -367,6 +376,6 @@ def test_get_variants_variants_only(self): } a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file) - got = a_variants.get_variants('variants_only', nucmer_coords) + got = a_variants.get_variants('variants_only', ctg_nucmer_coords, ref_nucmer_coords) self.assertEqual(expected, got) diff --git a/ariba/tests/data/mic_plotter_load_mic_file.tsv b/ariba/tests/data/mic_plotter_load_mic_file.tsv new file mode 100644 index 00000000..70aa315f --- /dev/null +++ b/ariba/tests/data/mic_plotter_load_mic_file.tsv @@ -0,0 +1,7 @@ +Sample antibio1 antibio2 +sample1 0.25 0.004 +sample2 <0.25 <=0.004 +sample3 < 0.25 <= 0.004 +sample4 >256 >=256 +sample5 > 256 >= 256 +sample6 NA 1 diff --git a/ariba/tests/data/mic_plotter_load_summary_file.tsv b/ariba/tests/data/mic_plotter_load_summary_file.tsv new file mode 100644 index 00000000..05c56772 --- /dev/null +++ b/ariba/tests/data/mic_plotter_load_summary_file.tsv @@ -0,0 +1,3 @@ +name,cluster1.assembled,cluster1.match,cluster1.ref_seq,cluster1.pct_id,cluster1.known_var,cluster1.novel_var,cluster1.group1.A42T,cluster1.group1.A42T.%,cluster2.assembled,cluster2.match,cluster2.ref_seq,cluster2.pct_id,cluster2.known_var,cluster2.novel_var,cluster2.group1.A42T,cluster2.group1.A42T.% +name1,yes,yes,ref1,100.0,no,no,no,NA,yes,yes,ref2,99.0,yes,no,yes,95.42 +name2,yes,yes_nonunique,ref3,99.0,yes,no,yes,90.90,no,no,NA,NA,NA,NA,NA,NA diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv new file mode 100644 index 00000000..b4c4325a --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.no_combinations.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T +name1 0.25 cluster3.interrupted diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv new file mode 100644 index 00000000..59743fb6 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.exclude.tsv @@ -0,0 +1,2 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T.cluster3.interrupted diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv new file mode 100644 index 00000000..7ef4771f --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.no_combinations.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T +name1 0.25 cluster3.interrupted +name2 0.125 cluster1.group1.A42T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv new file mode 100644 index 00000000..4806ebc3 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.no.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T.cluster3.interrupted +name2 0.125 cluster1.group1.A42T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv new file mode 100644 index 00000000..0092bde7 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.no_combinations.tsv @@ -0,0 +1,5 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T +name1 0.25 cluster3.interrupted +name2 0.125 cluster1.group1.A42T +name2 0.125 cluster4.group4.A44T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv new file mode 100644 index 00000000..cb8b103c --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio1.yes.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.25 cluster2.group2.A43T.cluster3.interrupted +name2 0.125 cluster1.group1.A42T.cluster4.group4.A44T diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv new file mode 100644 index 00000000..1912500e --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.no_combinations.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T +name1 0.004 cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv new file mode 100644 index 00000000..37b20976 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.exclude.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T.cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv new file mode 100644 index 00000000..1912500e --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.no_combinations.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T +name1 0.004 cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv new file mode 100644 index 00000000..37b20976 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.no.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T.cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv new file mode 100644 index 00000000..1912500e --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.no_combinations.tsv @@ -0,0 +1,4 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T +name1 0.004 cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv new file mode 100644 index 00000000..37b20976 --- /dev/null +++ b/ariba/tests/data/mic_plotter_to_boxplot_tsv.antibio2.yes.tsv @@ -0,0 +1,3 @@ +Sample MIC Mutations +name1 0.004 cluster2.group2.A43T.cluster3.interrupted +name3 0.002 without_mutation diff --git a/ariba/tests/data/summary_test_load_fofn b/ariba/tests/data/summary_test_load_fofn new file mode 100644 index 00000000..95e7d6df --- /dev/null +++ b/ariba/tests/data/summary_test_load_fofn @@ -0,0 +1,4 @@ +/foo/bar/abc.tsv +/spam/eggs/file1.tsv short_name1 +/spam/eggs/file2.tsv short_name2 +/spam/eggs/file3.tsv diff --git a/ariba/tests/data/summary_test_whole_run.out.csv b/ariba/tests/data/summary_test_whole_run.out.csv index 1a282e3f..daa0ae38 100644 --- a/ariba/tests/data/summary_test_whole_run.out.csv +++ b/ariba/tests/data/summary_test_whole_run.out.csv @@ -1,3 +1,3 @@ -name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.C2597T,23S.C2597T.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.novel_var,mdfA.G261GGGTGTGGTGTGGT/GGGTGTGGT,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.novel_var,noncoding10.C100T,noncoding10.C100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.novel_var,noncoding11.G101A,noncoding11.G101A.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.known_var,noncoding5.A42T,noncoding5.A42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.known_var,noncoding6.A52T,noncoding6.A52T.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.known_var,noncoding7.A53T,noncoding7.A53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id -/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,het,het,30.0,yes,yes,noncoding2_ref1,98.2,no,no,NA,NA,yes,yes,noncoding5_ref1,97.4,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,yes,noncoding9_ref1,95.2 -/nfs/users/nfs_m/mh12/sanger-pathogens/ariba/ariba/tests/data/summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,no,NA,NA +name,23S.assembled,23S.match,23S.ref_seq,23S.pct_id,23S.known_var,23S.2597CT,23S.2597CT.%,23S.2597TC,23S.2597TC.%,coding1.assembled,coding1.match,coding1.ref_seq,coding1.pct_id,coding2.assembled,coding2.match,coding2.ref_seq,coding2.pct_id,coding3.assembled,coding3.match,coding3.ref_seq,coding3.pct_id,coding5.assembled,coding5.match,coding5.ref_seq,coding5.pct_id,coding5.known_var,coding5.A42S,coding6.assembled,coding6.match,coding6.ref_seq,coding6.pct_id,coding6.known_var,coding6.A52S,coding7.assembled,coding7.ref_seq,coding7.pct_id,coding8.assembled,coding8.match,coding8.ref_seq,coding8.pct_id,coding8.novel_var,coding8.A53S,mdfA.assembled,mdfA.ref_seq,mdfA.pct_id,mdfA.novel_var,noncoding1.assembled,noncoding1.match,noncoding1.ref_seq,noncoding1.pct_id,noncoding10.assembled,noncoding10.match,noncoding10.ref_seq,noncoding10.pct_id,noncoding10.novel_var,noncoding10.100T,noncoding10.100T.%,noncoding11.assembled,noncoding11.match,noncoding11.ref_seq,noncoding11.pct_id,noncoding11.novel_var,noncoding11.101AG,noncoding11.101AG.%,noncoding2.assembled,noncoding2.match,noncoding2.ref_seq,noncoding2.pct_id,noncoding3.assembled,noncoding3.match,noncoding3.ref_seq,noncoding3.pct_id,noncoding5.assembled,noncoding5.match,noncoding5.ref_seq,noncoding5.pct_id,noncoding5.known_var,noncoding5.42T,noncoding5.42T.%,noncoding6.assembled,noncoding6.match,noncoding6.ref_seq,noncoding6.pct_id,noncoding6.known_var,noncoding6.52CT,noncoding6.52CT.%,noncoding7.assembled,noncoding7.match,noncoding7.ref_seq,noncoding7.pct_id,noncoding7.known_var,noncoding7.53T,noncoding7.53T.%,noncoding8.assembled,noncoding8.match,noncoding8.ref_seq,noncoding8.pct_id,noncoding9.assembled,noncoding9.ref_seq,noncoding9.pct_id +summary_test_whole_run.in.1.tsv,yes,yes,23S.rDNA_WHO_F_01358c,99.86,yes,no,NA,yes,100.0,interrupted,no,coding1_ref1,99.1,yes,yes,coding2_ref1,98.2,no,no,NA,NA,yes,no,coding5_ref1,97.4,no,no,yes,yes,coding6_ref1,95.5,yes,yes,yes,coding7_ref1,95.4,yes,yes,coding8_ref1,95.3,yes,yes,interrupted,mdfA.3001328.JQ394987.0_1233.561,97.0,yes,yes,yes,noncoding1_ref1,99.1,yes,yes,noncoding10_ref1,95.1,yes,yes,99.0,yes,yes,noncoding11_ref1,95.05,het,het,30.0,yes,yes,noncoding2_ref1,98.2,no,no,NA,NA,yes,yes,noncoding5_ref1,97.4,yes,yes,100.0,yes,yes,noncoding6_ref1,95.5,yes,het,70.0,yes,yes,noncoding7_ref1,95.4,yes,yes,98.6,yes,yes,noncoding8_ref1,95.3,yes,noncoding9_ref1,95.2 +summary_test_whole_run.in.2.tsv,yes_nonunique,no,23S.rDNA_WHO_F_01358c,99.84,het,het,12.8,no,NA,yes,yes,coding1_ref2,99.2,no,no,NA,NA,yes,yes,coding3_ref1,97.6,yes,yes,coding5_ref1,97.4,yes,yes,no,no,NA,NA,NA,NA,no,NA,NA,no,no,NA,NA,NA,NA,no,NA,NA,NA,yes,yes,noncoding1_ref2,99.2,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,yes,yes,noncoding3_ref1,97.6,yes,no,noncoding5_ref1,99.42,no,no,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,NA,NA,NA,no,no,NA,NA,no,NA,NA diff --git a/ariba/tests/mic_plotter_test.py b/ariba/tests/mic_plotter_test.py new file mode 100644 index 00000000..4ee33771 --- /dev/null +++ b/ariba/tests/mic_plotter_test.py @@ -0,0 +1,283 @@ +import unittest +import copy +import filecmp +import os +from ariba import mic_plotter + +modules_dir = os.path.dirname(os.path.abspath(mic_plotter.__file__)) +data_dir = os.path.join(modules_dir, 'tests', 'data') + + +class TestMicPlotter(unittest.TestCase): + def test_mic_string_to_float(self): + '''Test _mic_string_to_float''' + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('42.42'), 42.42) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>42'), 84.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('> 42'), 84.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>=42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('>= 42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<42'), 21.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('< 42'), 21.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<=42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('<= 42'), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(' <= 42.0 '), 42.0) + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('na'), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('NA'), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float('.'), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(' '), 'NA') + self.assertEqual(mic_plotter.MicPlotter._mic_string_to_float(''), 'NA') + + + def test_load_mic_file(self): + '''Test _load_mic_file''' + infile = os.path.join(data_dir, 'mic_plotter_load_mic_file.tsv') + got = mic_plotter.MicPlotter._load_mic_file(infile) + expected = { + 'sample1': {'antibio1': 0.25, 'antibio2': 0.004}, + 'sample2': {'antibio1': 0.125, 'antibio2': 0.004}, + 'sample3': {'antibio1': 0.125, 'antibio2': 0.004}, + 'sample4': {'antibio1': 512.0, 'antibio2': 256.0}, + 'sample5': {'antibio1': 512.0, 'antibio2': 256.0}, + 'sample6': {'antibio1': 'NA', 'antibio2': 1.0}, + } + + self.assertEqual(got, expected) + + + def test_load_summary_file(self): + '''Test _load_summary_file''' + infile = os.path.join(data_dir, 'mic_plotter_load_summary_file.tsv') + got = mic_plotter.MicPlotter._load_summary_file(infile) + expected = { + 'name1': { + 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref1', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, + 'cluster2': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref2', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 95.42}, + }, + 'name2': { + 'cluster1': {'assembled': 'yes', 'match': 'yes_nonunique', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 90.90}, + 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group1.A42T': 'NA', 'group1.A42T.%': 'NA'}, + }, + } + self.maxDiff = None + self.assertEqual(got, expected) + + + def test_get_colours(self): + '''test _get_colours''' + col1 = (0.0, 0.0, 0.5, 1.0) + col2 = (0.0, 0.0, 0.517825311942959, 1.0) + + tests = [ + (1, 1, 'jet', ["black"]), + (2, 1, 'jet', ["black", "black"]), + (3, 1, 'jet', ["black", "black", "black"]), + (2, 2, 'jet', [col1, col2]), + (3, 2, 'jet', [col1, col2, col1]), + (4, 2, 'jet', [col1, col2, col1, col2]), + (3, 0, 'jet', [(0.0, 0.0, 0.5, 1.0), (0.49019607843137247, 1.0, 0.47754585705249841, 1.0), (0.5, 0.0, 0.0, 1.0)]) + ] + + for total_length, number_of_colours, colormap, expected in tests: + self.assertEqual(expected, mic_plotter.MicPlotter._get_colours(total_length, number_of_colours, colormap)) + + + def test_get_top_plot_data(self): + '''Test _get_top_plot_data''' + mic_data = { + 'name1': {'antibio1': 0.25, 'antibio2': 0.004}, + 'name2': {'antibio1': 0.125, 'antibio2': 'NA'}, + 'name3': {'antibio1': 'NA', 'antibio2': 0.002}, + } + + summary_data = { + 'name1': { + 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref1', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, + 'cluster2': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref2', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group2.A43T': 'yes', 'group2.A43T.%': 95.42}, + 'cluster3': {'assembled': 'interrupted', 'match': 'no', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'no', 'novel_var': 'yes', 'A42T': 'no', 'A44T.%': 'NA'}, + 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group4.A44T': 'no', 'group4.A44T.%': 'NA'}, + }, + 'name2': { + 'cluster1': {'assembled': 'yes', 'match': 'yes_nonunique', 'ref_seq': 'ref3', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group1.A42T': 'yes', 'group1.A42T.%': 90.90}, + 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'}, + 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'}, + 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 99.0, 'known_var': 'yes', 'novel_var': 'no', 'group4.A44T': 'het', 'group4.A44T.%': '50.0'}, + }, + 'name3': { + 'cluster1': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref_seq42', 'pct_id': 100.0, 'known_var': 'no', 'novel_var': 'no', 'group1.A42T': 'no', 'group1.A42T.%': 'NA'}, + 'cluster2': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'NA', 'novel_var': 'NA', 'group2.A43T': 'NA', 'group2.A43T.%': 'NA'}, + 'cluster3': {'assembled': 'no', 'match': 'no', 'ref_seq': 'NA', 'pct_id': 'NA', 'known_var': 'no', 'novel_var': 'no', 'A42T': 'no', 'A44T.%': 'NA'}, + 'cluster4': {'assembled': 'yes', 'match': 'yes', 'ref_seq': 'ref4', 'pct_id': 100.0, 'known_var': 'yes', 'novel_var': 'no', 'group4.A44T': 'no', 'group4.A44T.%': 'NA'}, + }, + } + + expected_top_plot_data = { + 'antibio1': { + 'yes': {'cluster1.group1.A42T.cluster4.group4.A44T': [0.125], 'cluster2.group2.A43T.cluster3.interrupted': [0.25]}, + 'no': {'cluster1.group1.A42T': [0.125], 'cluster2.group2.A43T.cluster3.interrupted': [0.25]}, + 'exclude': {'cluster2.group2.A43T.cluster3.interrupted': [0.25]}, + }, + 'antibio2': { + 'yes': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]}, + 'no': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]}, + 'exclude': {'without_mutation': [0.002], 'cluster2.group2.A43T.cluster3.interrupted': [0.004]}, + } + } + + expected_top_plot_data_no_combs = { + 'antibio1': { + 'yes': {'cluster2.group2.A43T': [0.25], 'cluster4.group4.A44T': [0.125], 'cluster3.interrupted': [0.25], 'cluster1.group1.A42T': [0.125]}, + 'no': {'cluster2.group2.A43T': [0.25], 'cluster3.interrupted': [0.25], 'cluster1.group1.A42T': [0.125]}, + 'exclude': {'cluster2.group2.A43T': [0.25], 'cluster3.interrupted': [0.25]}, + }, + 'antibio2': { + 'yes': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]}, + 'no': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]}, + 'exclude': {'cluster2.group2.A43T': [0.004], 'without_mutation': [0.002], 'cluster3.interrupted': [0.004]}, + } + } + + expected_mutations = { + 'antibio1': { + 'yes': {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted', 'cluster4.group4.A44T'}, + 'no': {'cluster1.group1.A42T', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + 'exclude': {'cluster2.group2.A43T', 'cluster3.interrupted'}, + }, + 'antibio2': { + 'yes': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + 'no': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + 'exclude': {'without_mutation', 'cluster2.group2.A43T', 'cluster3.interrupted'}, + } + } + + expected_combs = { + 'antibio1': { + 'yes': {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T', 'cluster4.group4.A44T')}, + 'no': {('cluster2.group2.A43T', 'cluster3.interrupted'), ('cluster1.group1.A42T',)}, + 'exclude': {('cluster2.group2.A43T', 'cluster3.interrupted')} + }, + 'antibio2': { + 'yes': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')}, + 'no': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')}, + 'exclude': {('without_mutation',), ('cluster2.group2.A43T', 'cluster3.interrupted')} + } + } + + + expected_no_combs = { + 'antibio1': { + 'yes': {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',), ('cluster4.group4.A44T',)}, + 'no': {('cluster2.group2.A43T',), ('cluster3.interrupted',), ('cluster1.group1.A42T',)}, + 'exclude': {('cluster2.group2.A43T',), ('cluster3.interrupted',)} + }, + 'antibio2': { + 'yes': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)}, + 'no': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)}, + 'exclude': {('without_mutation',), ('cluster2.group2.A43T', ), ('cluster3.interrupted',)} + } + } + + + tmp_tsv = 'tmp.mic_plotter_test.to_boxplot.tsv' + + for antibio in ['antibio1', 'antibio2']: + for use_het in ['no', 'yes', 'exclude']: + got_data, got_muts, got_combs = mic_plotter.MicPlotter._get_top_plot_data(summary_data, mic_data, antibio, use_het, interrupted=True, outfile=tmp_tsv) + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.tsv') + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + self.assertEqual(got_muts, expected_mutations[antibio][use_het]) + self.assertEqual(got_combs, expected_combs[antibio][use_het]) + self.assertEqual(got_data, expected_top_plot_data[antibio][use_het]) + os.unlink(tmp_tsv) + + got_data, got_muts, got_combs = mic_plotter.MicPlotter._get_top_plot_data(summary_data, mic_data, antibio, use_het, no_combinations=True, interrupted=True, outfile=tmp_tsv) + expected_tsv = os.path.join(data_dir, 'mic_plotter_to_boxplot_tsv.' + antibio + '.' + use_het + '.no_combinations.tsv') + self.assertTrue(filecmp.cmp(tmp_tsv, expected_tsv, shallow=False)) + self.assertEqual(got_muts, expected_mutations[antibio][use_het]) + self.assertEqual(got_combs, expected_no_combs[antibio][use_het]) + self.assertEqual(got_data, expected_top_plot_data_no_combs[antibio][use_het]) + os.unlink(tmp_tsv) + + + def test_filter_top_plot_data(self): + '''test _filter_top_plot_data''' + top_plot_data = { + 'var1': [1, 2, 3], + 'var2.var3': [1], + 'var1.var3': [1, 2], + } + + all_mutations = {'var1', 'var2', 'var3'} + seen_combinations = {('var1',), ('var1', 'var3'), ('var2', 'var3')} + + got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 1) + self.assertEqual(got_top, top_plot_data) + self.assertEqual(got_all, all_mutations) + self.assertEqual(got_seen, seen_combinations) + + + got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 2) + expected_top_plot_data = { + 'var1': [1, 2, 3], + 'var1.var3': [1, 2], + } + expected_all_mutations = {'var1', 'var3'} + expected_seen_combinations = {('var1',), ('var1', 'var3')} + self.assertEqual(got_top, expected_top_plot_data) + self.assertEqual(got_all, expected_all_mutations) + self.assertEqual(got_seen, expected_seen_combinations) + + got_top, got_all, got_seen = mic_plotter.MicPlotter._filter_top_plot_data(top_plot_data, all_mutations, seen_combinations, 3) + expected_top_plot_data = { + 'var1': [1, 2, 3], + } + expected_all_mutations = {'var1'} + expected_seen_combinations = {('var1',),} + self.assertEqual(got_top, expected_top_plot_data) + self.assertEqual(got_all, expected_all_mutations) + self.assertEqual(got_seen, expected_seen_combinations) + + + def test_ordered_bottom_plot_rows(self): + '''test _ordered_bottom_plot_rows''' + to_order = {'clust1.grp1.42T', 'clust1.grp1.47G', 'clust0.10T', 'abcdefg'} + got = mic_plotter.MicPlotter._ordered_bottom_plot_rows(to_order) + expected = ['abcdefg', 'clust0.10T', 'clust1.grp1.42T', 'clust1.grp1.47G'] + self.assertEqual(expected, got) + + + def test_ordered_columns(self): + '''test _ordered_colunns''' + top_plot_data = {} + # FIXME + + + def test_bottom_scatter_data(self): + '''test _bottom_scatter_data''' + #FIXME + pass + + + def test_top_plot_y_ticks(self): + '''test _top_plot_y_ticks''' + # FIXME + pass + + + def test_top_plot_scatter_counts(self): + '''test _top_plot_scatter_counts''' + top_plot_data = {} + # FIXME + + + def test_top_plot_scatter_data(self): + '''test _top_plot_scatter_data''' + top_plot_data = {} + # FIXME + + + def test_top_plot_violin_data(self): + '''test _top_plot_violin_data''' + top_plot_data = {} + # FIXME + diff --git a/ariba/tests/summary_cluster_variant_test.py b/ariba/tests/summary_cluster_variant_test.py index f88cc0c7..10af7d93 100644 --- a/ariba/tests/summary_cluster_variant_test.py +++ b/ariba/tests/summary_cluster_variant_test.py @@ -23,51 +23,73 @@ def test_has_nonsynonymous(self): self.assertEqual(expected[i], summary_cluster_variant.SummaryClusterVariant._has_nonsynonymous(dicts[i])) - def test_depths_look_het(self): - '''test _depths_look_het''' + def test_filter_depths(self): + '''test _filter_depths''' tests = [ - ([1], False), - ([2], False), - ([3], False), - ([4], False), - ([5], False), - ([90, 1], False), - ([90, 9], False), - ([90, 10], True), - ([9, 1], False), - ([9, 2], True), - ([1, 2], True), - ([90, 5, 5], True), - ([90, 2, 1, 1], False), - ([97, 2, 1], False), + ('A', {'A': 1}, {'A': 1}), + ('A', {'A': 2}, {'A': 2}), + ('A', {'A': 3}, {'A': 3}), + ('A', {'A': 4}, {'A': 4}), + ('A', {'A': 5}, {'A': 5}), + ('A', {'A': 90, 'C': 9}, {'A': 90}), + ('C', {'A': 90, 'C': 9}, {'A': 90, 'C': 9}), + ('C', {'A': 90, 'C': 9, 'G':1}, {'A': 90, 'C': 9}), + ('A', {'A': 90, 'C': 10}, {'A': 90, 'C': 10}), + ('A', {'A': 90, 'C': 5, 'G': 5}, {'A': 90}), + ('A', {'A': 89, 'C': 10, 'G': 1}, {'A': 89, 'C': 10}), + ('A', {'A': 80, 'C': 10, 'G': 10}, {'A': 80, 'C': 10, 'G': 10}), ] - for depths, expected in tests: - self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._depths_look_het(depths)) + for ref_base, depths, expected in tests: + self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._filter_depths(ref_base, depths)) + + + #def test_depths_look_het(self): + # '''test _depths_look_het''' + # tests = [ + # ([1], False), + # ([2], False), + # ([3], False), + # ([4], False), + # ([5], False), + # ([90, 1], False), + # ([90, 9], False), + # ([90, 10], True), + # ([9, 1], False), + # ([9, 2], True), + # ([1, 2], True), + # ([89, 10, 1], True), + # ([89, 9, 2], False), + # ([90, 2, 1, 1], False), + # ([97, 2, 1], False), + # ] + + # for depths, expected in tests: + # self.assertEqual(expected, summary_cluster_variant.SummaryClusterVariant._depths_look_het(depths)) def test_get_is_het_and_percent(self): '''test _get_is_het_and_percent''' tests = [ - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', (False, 100.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tT,A\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', (True, 25.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tA,T\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', (True, 75.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 40.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t95,5\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 5.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t90,10\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 10.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t90,6,4\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 6.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t3,7,90\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 7.0)), - ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tHET\t.\t.\t.\t.\t.\t.\t.\t.\t84\t84\tA\t50\tA,T\t40,10\t.\t.', (True, 20.0)), - ('ariba_ref1\t23S.rDNA_WHO_F_01358c\t0\t1\t531\t9914\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3120\t744.8\t1\tSNP\tn\tC2597T\t1\tC2597T\tSNP\t2597\t2597\tC\t2755\t2755\tT\t823\tTC,T\t487,1\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T.\tHigh-level resistance to Azithromycin', (False, 100.0)), - ('ariba\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 9.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t70,30\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 30.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 91.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 90.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t10,90\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0)), - ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t9,91\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 9.0)), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnon_coding1:0:0:A14T:id1:foo_bar\tspam eggs', (False, 100.0, 'T')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA42T\t1\tA42T\tSNP\t42\t42\tA\t84\t84\tT\t40\tT,A\t10,30\tnon_coding1:0:0:A42T:id1:foo_bar\tspam eggs', (True, 25.0, 'AT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA62T\t1\tA62T\tSNP\t62\t62\tA\t84\t84\tA\t40\tA,T\t10,30\tnon_coding1:0:0:A62T:id2:foo_bar\tspam eggs', (True, 75.0, 'AT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,G\t10,40,50\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 40.0, 'AGT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t95,5\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 5.0, 'A')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T\t90,10\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 10.0, 'AT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t90,6,4\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (False, 6.0, 'A')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA82T\t1\tA82T\tSNP\t82\t82\tA\t84\t84\tA\t100\tA,T,C\t3,7,90\tnon_coding1:0:0:A82T:.:foo_bar\tspam eggs', (True, 7.0, 'ACT')), + ('ariba_ref1\tref1\t0\t0\t531\t78\tcluster1\t120\t100\t98.33\tctg_name\t279\t24.4\t0\tHET\t.\t.\t.\t.\t.\t.\t.\t.\t84\t84\tA\t50\tA,T\t40,10\t.\t.', (True, 20.0, 'AT')), + ('ariba_ref1\t23S.rDNA_WHO_F_01358c\t0\t1\t531\t9914\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3120\t744.8\t1\tSNP\tn\tC2597T\t1\tC2597T\tSNP\t2597\t2597\tC\t2755\t2755\tT\t823\tC,T\t487,1\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T.\tHigh-level resistance to Azithromycin', (False, 0.2, 'C')), + ('ariba\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 9.0, 'C')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t0\t.\t.\t2597\t2597\tC\t2928\t2928\tC\t410\tC,T\t70,30\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 30.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t91,9\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (False, 91.0, 'T')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t90,10\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 90.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t50,50\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 50.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t10,90\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 10.0, 'CT')), + ('ariba_23S.rDNA_WHO_F_01358c\t23S.rDNA_WHO_F_01358c\t0\t1\t659\t4168\t23S\t2890\t2890\t99.86\t23S.scaffold.1\t3628\t344.0\t1\tSNP\tn\tC2597T\t1\t.\t.\t2597\t2597\tC\t2928\t2928\tT\t410\tT,C\t9,91\t23S.rDNA_WHO_F_01358c:0:1:C2597T:.:E coli C2611T\t.', (True, 9.0, 'CT')), ] for line, expected in tests: @@ -88,10 +110,10 @@ def test_init(self): expected = [ {'coding': True, 'known': True, 'var_string': 'I14L', 'var_group': '.', 'het_percent': None}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 100.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 25.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 50.0}, - {'coding': False, 'known': True, 'var_string': 'A14T', 'var_group': 'id1', 'het_percent': 10.0}, + {'coding': False, 'known': True, 'var_string': '14T', 'var_group': 'id1', 'het_percent': 100.0}, + {'coding': False, 'known': True, 'var_string': '14AT', 'var_group': 'id1', 'het_percent': 25.0}, + {'coding': False, 'known': True, 'var_string': '14AGT', 'var_group': 'id1', 'het_percent': 50.0}, + {'coding': False, 'known': True, 'var_string': '14AT', 'var_group': 'id1', 'het_percent': 10.0}, ] assert len(lines) == len(expected) diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py index f98e6e60..90665fe9 100644 --- a/ariba/tests/summary_test.py +++ b/ariba/tests/summary_test.py @@ -11,11 +11,11 @@ def test_init(self): '''Test init''' fofn = os.path.join(data_dir, 'summary_test_init.fofn') s = summary.Summary('out', fofn=fofn) - self.assertEqual(s.filenames, ['file1', 'file2']) - s = summary.Summary('out', filenames=['file42']) - self.assertEqual(s.filenames, ['file42']) - s = summary.Summary('out', fofn=fofn, filenames=['file42']) - self.assertEqual(s.filenames, ['file42', 'file1', 'file2']) + self.assertEqual(s.filenames, {'file1': None, 'file2': None}) + s = summary.Summary('out', filenames={'file42': None}) + self.assertEqual(s.filenames, {'file42': None}) + s = summary.Summary('out', fofn=fofn, filenames={'file42': None}) + self.assertEqual(s.filenames, {'file42': None, 'file1': None, 'file2': None}) def test_determine_cluster_cols(self): @@ -43,6 +43,19 @@ def test_determine_cluster_cols(self): self.assertEqual(expected[i], summary.Summary._determine_cluster_cols(col_strings[i])) + def test_load_fofn(self): + '''Test _load_fofn''' + infile = os.path.join(data_dir, 'summary_test_load_fofn') + got = summary.Summary._load_fofn(infile) + expected = { + '/foo/bar/abc.tsv': None, + '/spam/eggs/file1.tsv': 'short_name1', + '/spam/eggs/file2.tsv': 'short_name2', + '/spam/eggs/file3.tsv': None + } + self.assertEqual(expected, got) + + def test_load_input_files(self): '''Test _load_input_files''' file1 = os.path.join(data_dir, 'summary_test_load_input_files.1.tsv') @@ -214,7 +227,7 @@ def test_gather_unfiltered_output_data(self): self.maxDiff = None s = summary.Summary('out', filenames=infiles) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) @@ -226,20 +239,20 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[1]]['noncoding1']['groups'] = {'id1': 'het', 'id1.%': 80.0, 'id3': 'yes', 'id3.%': 100.0} expected_all[infiles[1]]['noncoding2']['groups'] = {'id2': 'het', 'id2.%': 40.0} s = summary.Summary('out', filenames=infiles, show_var_groups=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) - expected_potential_cols['noncoding1']['vars'] = {'A14T.%', 'A6G', 'A6G.%', 'A14T'} - expected_potential_cols['noncoding2']['vars'] = {'A52T', 'A52T.%', 'A42T', 'A42T.%'} + expected_potential_cols['noncoding1']['vars'] = {'14T', '14T.%', '14GT', '14GT.%', '6G', '6G.%'} + expected_potential_cols['noncoding2']['vars'] = {'52GT', '52GT.%', '42T', '42T.%'} - expected_all[infiles[0]]['noncoding1']['vars'] = {'A14T': 'yes', 'A14T.%': 100.0} - expected_all[infiles[0]]['noncoding2']['vars'] = {'A42T': 'yes', 'A42T.%': 100.0, 'A52T': 'het', 'A52T.%': 40.0} - expected_all[infiles[1]]['noncoding1']['vars'] = {'A14T': 'het', 'A14T.%': 80.0, 'A6G': 'yes', 'A6G.%': 100.0} - expected_all[infiles[1]]['noncoding2']['vars'] = {'A52T': 'het', 'A52T.%': 40.0} + expected_all[infiles[0]]['noncoding1']['vars'] = {'14T': 'yes', '14T.%': 100.0} + expected_all[infiles[0]]['noncoding2']['vars'] = {'42T': 'yes', '42T.%': 100.0, '52GT': 'het', '52GT.%': 40.0} + expected_all[infiles[1]]['noncoding1']['vars'] = {'14GT': 'het', '14GT.%': 80.0, '6G': 'yes', '6G.%': 100.0} + expected_all[infiles[1]]['noncoding2']['vars'] = {'52GT': 'het', '52GT.%': 40.0} s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) @@ -250,7 +263,7 @@ def test_gather_unfiltered_output_data(self): expected_all[infiles[0]]['presence_absence2']['vars'] = {'V175L': 'yes'} expected_all[infiles[1]]['presence_absence1']['vars'] = {'A10V': 'yes'} s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() self.assertEqual(expected_potential_cols, s.all_potential_columns) self.assertEqual(expected_all, s.all_data) @@ -263,16 +276,23 @@ def test_to_matrix_all_cols(self): os.path.join(data_dir, 'summary_to_matrix.2.tsv') ] - s = summary.Summary('out', filenames=infiles, show_var_groups=True, show_known_vars=True, show_novel_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + fofn = 'tmp.summary_to_matrix_all_cols' + with open(fofn, 'w') as f: + print(infiles[0], 'sample1', file=f) + print(infiles[1], file=f) + + + s = summary.Summary('out', fofn=fofn, show_var_groups=True, show_known_vars=True, show_novel_vars=True) + os.unlink(fofn) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) - expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'noncoding2.A42T:o1', 'noncoding2.A42T.%:c2', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] - expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.A42T', 'noncoding2.A42T.%', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding1.14GT:o1', 'noncoding1.14GT.%:c2', 'noncoding1.14T:o1', 'noncoding1.14T.%:c2', 'noncoding1.6G:o1', 'noncoding1.6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'noncoding2.42T:o1', 'noncoding2.42T.%:c2', 'noncoding2.52GT:o1', 'noncoding2.52GT.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding1.14GT', 'noncoding1.14GT.%', 'noncoding1.14T', 'noncoding1.14T.%', 'noncoding1.6G', 'noncoding1.6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'noncoding2.42T', 'noncoding2.42T.%', 'noncoding2.52GT', 'noncoding2.52GT.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] expected_matrix = [ - [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], - [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + ['sample1', 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes_multi_het', 'NA', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'het', 80.0, 'no', 'NA', 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'het', 40.0, 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] ] self.assertEqual(expected_phandango_header, got_phandango_header) @@ -288,9 +308,9 @@ def test_to_matrix_with_groups(self): ] s = summary.Summary('out', filenames=infiles, show_var_groups=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.id1:o1', 'noncoding1.id1.%:c2', 'noncoding1.id3:o1', 'noncoding1.id3.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.id2:o1', 'noncoding2.id2.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.id1', 'noncoding1.id1.%', 'noncoding1.id3', 'noncoding1.id3.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.id2', 'noncoding2.id2.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] @@ -312,15 +332,15 @@ def test_to_matrix_with_vars(self): ] s = summary.Summary('out', filenames=infiles, show_known_vars=True, show_novel_vars=True) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) - expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.A14T:o1', 'noncoding1.A14T.%:c2', 'noncoding1.A6G:o1', 'noncoding1.A6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.A42T:o1', 'noncoding2.A42T.%:c2', 'noncoding2.A52T:o1', 'noncoding2.A52T.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] - expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.A14T', 'noncoding1.A14T.%', 'noncoding1.A6G', 'noncoding1.A6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.A42T', 'noncoding2.A42T.%', 'noncoding2.A52T', 'noncoding2.A52T.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] + expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding1.14GT:o1', 'noncoding1.14GT.%:c2', 'noncoding1.14T:o1', 'noncoding1.14T.%:c2', 'noncoding1.6G:o1', 'noncoding1.6G.%:c2', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'noncoding2.42T:o1', 'noncoding2.42T.%:c2', 'noncoding2.52GT:o1', 'noncoding2.52GT.%:c2', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1', 'presence_absence1.A10V:o1'] + expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding1.14GT', 'noncoding1.14GT.%', 'noncoding1.14T', 'noncoding1.14T.%', 'noncoding1.6G', 'noncoding1.6G.%', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'noncoding2.42T', 'noncoding2.42T.%', 'noncoding2.52GT', 'noncoding2.52GT.%', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var', 'presence_absence1.A10V'] expected_matrix = [ - [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], - [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] + [infiles[0], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'no', 'NA', 'yes', 100.0, 'no', 'NA', 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'yes', 100.0, 'het', 40.0, 'yes', 'yes', 'presence_absence_ref1', '98.96', 'no', 'yes', 'yes'], + [infiles[1], 'yes', 'yes', 'noncoding_ref1', '98.33', 'yes', 'no', 'het', 80.0, 'no', 'NA', 'yes', 100.0, 'yes', 'yes', 'noncoding_ref2', '98.33', 'yes', 'no', 'no', 'NA', 'het', 40.0, 'yes', 'yes', 'presence_absence1', '98.96', 'no', 'yes', 'yes'] ] self.assertEqual(expected_phandango_header, got_phandango_header) @@ -336,9 +356,9 @@ def test_to_matrix_cluster_only(self): ] s = summary.Summary('out', filenames=infiles) - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding1.match:o1', 'noncoding1.ref_seq:o2', 'noncoding1.pct_id:c1', 'noncoding1.known_var:o1', 'noncoding1.novel_var:o1', 'noncoding2.assembled:o1', 'noncoding2.match:o1', 'noncoding2.ref_seq:o3', 'noncoding2.pct_id:c1', 'noncoding2.known_var:o1', 'noncoding2.novel_var:o1', 'presence_absence1.assembled:o1', 'presence_absence1.match:o1', 'presence_absence1.ref_seq:o4', 'presence_absence1.pct_id:c1', 'presence_absence1.known_var:o1', 'presence_absence1.novel_var:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding1.match', 'noncoding1.ref_seq', 'noncoding1.pct_id', 'noncoding1.known_var', 'noncoding1.novel_var', 'noncoding2.assembled', 'noncoding2.match', 'noncoding2.ref_seq', 'noncoding2.pct_id', 'noncoding2.known_var', 'noncoding2.novel_var', 'presence_absence1.assembled', 'presence_absence1.match', 'presence_absence1.ref_seq', 'presence_absence1.pct_id', 'presence_absence1.known_var', 'presence_absence1.novel_var'] @@ -360,9 +380,9 @@ def test_to_matrix_assembled_only(self): ] s = summary.Summary('out', filenames=infiles, cluster_cols='assembled') - s.samples = summary.Summary._load_input_files(infiles, 90) + s.samples = summary.Summary._load_input_files(s.filenames, 90) s._gather_unfiltered_output_data() - got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(infiles, s.all_data, s.all_potential_columns, s.cluster_columns) + got_phandango_header, got_csv_header, got_matrix = summary.Summary._to_matrix(s.filenames, s.all_data, s.all_potential_columns, s.cluster_columns) expected_phandango_header = ['name', 'noncoding1.assembled:o1', 'noncoding2.assembled:o1', 'presence_absence1.assembled:o1'] expected_csv_header = ['name', 'noncoding1.assembled', 'noncoding2.assembled', 'presence_absence1.assembled'] diff --git a/scripts/ariba b/scripts/ariba index 2ba73d82..d59e2649 100755 --- a/scripts/ariba +++ b/scripts/ariba @@ -56,6 +56,55 @@ subparser_getref.add_argument('outprefix', help='Prefix of output filenames') subparser_getref.set_defaults(func=ariba.tasks.getref.run) +#----------------------------- micplot ------------------------------- +subparser_micplot = subparsers.add_parser( + 'micplot', + help='Make violin/dot plots using MIC data', + usage='ariba prepareref [options] ', + description='Makes a violin and scatter plot of MIC per variant in the summary file', +) +subparser_micplot.add_argument('prepareref_dir', help='Name of output directory when "ariba prepareref" was run') +subparser_micplot.add_argument('antibiotic', help='Antibiotic name. Must exactly match a column from the MIC file') +subparser_micplot.add_argument('mic_file', help='File containing MIC data for each sample and one or more antibiotics') +subparser_micplot.add_argument('summary_file', help='File made by running "ariba summary"') +subparser_micplot.add_argument('outprefix', help='Prefix of output files') + +micplot_general_group = subparser_micplot.add_argument_group('General options') +micplot_general_group.add_argument('--out_format', help='Output format of image file. Use anything that matplotlib can save to, eg pdf or png [%(default)s]', default='pdf') +micplot_general_group.add_argument('--main_title', help='Main title of plot. Default is to use the antibiotic name', metavar='"title in quotes"') +micplot_general_group.add_argument('--plot_height', help='Height of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--plot_width', help='Width of plot in inches [%(default)s]', default=7, type=float, metavar='FLOAT') +micplot_general_group.add_argument('--use_hets', choices=['yes', 'no', 'exclude'], help='How to deal with HET snps. Choose from yes,no,exclude. yes: count a het SNP as present. no: do not count a het SNP as present. exclude: completely remove any sample with any het SNP [%(default)s]', default='yes') +micplot_general_group.add_argument('--interrupted', action='store_true', help='Include interrupted genes (as in the assembled column of the ariba summary files)') +micplot_general_group.add_argument('--min_samples', type=int, help='Minimum number of samples in each column required to include in plot [%(default)s]', metavar='INT', default=1) +micplot_general_group.add_argument('--no_combinations', action='store_true', help='Do not show combinations of variants. Instead separate out into one box/violin plot per variant.') +micplot_general_group.add_argument('--panel_heights', help='Two integers that determine relative height of top and bottom plots. eg 5,1 means ratio of 5:1 between top and bottom panel heights [%(default)s]', default='9,2', metavar='height1,height2') +micplot_general_group.add_argument('--panel_widths', help='Two integers that determine relative width of plots and space used by counts legend. eg 5,1 means ratio of 5:1 between top and bottom panel widths. Only applies when plotting points and --point_size 0 [%(default)s]', default='5,1', metavar='width1,width2') +micplot_general_group.add_argument('--count_legend_x', type=float, help='Control x position of counts legend when plotting points and --point_size 0 [%(default)s]', default=-2, metavar='FLOAT') +micplot_general_group.add_argument('--p_cutoff', type=float, help='p-value cutoff for Mann-Whitney tests [%(default)s]', default=0.05) +micplot_general_group.add_argument('--xkcd', action='store_true', help='Best used with xkcd font installed ;)') + +micplot_colour_group = subparser_micplot.add_argument_group('Colour options') +micplot_colour_group.add_argument('--colourmap', help='Colours to use. See http://matplotlib.org/users/colormaps.html [%(default)s]', default='Accent', metavar='colourmap name') +micplot_colour_group.add_argument('--number_of_colours', type=int, help='Number of colours in plot. 0:same number as columns in the plot. 1:all black. >1: take the first N colours from the colourmap specified by --colourmap and cycle them [%(default)s]', default=0, metavar='INT') +micplot_colour_group.add_argument('--colour_skip', help='If using a continuous colourmap, --colour_skip a,b (where 0 <= a < b <= 1) will skip the range between a and b. Useful for excluding near-white colours', metavar='FLOAT1,FLOAT2') + +micplot_upper_plot_group = subparser_micplot.add_argument_group('Upper plot options') +micplot_upper_plot_group.add_argument('--plot_types', help='Types of plots to make, separated by commas. Choose from violin,point [%(default)s]', default='violin,point', metavar='type1,type2,...') +micplot_upper_plot_group.add_argument('--hlines', help='Comma-separated list of positions at which to draw horizontal lines. Default is to draw no lines.', metavar='float1,float2,...', default='') +micplot_upper_plot_group.add_argument('--jitter_width', help='Jitter width option when plotting points [%(default)s]', default=0.1, type=float, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--log_y', type=float, help='Base of log applied to y values. Set to zero to not log [%(default)s]', default=2, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--point_size', type=float, help='Size of points when --plot_types includes point. If zero, will group points and size them proportional to the group size [%(default)s]', default=4, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--point_scale', type=float, help='Scale point sizes when --point_size 0. All point sizes are multiplied by this number. Useful if you have large data set [%(default)s]', default=1, metavar='FLOAT') +micplot_upper_plot_group.add_argument('--violin_width', type=float, help='Width of violins [%(default)s]', default=0.75) + +micplot_lower_plot_group = subparser_micplot.add_argument_group('Lower plot options') +micplot_lower_plot_group.add_argument('--dot_size', type=float, help='Size of dots in lower part of plot [%(default)s]', default=100, metavar='FLOAT') +micplot_lower_plot_group.add_argument('--dot_outline', action='store_true', help='Black outline around all dots (whether coloured or not) in lower part of plots') +micplot_lower_plot_group.add_argument('--dot_y_text_size', type=int, help='Text size of labels [%(default)s]', default=7, metavar='INT') + +subparser_micplot.set_defaults(func=ariba.tasks.micplot.run) + #----------------------------- prepareref ------------------------------- subparser_prepareref = subparsers.add_parser( 'prepareref', @@ -187,14 +236,14 @@ subparser_summary = subparsers.add_parser( epilog='Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input.' ) -subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME') +subparser_summary.add_argument('-f', '--fofn', help='File of filenames of ariba reports to be summarised. Must be used if no input files listed after the outfile. The first column should be the filename. An optional second column can be used to specify a sample name for that file, which will be used instead of the filename in output files. Columns separated by whitespace.', metavar='FILENAME') subparser_summary.add_argument('--preset', choices=summary_presets, help='Shorthand for setting --cluster_cols,--col_filter,--row_filter,--v_groups,--variants. Using this overrides those options', metavar='|'.join(summary_presets)) subparser_summary.add_argument('--cluster_cols', help='Comma separated list of cluster columns to include. Choose from: assembled, match, ref_seq, pct_id, known_var, novel_var [%(default)s]', default='match', metavar='col1,col2,...') subparser_summary.add_argument('--col_filter', choices=['y', 'n'], default='y', help='Choose whether columns where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') subparser_summary.add_argument('--no_tree', action='store_true', help='Do not make phandango tree') subparser_summary.add_argument('--row_filter', choices=['y', 'n'], default='y', help='Choose whether rows where all values are "no" or "NA" are removed [%(default)s]', metavar='y|n') subparser_summary.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT') -subparser_summary.add_argument('--only_cluster', help='Only report data for the given cluster name', metavar='Cluster_name') +subparser_summary.add_argument('--only_clusters', help='Only report data for the given comma-separated list of cluster names, eg: cluster1,cluster2,cluster42', metavar='Cluster_names') subparser_summary.add_argument('--v_groups', action='store_true', help='Show a group column for each group of variants') subparser_summary.add_argument('--known_variants', action='store_true', help='Report all known variants') subparser_summary.add_argument('--novel_variants', action='store_true', help='Report all novel variants') diff --git a/setup.py b/setup.py index b50f6a61..96a51b9b 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ setup( ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod], name='ariba', - version='2.7.2', + version='2.8.1', description='ARIBA: Antibiotic Resistance Identification By Assembly', packages = find_packages(), package_data={'ariba': ['test_run_data/*']}, @@ -68,6 +68,7 @@ install_requires=[ 'BeautifulSoup4 >= 4.1.0', 'dendropy >= 4.2.0', + 'matplotlib', 'pyfastaq >= 3.12.0', 'pysam >= 0.9.1', 'pymummer>=0.10.2',