Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Summary inconsistent bug #73

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions ariba/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def __init__(
filenames=None,
fofn=None,
include_all_variant_columns=False,
min_id=90.0
min_id=90.0,
verbose=False,
):
if filenames is None and fofn is None:
raise Error('Error! Must supply filenames or fofn to Summary(). Cannot continue')
Expand All @@ -32,6 +33,7 @@ def __init__(
self.include_all_variant_columns = include_all_variant_columns
self.min_id = min_id
self.outprefix = outprefix
self.verbose = verbose


def _load_fofn(self, fofn):
Expand All @@ -48,11 +50,13 @@ def _check_files_exist(self):


@classmethod
def _load_input_files(cls, filenames, min_id, verbose=False):
    '''Load each input file into a SummarySample and return them in a dict.

    filenames: iterable of input file names.
    min_id: passed to SummarySample as min_pc_id.
    verbose: if true, print a line to stdout after each file is loaded.

    Returns a dict of filename -> SummarySample, where run() has already
    been called on every SummarySample.
    '''
    samples = {}

    for filename in filenames:
        sample = summary_sample.SummarySample(filename, min_pc_id=min_id)
        sample.run()
        samples[filename] = sample

        if verbose:
            # Flush so progress shows up promptly when stdout is redirected.
            print('Loaded file', filename, flush=True)

    return samples


Expand Down Expand Up @@ -250,22 +254,45 @@ def _newick_from_dist_matrix(cls, distance_file, outfile):


def run(self):
    '''Load the input files and write the summary output files.

    Always writes <outprefix>.csv. If more than one sample was loaded, also
    writes <outprefix>.phandango.csv, then a distance matrix file that is
    used to make <outprefix>.phandango.tre and is deleted afterwards.

    Progress messages are printed to stdout when self.verbose is true.
    Calls sys.exit(1) if filtering leaves no clusters.
    '''
    if self.verbose:
        print('Loading input files...', flush=True)
    self._check_files_exist()
    self.samples = self._load_input_files(self.filenames, self.min_id, verbose=self.verbose)

    if self.verbose:
        print('Generating output rows', flush=True)
    self.rows = self._gather_output_rows()

    if self.verbose:
        print('Filtering columns', flush=True)
    self.rows, remaining_clusters = Summary._filter_clusters(self.rows)

    if remaining_clusters == 0:
        print('No clusters found that are present in any sample. Will not write any output files', file=sys.stderr)
        sys.exit(1)

    csv_file = self.outprefix + '.csv'
    if self.verbose:
        print('Writing csv file', csv_file, flush=True)
    Summary._write_csv(self.filenames, self.rows, csv_file, phandango=False)

    if len(self.samples) > 1:
        # Set the Phandango file name before reporting it, so that the
        # verbose message names the file that is actually written.
        csv_file = self.outprefix + '.phandango.csv'
        if self.verbose:
            print('Making Phandango csv file', csv_file, flush=True)
        lines = Summary._write_csv(self.filenames, self.rows, csv_file, phandango=True)
        dist_matrix_file = self.outprefix + '.phandango.distance_matrix'
        tree_file = self.outprefix + '.phandango.tre'

        if self.verbose:
            print('Making Phandango distance matrix', dist_matrix_file, flush=True)
        Summary._write_distance_matrix(lines, dist_matrix_file)

        if self.verbose:
            print('Making Phandango tree file', tree_file, flush=True)
        Summary._newick_from_dist_matrix(dist_matrix_file, tree_file)
        # The distance matrix is only an intermediate for making the tree.
        os.unlink(dist_matrix_file)
    else:
        print('Made csv file. Not making Phandango files because only one input file given', file=sys.stderr)

    if self.verbose:
        print('Finished', flush=True)
8 changes: 5 additions & 3 deletions ariba/summary_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,17 +146,19 @@ def _get_nonsynonymous_var(data_dict):

if not has_nonsyn:
return None
elif data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.':
raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change and ref_ctg_change both equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
elif data_dict['known_var_change'] == data_dict['ref_ctg_change'] == '.' == data_dict['ref_ctg_effect']:
raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change, ref_ctg_change, ref_ctg_effect all equal to ".", but has a non synonymous change. Something is inconsistent. Cannot continue')
else:
if '.' not in [data_dict['known_var_change'], data_dict['ref_ctg_change']] and \
data_dict['known_var_change'] != data_dict['ref_ctg_change']:
raise Error('Unexpected data in ariba summary... \n' + str(data_dict) + '\n... known_var_change != ref_ctg_change. Cannot continue')

if data_dict['known_var_change'] != '.':
return data_dict['known_var_change']
else:
elif data_dict['ref_ctg_change'] != '.':
return data_dict['ref_ctg_change']
else:
return data_dict['ref_ctg_effect']


def column_summary_data(self):
Expand Down
2 changes: 2 additions & 0 deletions ariba/tasks/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def run():
parser.add_argument('--no_var_columns', action='store_true', help='Do not keep a column for every variant. Default is to include them')
parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by default, they are removed from the output)')
parser.add_argument('--verbose', action='store_true', help='Be verbose')
parser.add_argument('outprefix', help='Prefix of output files')
parser.add_argument('infiles', nargs='*', help='Files to be summarised')
options = parser.parse_args()
Expand All @@ -22,5 +23,6 @@ def run():
filenames=options.infiles,
include_all_variant_columns=(not options.no_var_columns),
min_id=options.min_id,
verbose=options.verbose
)
s.run()
7 changes: 7 additions & 0 deletions ariba/tests/summary_cluster_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,13 @@ def test_get_nonsynonymous_var(self):
d['known_var_change'] = '.'
self.assertEqual('P43Q', summary_cluster.SummaryCluster._get_nonsynonymous_var(d))

d['ref_ctg_change'] = '.'
with self.assertRaises(summary_cluster.Error):
summary_cluster.SummaryCluster._get_nonsynonymous_var(d)

d['ref_ctg_effect'] = 'MULTIPLE'
self.assertEqual('MULTIPLE', summary_cluster.SummaryCluster._get_nonsynonymous_var(d))


def test_column_summary_data(self):
'''Test column_summary_data'''
Expand Down