From c3cb0ed616a16d3ddaa5b25b84c9ea37d6e3a6a1 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 30 May 2024 20:23:22 -0500 Subject: [PATCH] Cleaning up errors, adding restrictions --- neat/common/io.py | 35 +++++++++++-------- neat/gen_mut_model/runner.py | 12 +++---- neat/gen_mut_model/utils.py | 2 +- neat/model_fragment_lengths/runner.py | 11 +++--- neat/model_sequencing_error/utils.py | 10 ++++-- neat/models/models.py | 6 ++-- neat/read_simulator/utils/bed_func.py | 7 ++-- neat/read_simulator/utils/generate_reads.py | 4 +-- .../read_simulator/utils/generate_variants.py | 7 ++-- neat/read_simulator/utils/options.py | 32 +++++++++-------- neat/read_simulator/utils/read.py | 5 ++- neat/read_simulator/utils/vcf_func.py | 18 ++++++---- neat/variants/duplication.py | 7 +++- neat/variants/inversion.py | 6 ++-- 14 files changed, 98 insertions(+), 64 deletions(-) diff --git a/neat/common/io.py b/neat/common/io.py index e0566c23..7b03950e 100644 --- a/neat/common/io.py +++ b/neat/common/io.py @@ -14,11 +14,13 @@ import gzip import logging import os +import sys + from pathlib import Path from typing import Callable, Iterator, TextIO from Bio import bgzf -log = logging.getLogger(__name__) +_LOG = logging.getLogger(__name__) def is_compressed(file: str | Path) -> bool: @@ -85,9 +87,9 @@ def open_output(path: str | Path, mode: str = 'wt') -> Iterator[TextIO]: Raises ------ - FileExistsError + 3 = FileExistsError Raised if the output file already exists. - PermissionError + 11 = PermissionError Raised if the calling process does not have adequate access rights to write to the output file. """ @@ -100,7 +102,8 @@ def open_output(path: str | Path, mode: str = 'wt') -> Iterator[TextIO]: # bgzf is old code and doesn't use "xt" mode, annoyingly. This manual check should suffice. if mode == "xt": if output_path.exists(): - raise FileExistsError(f"file '{path}' already exists") + _LOG.error(f"file '{path}' already exists") + sys.exit(3) else: mode = "wt" open_ = bgzf.open @@ -125,11 +128,11 @@ def validate_input_path(path: str | Path): Raises ------ - FileNotFoundError + 5 = FileNotFoundError Raised if the input file does not exist or is not a file. - RuntimeError + 7 = RuntimeError Raised if the input file is empty. - PermissionError + 9 = PermissionError Raised if the calling process has no read access to the file. """ path = Path(path) @@ -137,14 +140,16 @@ def validate_input_path(path: str | Path): if not path.is_file(): mssg += f"Path '{path}' does not exist or not a file" - raise FileNotFoundError(mssg) + _LOG.error(mssg) + sys.exit(5) stats = path.stat() if stats.st_size == 0: mssg += f"File '{path}' is empty" - raise RuntimeError(mssg) + _LOG.error(mssg) + sys.exit(7) if not os.access(path, os.R_OK): mssg += f"cannot read from '{path}': access denied" - raise PermissionError(mssg) + _LOG.error(9) def validate_output_path(path: str | Path, is_file: bool = True, overwrite: bool = False): @@ -161,18 +166,20 @@ def validate_output_path(path: str | Path, is_file: bool = True, overwrite: bool Raises ------ - FileExistsError + 3 = FileExistsError Raised if path is a file and already exists. - PermissionError + 11 = PermissionError Raised if the calling process does not have adequate access rights to. """ path = Path(path) if is_file: if path.is_file() and not overwrite: - raise FileExistsError(f"file '{path}' already exists") + _LOG.error(f"file '{path}' already exists") + sys.exit(3) else: if path.is_dir(): if not os.access(path, os.W_OK): - raise PermissionError(f"cannot write to '{path}', access denied") + _LOG.error(f"cannot write to '{path}', access denied") + sys.exit(11) else: path.parent.mkdir(parents=True, exist_ok=True) diff --git a/neat/gen_mut_model/runner.py b/neat/gen_mut_model/runner.py index 308f4362..5fb1b22d 100644 --- a/neat/gen_mut_model/runner.py +++ b/neat/gen_mut_model/runner.py @@ -5,11 +5,11 @@ import os.path import pathlib import pickle +import sys import numpy as np from Bio import SeqIO - from pathlib import Path import logging @@ -81,7 +81,7 @@ def runner(reference_index, if len(ignore) == len(reference_index): _LOG.error("No valid human chromosome names detected. Check contig names reference.") - raise ValueError + sys.exit(1) # Pre-parsing to find all the matching chromosomes between ref and vcf _LOG.info('Processing VCF file...') @@ -91,7 +91,7 @@ def runner(reference_index, if not matching_variants or not matching_chromosomes: _LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.") - raise ValueError + sys.exit(1) trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index, bed, @@ -100,7 +100,7 @@ def runner(reference_index, if not trinuc_ref_count: _LOG.error("No valid trinucleotides detected in reference.") - raise ValueError + sys.exit(1) """ Collect and analyze the data in the VCF file @@ -155,7 +155,7 @@ def runner(reference_index, else: _LOG.error(f'Ref allele in variant call does not match reference: {variant}') - raise ValueError + sys.exit(1) else: indel_len = len(variant[3]) - len(variant[2]) @@ -214,7 +214,7 @@ def runner(reference_index, if not total_var: _LOG.error('Error: No valid variants were found, model could not be created. ' 'Check that names are compatible.') - raise ValueError + sys.exit(1) # COMPUTE PROBABILITIES diff --git a/neat/gen_mut_model/utils.py b/neat/gen_mut_model/utils.py index 175c508b..78356d81 100644 --- a/neat/gen_mut_model/utils.py +++ b/neat/gen_mut_model/utils.py @@ -315,7 +315,7 @@ def convert_trinuc_transition_matrix(trans_probs): _LOG.error("Repeat Trinuc detected.") _LOG.debug(f'Error on {ALL_CONTEXTS[context]}: ' f'{ALLOWED_NUCL[mutation_ref]} -> {ALLOWED_NUCL[mutation_alt]}') - raise ValueError + sys.exit(1) return ret_matrix diff --git a/neat/model_fragment_lengths/runner.py b/neat/model_fragment_lengths/runner.py index 44cb1c03..11dbc787 100755 --- a/neat/model_fragment_lengths/runner.py +++ b/neat/model_fragment_lengths/runner.py @@ -6,6 +6,7 @@ import pickle import numpy as np import logging +import sys from pathlib import Path @@ -44,21 +45,21 @@ def compute_fraglen_runner(file: str | Path, filter_minreads: int, output: str | _LOG.info("Counting fragments") all_tlens = count_frags(input_file) if not all_tlens: - raise ValueError("No valid template lengths in sam file_list.") + _LOG.error("No valid template lengths in sam file_list.") + sys.exit(1) _LOG.info("Filtering fragments") filtered_lengths = filter_lengths(all_tlens, filter_minreads) if not filtered_lengths: - raise ValueError("No data passed the filter, nothing to calculate. Try adjusting the filter settings.") + _LOG.error("No data passed the filter, nothing to calculate. Try adjusting the filter settings.") + sys.exit(1) _LOG.info("Building model") st_dev = float(np.std(filtered_lengths)) mean = float(np.mean(filtered_lengths)) - max_tlen = max(filtered_lengths) - min_tlen = min(filtered_lengths) - model = FragmentLengthModel(st_dev, mean, max_tlen, min_tlen) + model = FragmentLengthModel(mean, st_dev) _LOG.info(f'Saving model: {output}') with gzip.open(output, 'w+') as outfile: pickle.dump(model, outfile) diff --git a/neat/model_sequencing_error/utils.py b/neat/model_sequencing_error/utils.py index 8a4e61e7..b494d4dd 100644 --- a/neat/model_sequencing_error/utils.py +++ b/neat/model_sequencing_error/utils.py @@ -6,6 +6,7 @@ import numpy as np # TODO implement plotting import matplotlib.pyplot as plt +import sys from scipy.stats import mode from ..common import open_input @@ -31,7 +32,8 @@ def convert_quality_string(qual_str: str, offset: int): try: ret_list.append(ord(qual_str[i]) - offset) except ValueError: - raise ValueError("improperly formatted fastq file") + _LOG.error("improperly formatted fastq file") + sys.exit(1) return ret_list @@ -45,7 +47,8 @@ def expand_counts(count_array: list, scores: list): :return np.ndarray: a one-dimensional array reflecting the expanded count """ if len(count_array) != len(scores): - raise ValueError("Count array and scores have different lengths.") + _LOG.error("Count array and scores have different lengths.") + sys.exit(1) ret_list = [] for i in range(len(count_array)): @@ -89,7 +92,8 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse if readlen_mode.count < (0.5 * len(readlens)): _LOG.warning("Highly variable read lengths detected. Results may be less than ideal.") if readlen_mode.count < 20: - raise ValueError(f"Dataset is too scarce or inconsistent to make a model. Try a different input.") + _LOG.error(f"Dataset is too scarce or inconsistent to make a model. Try a different input.") + sys.exit(1) read_length = int(readlen_mode.mode) else: diff --git a/neat/models/models.py b/neat/models/models.py index 73d2b581..bbbe19c4 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -7,6 +7,7 @@ import re import logging import abc +import sys from numpy.random import Generator from Bio.Seq import Seq @@ -235,7 +236,8 @@ def __init__(self, self.homozygous_freq = homozygous_freq if not np.isclose(sum(variant_probs.values()), 1): - raise ValueError("Probabilities do not add up to 1.") + _LOG.error("Probabilities do not add up to 1.") + sys.exit(1) self.variant_probs = variant_probs self.transition_matrix = transition_matrix @@ -558,12 +560,10 @@ def __init__(self, self.rng = rng def generate_fragments(self, - total_length: int, number_of_fragments: int) -> list: """ Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set - :param total_length: Length of the reference segment we are covering. :param number_of_fragments: The number of fragments needed. :return: A list of fragment random fragment lengths sampled from the model. """ diff --git a/neat/read_simulator/utils/bed_func.py b/neat/read_simulator/utils/bed_func.py index 1d13bfd5..c0c43ff5 100644 --- a/neat/read_simulator/utils/bed_func.py +++ b/neat/read_simulator/utils/bed_func.py @@ -4,6 +4,7 @@ """ import logging import pathlib +import sys from Bio.File import _IndexedSeqFileDict @@ -103,7 +104,7 @@ def parse_single_bed(input_bed: str, [my_chr, pos1, pos2] = line_list[:3] except ValueError: _LOG.error(f"Improperly formatted bed file line {line}") - raise + sys.exit(1) # Bed file chromosome names must match the reference. try: assert my_chr in reference_dictionary @@ -122,7 +123,7 @@ def parse_single_bed(input_bed: str, _LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})") _LOG.error('4th column of mutation rate bed must be a semicolon list of key, value ' 'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".') - raise ValueError + sys.exit(1) # +9 because that's len('mut_rate='). Whatever is that should be our mutation rate. mut_rate = line_list[3][index + 9:] @@ -133,7 +134,7 @@ def parse_single_bed(input_bed: str, _LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})") _LOG.error('4th column of mutation rate bed must be a semicolon list of key, value ' 'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".') - raise + sys.exit(1) if mut_rate > 0.3: _LOG.warning("Found a mutation rate > 0.3. This is unusual.") diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 1dcf94b7..9d832187 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -43,9 +43,9 @@ def cover_dataset( number_reads = ceil((span_length * options.coverage) / options.read_len) # We use fragments to model the DNA - fragment_pool = fragment_model.generate_fragments(span_length, number_reads * 3) + fragment_pool = fragment_model.generate_fragments(number_reads * 3) - # step 1: Divide the span up into segments drawn froam the fragment pool. Assign reads based on that. + # step 1: Divide the span up into segments drawn from the fragment pool. Assign reads based on that. # step 2: repeat above until number of reads exceeds number_reads * 1.5 # step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads read_count = 0 diff --git a/neat/read_simulator/utils/generate_variants.py b/neat/read_simulator/utils/generate_variants.py index 0e44cdba..6883a2c1 100644 --- a/neat/read_simulator/utils/generate_variants.py +++ b/neat/read_simulator/utils/generate_variants.py @@ -8,6 +8,7 @@ import time import numpy as np import re +import sys from Bio import SeqRecord from numpy.random import Generator @@ -207,7 +208,8 @@ def generate_variants(reference: SeqRecord, temp_variant = mutation_model.generate_snv(trinuc, location) else: - raise ValueError(f"Attempting to create an unsupported variant: {variant_type}") + _LOG.error(f"Attempting to create an unsupported variant: {variant_type}") + sys.exit(1) # pick which ploid is mutated temp_variant.genotype = pick_ploids(options.ploidy, mutation_model.homozygous_freq, 1, options.rng) @@ -226,7 +228,8 @@ def generate_variants(reference: SeqRecord, # Here's a counter to make sure we're not getting stuck on a single location debug += 1 if debug > 1000000: - raise RuntimeError("Check this if, as it may be causing an infinite loop.") + _LOG.error("Check this if, as it may be causing an infinite loop.") + sys.exit(999) # No suitable place to put this, so we skip. continue # This sets up a probability array with weights 1 for open spots (x==0) and 0 elsewhere diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index 500687ae..ea2f6271 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -17,6 +17,7 @@ import numpy as np import logging import yaml +import sys try: from yaml import CLoader as Loader, CDumper as Dumper @@ -187,9 +188,8 @@ def check_and_log_error(keyname, value_to_check, lowval, highval): pass elif lowval != "exists" and highval: if not (lowval <= value_to_check <= highval): - mssg = f'`{keyname}` must be between {lowval} and {highval} (input: {value_to_check}).' - _LOG.error(mssg) - raise ValueError(mssg) + _LOG.error(f'`{keyname}` must be between {lowval} and {highval} (input: {value_to_check}).') + sys.exit(1) elif lowval == "exists" and value_to_check: validate_input_path(value_to_check) @@ -210,9 +210,8 @@ def read(self): # check for empty if value is None: if key == "reference": - mssg = "Must entered a value for `reference` in config" - _LOG.error(mssg) - raise ValueError(mssg) + _LOG.error("Must entered a value for `reference` in config") + sys.exit(1) else: _LOG.warning(f"No value entered for `{key}`, skipping.") continue @@ -220,9 +219,8 @@ def read(self): # Now we check that the type is correct, and it is in range, depending on the type defined for it # If it passes that it gets put into the args dictionary. if value != type_of_var(value): - mssg = f"Incorrect type for value entered for {key}: {type_of_var} (found: {value})" - _LOG.error(mssg) - raise ValueError(mssg) + _LOG.error(f"Incorrect type for value entered for {key}: {type_of_var} (found: {value})") + sys.exit(1) self.check_and_log_error(key, value, criteria1, criteria2) self.args[key] = value @@ -251,7 +249,8 @@ def check_options(self): Some sanity checks and corrections to the options. """ if not (self.produce_bam or self.produce_vcf or self.produce_fastq): - raise ValueError('No files would be produced, as all file types are set to false') + _LOG.error('No files would be produced, as all file types are set to false') + sys.exit(1) # This next section just checks all the paired ended stuff if self.paired_ended: @@ -297,12 +296,18 @@ def log_configuration(self): _LOG.info(f"Single threading - 1 thread.") # We'll work on multithreading later... # _LOG.info(f'Multithreading - {options.threads} threads') + _LOG.info(f'Using a read length of {self.read_len}') if self.fragment_mean: + if self.fragment_mean < self.read_len: + _LOG.error(f"`fragment_mean` (input: {self.fragment_mean}) " + f"must be at least as long as `read_len` (input or default: {self.read_len}).") + sys.exit(1) if self.fragment_st_dev: _LOG.info(f'Generating fragments based on mean={self.fragment_mean}, ' f'stand. dev={self.fragment_st_dev}') else: - raise ValueError("Provided fragment mean, but no fragment standard deviation!") + _LOG.error("Provided fragment mean, but no fragment standard deviation!") + sys.exit(1) if self.paired_ended: _LOG.info(f'Running in paired-ended mode.') if self.fragment_model: @@ -310,11 +315,10 @@ def log_configuration(self): elif self.fragment_mean: pass # Already addressed this above else: - raise ValueError("Paired ended mode requires either a fragment model or a mean and standard deviation.") + _LOG.error("Paired ended mode requires either a fragment model or a mean/standard deviation.") + sys.exit(1) else: _LOG.info(f'Running in single-ended mode.') - - _LOG.info(f'Using a read length of {self.read_len}') _LOG.info(f'Average coverage: {self.coverage}') if self.error_model: _LOG.info(f'Using error model: {self.error_model}') diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py index 103a5b5d..c65bbbb5 100644 --- a/neat/read_simulator/utils/read.py +++ b/neat/read_simulator/utils/read.py @@ -9,6 +9,7 @@ """ import logging import numpy as np +import sys from typing import TextIO from Bio.Seq import Seq, MutableSeq @@ -432,7 +433,9 @@ def make_cigar(self): if cig_length < self.length: # Note that samtools will throw an error if this happens. Maybe need to adjust alignment parameters. - raise ValueError("Problem creating cigar string") + _LOG.error("Problem creating cigar string") + sys.exit(1) + # append the final section as we return return cig_string + str(cig_count) + curr_char diff --git a/neat/read_simulator/utils/vcf_func.py b/neat/read_simulator/utils/vcf_func.py index cf12b963..a2c0bb2e 100755 --- a/neat/read_simulator/utils/vcf_func.py +++ b/neat/read_simulator/utils/vcf_func.py @@ -5,7 +5,7 @@ import logging import numpy as np -import re +import sys from Bio import SeqIO @@ -68,7 +68,8 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants], _LOG.info(f"Parsing input vcf {vcf_path}") if tumor_normal: - raise RuntimeError("Cancer methods not yet implemented") + _LOG.error("Cancer methods not yet implemented") + sys.exit(1) n_skipped = 0 mismatched = 0 @@ -91,7 +92,8 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants], max_col += 1 sample_columns = columns[columns.index('FORMAT') + 1:] if not sample_columns: - raise ValueError('Input vcf has FORMAT column but no sample columns.') + _LOG.error('Input vcf has FORMAT column but no sample columns.') + sys.exit(1) else: _LOG.warning('Missing format column in vcf, using WP for genotype if present, ' 'otherwise genotype will be generated randomly') @@ -106,15 +108,17 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants], max_col += 1 # If the code got here, we're dealing with a cancer sample elif len(sample_columns) == 1: - raise ValueError(f'Tumor-Normal samples require both a tumor and normal sample ' - f'column in the VCF. {list(sample_columns)}') + _LOG.error(f'Tumor-Normal samples require both a tumor and normal sample ' + f'column in the VCF. {list(sample_columns)}') + sys.exit(1) else: normals = [label for label in sample_columns if 'normal' in label.lower()] tumors = [label for label in sample_columns if 'tumor' in label.lower()] if not (tumors and normals): - raise ValueError("Input VCF for cancer must contain a column with a label containing " - "'tumor' and 'normal' (case-insensitive).") + _LOG.error("Input VCF for cancer must contain a column with a label containing " + "'tumor' and 'normal' (case-insensitive).") + sys.exit(1) """ Note that this cancer code is not yet full implemented """ sample_columns = {normals[0]: 7, tumors[0]: 8} diff --git a/neat/variants/duplication.py b/neat/variants/duplication.py index e5d145e3..b1408fcf 100644 --- a/neat/variants/duplication.py +++ b/neat/variants/duplication.py @@ -2,9 +2,13 @@ A class describing one type of variant for NEAT. """ import numpy as np +import logging +import sys from .base_variant import BaseVariant +_LOG = logging.getLogger(__name__) + class Duplication(BaseVariant): """ @@ -39,7 +43,8 @@ def __init__(self, if self.position2 == position1 - length: self.stream = -1 elif not self.position2 == position1 + length: - raise ValueError("Second position of a dup must = first position - length or first position + length") + _LOG.error("Second position of a dup must = first position - length or first position + length") + sys.exit(1) self.genotype = genotype self.qual_score = qual_score self.is_input = is_input diff --git a/neat/variants/inversion.py b/neat/variants/inversion.py index 1ce91128..ac1d6dd0 100644 --- a/neat/variants/inversion.py +++ b/neat/variants/inversion.py @@ -2,10 +2,12 @@ A class describing one type of variant for NEAT. """ import numpy as np +import logging -from Bio.Seq import Seq from .base_variant import BaseVariant +_LOG = logging.getLogger(__name__) + class Inversion(BaseVariant): """ @@ -31,7 +33,7 @@ def __init__(self, self.length = length self.orientation = orientation if not self.orientation == -1: - raise ValueError("For an inversion, the orientation must be -1") + _LOG.error("For an inversion, the orientation must be -1") self.genotype = genotype self.qual_score = qual_score self.is_input = is_input