Cleaning up errors, adding restrictions

ncsa · May 31, 2024 · c3cb0ed · c3cb0ed
1 parent c62f156
commit c3cb0ed
Show file tree

Hide file tree

Showing 14 changed files with 98 additions and 64 deletions.
diff --git a/neat/common/io.py b/neat/common/io.py
@@ -14,11 +14,13 @@
 import gzip
 import logging
 import os
+import sys
+
 from pathlib import Path
 from typing import Callable, Iterator, TextIO
 from Bio import bgzf
 
-log = logging.getLogger(__name__)
+_LOG = logging.getLogger(__name__)
 
 
 def is_compressed(file: str | Path) -> bool:
@@ -85,9 +87,9 @@ def open_output(path: str | Path, mode: str = 'wt') -> Iterator[TextIO]:
 
     Raises
     ------
-    FileExistsError
+    3 = FileExistsError
         Raised if the output file already exists.
-    PermissionError
+    11 = PermissionError
         Raised if the calling process does not have adequate access rights to
         write to the output file.
     """
@@ -100,7 +102,8 @@ def open_output(path: str | Path, mode: str = 'wt') -> Iterator[TextIO]:
         # bgzf is old code and doesn't use "xt" mode, annoyingly. This manual check should suffice.
         if mode == "xt":
             if output_path.exists():
-                raise FileExistsError(f"file '{path}' already exists")
+                _LOG.error(f"file '{path}' already exists")
+                sys.exit(3)
             else:
                 mode = "wt"
         open_ = bgzf.open
@@ -125,26 +128,28 @@ def validate_input_path(path: str | Path):
 
     Raises
     ------
-    FileNotFoundError
+    5 = FileNotFoundError
         Raised if the input file does not exist or is not a file.
-    RuntimeError
+    7 = RuntimeError
         Raised if the input file is empty.
-    PermissionError
+    9 = PermissionError
         Raised if the calling process has no read access to the file.
     """
     path = Path(path)
     mssg = ''
 
     if not path.is_file():
         mssg += f"Path '{path}' does not exist or not a file"
-        raise FileNotFoundError(mssg)
+        _LOG.error(mssg)
+        sys.exit(5)
     stats = path.stat()
     if stats.st_size == 0:
         mssg += f"File '{path}' is empty"
-        raise RuntimeError(mssg)
+        _LOG.error(mssg)
+        sys.exit(7)
     if not os.access(path, os.R_OK):
         mssg += f"cannot read from '{path}': access denied"
-        raise PermissionError(mssg)
+        _LOG.error(9)
 
 
 def validate_output_path(path: str | Path, is_file: bool = True, overwrite: bool = False):
@@ -161,18 +166,20 @@ def validate_output_path(path: str | Path, is_file: bool = True, overwrite: bool
 
     Raises
     ------
-    FileExistsError
+    3 = FileExistsError
         Raised if path is a file and already exists.
-    PermissionError
+    11 = PermissionError
         Raised if the calling process does not have adequate access rights to.
     """
     path = Path(path)
     if is_file:
         if path.is_file() and not overwrite:
-            raise FileExistsError(f"file '{path}' already exists")
+            _LOG.error(f"file '{path}' already exists")
+            sys.exit(3)
     else:
         if path.is_dir():
             if not os.access(path, os.W_OK):
-                raise PermissionError(f"cannot write to '{path}', access denied")
+                _LOG.error(f"cannot write to '{path}', access denied")
+                sys.exit(11)
         else:
             path.parent.mkdir(parents=True, exist_ok=True)
diff --git a/neat/gen_mut_model/runner.py b/neat/gen_mut_model/runner.py
@@ -5,11 +5,11 @@
 import os.path
 import pathlib
 import pickle
+import sys
 
 import numpy as np
 from Bio import SeqIO
 
-
 from pathlib import Path
 import logging
 
@@ -81,7 +81,7 @@ def runner(reference_index,
 
     if len(ignore) == len(reference_index):
         _LOG.error("No valid human chromosome names detected. Check contig names reference.")
-        raise ValueError
+        sys.exit(1)
 
     # Pre-parsing to find all the matching chromosomes between ref and vcf
     _LOG.info('Processing VCF file...')
@@ -91,7 +91,7 @@ def runner(reference_index,
 
     if not matching_variants or not matching_chromosomes:
         _LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.")
-        raise ValueError
+        sys.exit(1)
 
     trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index,
                                                               bed,
@@ -100,7 +100,7 @@ def runner(reference_index,
 
     if not trinuc_ref_count:
         _LOG.error("No valid trinucleotides detected in reference.")
-        raise ValueError
+        sys.exit(1)
 
     """
     Collect and analyze the data in the VCF file
@@ -155,7 +155,7 @@ def runner(reference_index,
 
                 else:
                     _LOG.error(f'Ref allele in variant call does not match reference: {variant}')
-                    raise ValueError
+                    sys.exit(1)
 
             else:
                 indel_len = len(variant[3]) - len(variant[2])
@@ -214,7 +214,7 @@ def runner(reference_index,
     if not total_var:
         _LOG.error('Error: No valid variants were found, model could not be created. '
                    'Check that names are compatible.')
-        raise ValueError
+        sys.exit(1)
 
     # COMPUTE PROBABILITIES
 

diff --git a/neat/gen_mut_model/utils.py b/neat/gen_mut_model/utils.py
@@ -315,7 +315,7 @@ def convert_trinuc_transition_matrix(trans_probs):
             _LOG.error("Repeat Trinuc detected.")
             _LOG.debug(f'Error on {ALL_CONTEXTS[context]}: '
                        f'{ALLOWED_NUCL[mutation_ref]} -> {ALLOWED_NUCL[mutation_alt]}')
-            raise ValueError
+            sys.exit(1)
 
     return ret_matrix
 

diff --git a/neat/model_fragment_lengths/runner.py b/neat/model_fragment_lengths/runner.py
@@ -6,6 +6,7 @@
 import pickle
 import numpy as np
 import logging
+import sys
 
 from pathlib import Path
 
@@ -44,21 +45,21 @@ def compute_fraglen_runner(file: str | Path, filter_minreads: int, output: str |
     _LOG.info("Counting fragments")
     all_tlens = count_frags(input_file)
     if not all_tlens:
-        raise ValueError("No valid template lengths in sam file_list.")
+        _LOG.error("No valid template lengths in sam file_list.")
+        sys.exit(1)
 
     _LOG.info("Filtering fragments")
     filtered_lengths = filter_lengths(all_tlens, filter_minreads)
 
     if not filtered_lengths:
-        raise ValueError("No data passed the filter, nothing to calculate. Try adjusting the filter settings.")
+        _LOG.error("No data passed the filter, nothing to calculate. Try adjusting the filter settings.")
+        sys.exit(1)
 
     _LOG.info("Building model")
     st_dev = float(np.std(filtered_lengths))
     mean = float(np.mean(filtered_lengths))
-    max_tlen = max(filtered_lengths)
-    min_tlen = min(filtered_lengths)
 
-    model = FragmentLengthModel(st_dev, mean, max_tlen, min_tlen)
+    model = FragmentLengthModel(mean, st_dev)
     _LOG.info(f'Saving model: {output}')
     with gzip.open(output, 'w+') as outfile:
         pickle.dump(model, outfile)

diff --git a/neat/model_sequencing_error/utils.py b/neat/model_sequencing_error/utils.py
@@ -6,6 +6,7 @@
 import numpy as np
 # TODO implement plotting
 import matplotlib.pyplot as plt
+import sys
 
 from scipy.stats import mode
 from ..common import open_input
@@ -31,7 +32,8 @@ def convert_quality_string(qual_str: str, offset: int):
         try:
             ret_list.append(ord(qual_str[i]) - offset)
         except ValueError:
-            raise ValueError("improperly formatted fastq file")
+            _LOG.error("improperly formatted fastq file")
+            sys.exit(1)
 
     return ret_list
 
@@ -45,7 +47,8 @@ def expand_counts(count_array: list, scores: list):
     :return np.ndarray: a one-dimensional array reflecting the expanded count
     """
     if len(count_array) != len(scores):
-        raise ValueError("Count array and scores have different lengths.")
+        _LOG.error("Count array and scores have different lengths.")
+        sys.exit(1)
 
     ret_list = []
     for i in range(len(count_array)):
@@ -89,7 +92,8 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
         if readlen_mode.count < (0.5 * len(readlens)):
             _LOG.warning("Highly variable read lengths detected. Results may be less than ideal.")
         if readlen_mode.count < 20:
-            raise ValueError(f"Dataset is too scarce or inconsistent to make a model. Try a different input.")
+            _LOG.error(f"Dataset is too scarce or inconsistent to make a model. Try a different input.")
+            sys.exit(1)
         read_length = int(readlen_mode.mode)
 
     else:

diff --git a/neat/models/models.py b/neat/models/models.py
@@ -7,6 +7,7 @@
 import re
 import logging
 import abc
+import sys
 
 from numpy.random import Generator
 from Bio.Seq import Seq
@@ -235,7 +236,8 @@ def __init__(self,
         self.homozygous_freq = homozygous_freq
 
         if not np.isclose(sum(variant_probs.values()), 1):
-            raise ValueError("Probabilities do not add up to 1.")
+            _LOG.error("Probabilities do not add up to 1.")
+            sys.exit(1)
 
         self.variant_probs = variant_probs
         self.transition_matrix = transition_matrix
@@ -558,12 +560,10 @@ def __init__(self,
         self.rng = rng
 
     def generate_fragments(self,
-                           total_length: int,
                            number_of_fragments: int) -> list:
         """
         Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set
 
-        :param total_length: Length of the reference segment we are covering.
         :param number_of_fragments: The number of fragments needed.
         :return: A list of fragment random fragment lengths sampled from the model.
         """

diff --git a/neat/read_simulator/utils/bed_func.py b/neat/read_simulator/utils/bed_func.py
@@ -4,6 +4,7 @@
 """
 import logging
 import pathlib
+import sys
 
 from Bio.File import _IndexedSeqFileDict
 
@@ -103,7 +104,7 @@ def parse_single_bed(input_bed: str,
                         [my_chr, pos1, pos2] = line_list[:3]
                     except ValueError:
                         _LOG.error(f"Improperly formatted bed file line {line}")
-                        raise
+                        sys.exit(1)
                     # Bed file chromosome names must match the reference.
                     try:
                         assert my_chr in reference_dictionary
@@ -122,7 +123,7 @@ def parse_single_bed(input_bed: str,
                             _LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})")
                             _LOG.error('4th column of mutation rate bed must be a semicolon list of key, value '
                                        'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".')
-                            raise ValueError
+                            sys.exit(1)
 
                         # +9 because that's len('mut_rate='). Whatever is that should be our mutation rate.
                         mut_rate = line_list[3][index + 9:]
@@ -133,7 +134,7 @@ def parse_single_bed(input_bed: str,
                             _LOG.error(f"Invalid mutation rate: {my_chr}: ({pos1}, {pos2})")
                             _LOG.error('4th column of mutation rate bed must be a semicolon list of key, value '
                                        'pairs, with one key being mut_rate, e.g., "foo=bar;mut_rate=0.001;do=re".')
-                            raise
+                            sys.exit(1)
 
                         if mut_rate > 0.3:
                             _LOG.warning("Found a mutation rate > 0.3. This is unusual.")

diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py
@@ -43,9 +43,9 @@ def cover_dataset(
     number_reads = ceil((span_length * options.coverage) / options.read_len)
 
     # We use fragments to model the DNA
-    fragment_pool = fragment_model.generate_fragments(span_length, number_reads * 3)
+    fragment_pool = fragment_model.generate_fragments(number_reads * 3)
 
-    # step 1: Divide the span up into segments drawn froam the fragment pool. Assign reads based on that.
+    # step 1: Divide the span up into segments drawn from the fragment pool. Assign reads based on that.
     # step 2: repeat above until number of reads exceeds number_reads * 1.5
     # step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads
     read_count = 0

diff --git a/neat/read_simulator/utils/generate_variants.py b/neat/read_simulator/utils/generate_variants.py
@@ -8,6 +8,7 @@
 import time
 import numpy as np
 import re
+import sys
 
 from Bio import SeqRecord
 from numpy.random import Generator
@@ -207,7 +208,8 @@ def generate_variants(reference: SeqRecord,
                 temp_variant = mutation_model.generate_snv(trinuc, location)
 
             else:
-                raise ValueError(f"Attempting to create an unsupported variant: {variant_type}")
+                _LOG.error(f"Attempting to create an unsupported variant: {variant_type}")
+                sys.exit(1)
 
             # pick which ploid is mutated
             temp_variant.genotype = pick_ploids(options.ploidy, mutation_model.homozygous_freq, 1, options.rng)
@@ -226,7 +228,8 @@ def generate_variants(reference: SeqRecord,
                         # Here's a counter to make sure we're not getting stuck on a single location
                         debug += 1
                         if debug > 1000000:
-                            raise RuntimeError("Check this if, as it may be causing an infinite loop.")
+                            _LOG.error("Check this if, as it may be causing an infinite loop.")
+                            sys.exit(999)
                         # No suitable place to put this, so we skip.
                         continue
                     # This sets up a probability array with weights 1 for open spots (x==0) and 0 elsewhere