Skip to content

Commit

Permalink
Fixing a bug in gen_mut_model where something got deleted
Browse files Browse the repository at this point in the history
  • Loading branch information
joshfactorial committed Jun 8, 2024
1 parent a305799 commit fc2aced
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 4 deletions.
8 changes: 7 additions & 1 deletion neat/cli/commands/gen_mut_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ def add_arguments(self, parser: argparse.ArgumentParser):
help="Includes a list of common variants, if you want to visualize "
"common variants with plot_mut_model.py.")

parser.add_argument('--save-trinuc',
action='store_true',
required=False,
default=False,
help='Saves trinucleotide counts to a separate file')

parser.add_argument('--overwrite',
action='store_true',
required=False,
Expand All @@ -83,5 +89,5 @@ def execute(self, arguments: argparse.Namespace):
:param arguments: The namespace with arguments and their values.
"""
compute_mut_runner(arguments.reference, arguments.mutations, arguments.bed, arguments.outcounts,
arguments.show_trinuc, arguments.human_sample,
arguments.show_trinuc, arguments.save_trinuc, arguments.human_sample,
arguments.skip_common, arguments.output, arguments.overwrite)
15 changes: 14 additions & 1 deletion neat/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,19 @@ def generate_fragments(self,
:param number_of_fragments: The number of fragments needed.
:return: A list of fragment random fragment lengths sampled from the model.
"""
# Due to natural variation in genome lengths, it's difficult to harden this code against all the possible
# inputs. In order to harden this code against infinite loops caused by fragment means that are the wrong size
# for the genome, we introduce a small number of standard fragments, to ensure enough variability that our code
# can complete. A few small fragments should increase the variability of the set. Most of these are too small
# to create a read, so they become spacers instead.
extra_fragments = [10, 11, 12, 13, 14, 28, 31]
# generates a distribution, assuming normality, then rounds the result and converts to ints
dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
return [abs(x) for x in dist]
dist = [abs(x) for x in dist]
# We'll append enough fragments to pad out the distribution and add variability. Don't know if the cost of
# doing this is worth it though.
number_extra = int(number_of_fragments * 0.1)
for i in range(number_extra):
dist.append(extra_fragments[i % len(extra_fragments)])
self.rng.shuffle(dist) # this shuffle mixes extra fragments in.
return dist[:number_of_fragments]
34 changes: 32 additions & 2 deletions neat/read_simulator/utils/generate_reads.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import time
import pickle
import sys

from math import ceil
from pathlib import Path
Expand Down Expand Up @@ -37,6 +38,10 @@ def cover_dataset(
"""

final_reads = set()
# sanity check
if span_length/options.fragment_mean < 5:
_LOG.warning("The fragment mean is relatively large compared to the chromosome size. You may need to increase "
"standard deviation, or decrease fragment mean, if NEAT cannot complete successfully.")
# precompute how many reads we want
# The numerator is the total number of base pair calls needed.
# Divide that by read length gives the number of reads needed
Expand All @@ -49,9 +54,23 @@ def cover_dataset(
# step 2: repeat above until number of reads exceeds number_reads * 1.5
# step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads
read_count = 0
loop_count = 0
while read_count <= number_reads:
start = 0
loop_count += 1
# if loop_count > options.coverage * 100:
# _LOG.error("The selected fragment mean and standard deviation are causing NEAT to get stuck.")
# _LOG.error("Please try adjusting fragment mean or standard deviation to see if that fixes the issue.")
# _LOG.error(f"parameters:\n"
# f"chromosome length: {span_length}\n"
# f"read length: {options.read_len}\n"
# f"fragment mean: {options.fragment_mean}\n"
# f"fragment standard deviation: {options.fragment_st_dev}")
# sys.exit(1)
temp_fragments = []
# trying to get enough variability to harden NEAT against edge cases.
if loop_count % 10 == 0:
fragment_model.rng.shuffle(fragment_pool)
# Breaking the genome into fragments
while start < span_length:
# We take the first element and put it back on the end to create an endless pool of fragments to draw from
Expand All @@ -61,9 +80,9 @@ def cover_dataset(
if end - start < options.read_len:
# add some random flavor to try to keep it from falling into a loop
if options.rng.normal() < 0.5:
fragment_pool.insert(fragment, len(fragment_pool)//2)
fragment_pool.insert(len(fragment_pool)//2, fragment)
else:
fragment_pool.insert(fragment, len(fragment_pool) - 3)
fragment_pool.insert(len(fragment_pool) - 3, fragment)
else:
fragment_pool.append(fragment)
temp_fragments.append((start, end))
Expand All @@ -87,6 +106,16 @@ def cover_dataset(
# where start and end are ints with end > start. Reads can overlap, so right_start < left_end
# is possible, but the reads cannot extend past each other, so right_start < left_start and
# left_end > right_end are not possible.

# sanity check that we haven't created an unrealistic read:
insert_size = read2[0] - read1[1]
if insert_size > 2 * options.read_len:
# Probably an outlier fragment length. We'll just pitch one of the reads
# and consider it lost to the ages.
if fragment_model.rng.choice((True, False)):
read1 = (0, 0)
else:
read2 = (0, 0)
read = read1 + read2
if read not in final_reads:
final_reads.add(read)
Expand All @@ -97,6 +126,7 @@ def cover_dataset(
# Now we shuffle them to add some randomness
options.rng.shuffle(final_reads)
# And only return the number we needed
_LOG.debug(f"Coverage required {loop_count} loops")
if options.paired_ended:
# Since each read is actually 2 reads, we only need to return half as many. But to cheat a few extra, we scale
# that down slightly to 1.85 reads per read. This factor is arbitrary and may even be a function. But let's see
Expand Down

0 comments on commit fc2aced

Please sign in to comment.