Skip to content

Commit

Permalink
Merge pull request #138 from ncsa/feature/fix_output
Browse files Browse the repository at this point in the history
Feature/fix output
  • Loading branch information
joshfactorial authored Jan 20, 2025
2 parents 374f415 + ffa45a1 commit fd00e28
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 37 deletions.
6 changes: 3 additions & 3 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
name: neat
channels:
- bioconda
- conda-forge
- defaults
- conda-forge
- bioconda

dependencies:
- python=3.10.*
- biopython=1.79
Expand Down
2 changes: 1 addition & 1 deletion neat/models/error_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def get_sequencing_errors(
if self.average_error == 0:
return introduced_errors
else:
for i in range(self.read_length):
for i in range(len(quality_scores)):
if rng.random() < quality_score_error_rate[quality_scores[i]]:
error_indexes.append(i)

Expand Down
25 changes: 14 additions & 11 deletions neat/read_simulator/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,18 +177,20 @@ def read_simulator_runner(config: str, output: str):

# TODO check into SeqIO.index_db()
reference_index = SeqIO.index(str(options.reference), "fasta")
reference_keys_with_lens = {key: len(value) for key, value in reference_index.items()}
_LOG.debug("Reference file indexed.")

if _LOG.getEffectiveLevel() < 20:
count = 0
for contig in reference_index:
count += len(reference_index[contig])
for contig in reference_keys_with_lens:
count += reference_keys_with_lens[contig]
_LOG.debug(f"Length of reference: {count / 1_000_000:.2f} Mb")

input_variants_dict = {x: ContigVariants() for x in reference_index}
input_variants_dict = {x: ContigVariants() for x in reference_keys_with_lens}
if options.include_vcf:
_LOG.info(f"Reading input VCF: {options.include_vcf}.")
if options.cancer:
# TODO Check if we need full ref index or just keys and lens
sample_names = parse_input_vcf(
input_variants_dict,
options.include_vcf,
Expand All @@ -202,6 +204,7 @@ def read_simulator_runner(config: str, output: str):
tumor_ind = sample_names['tumor_sample']
normal_ind = sample_names['normal_sample']
else:
# TODO Check if we need full ref index or just keys and lens
sample_names = parse_input_vcf(
input_variants_dict,
options.include_vcf,
Expand All @@ -224,7 +227,7 @@ def read_simulator_runner(config: str, output: str):
target_regions_dict,
discard_regions_dict,
mutation_rate_dict
) = parse_beds(options, reference_index, mut_model.avg_mut_rate)
) = parse_beds(options, reference_keys_with_lens, mut_model.avg_mut_rate)

if any(bed_files):
_LOG.debug("Finished reading input beds.")
Expand All @@ -234,7 +237,7 @@ def read_simulator_runner(config: str, output: str):
if options.produce_bam:
# This is a dictionary that is the list of the contigs and the length of each.
# This information will be needed later to create the bam header.
bam_header = {key: len(reference_index[key]) for key in reference_index}
bam_header = reference_keys_with_lens

# Creates files and sets up objects for files that can be written to as needed.
# Also creates headers for bam and vcf.
Expand All @@ -259,7 +262,7 @@ def read_simulator_runner(config: str, output: str):
"""
_LOG.info("Beginning simulation.")

breaks = find_file_breaks(reference_index)
breaks = find_file_breaks(reference_keys_with_lens)

_LOG.debug("Input reference partitioned for run")

Expand Down Expand Up @@ -345,20 +348,20 @@ def read_simulator_runner(config: str, output: str):

if options.produce_bam:
_LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
contig_list = list(reference_index)
contig_list = list(reference_keys_with_lens)
contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)


def find_file_breaks(reference_index: dict) -> dict:
def find_file_breaks(reference_keys_with_lens: dict) -> dict:
"""
Returns a dictionary with the chromosomes as keys, which is the start of building the chromosome map
:param reference_index: a dictionary with chromosome keys and sequence values
:param reference_keys_with_lens: a dictionary with chromosome keys and sequence values
:return: a dictionary containing the chromosomes as keys and either "all" for values, or a list of indices
"""
partitions = {}
for contig in reference_index.keys():
partitions[contig] = [(0, len(reference_index[contig]))]
for contig in reference_keys_with_lens:
partitions[contig] = [(0, reference_keys_with_lens[contig])]

return partitions
33 changes: 17 additions & 16 deletions neat/read_simulator/utils/bed_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@
_LOG = logging.getLogger(__name__)


def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mutation_rate: float) -> list:
def parse_beds(options: Options, ref_keys_counts: dict, average_mutation_rate: float) -> list:
"""
This single function parses the three possible bed file types for NEAT.
:param options: The options object for this run
:param reference_dict: The reference dict generated by SeqIO for this run
:param ref_keys_counts: A dictionary containing the reference keys and the associated lengths.
:param average_mutation_rate: The average mutation rate from the model or user input for this run
:return: target_dict, discard_dict, mutation_rate_dict
"""
Expand Down Expand Up @@ -57,13 +57,13 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu
for i in range(len(bed_files)):
temp_bed_dict = parse_single_bed(
bed_files[i],
reference_dict,
ref_keys_counts,
processing_structure[i]
)
# If there was a bed this function will fill in any gaps the bed might have missed, so we have a complete map
# of each chromosome. The uniform case in parse_single_bed handles this in cases of no input bed.
if processing_structure[i][2]:
final_dict = fill_out_bed_dict(reference_dict, temp_bed_dict, processing_structure[i])
final_dict = fill_out_bed_dict(ref_keys_counts, temp_bed_dict, processing_structure[i])
else:
final_dict = temp_bed_dict

Expand All @@ -73,7 +73,7 @@ def parse_beds(options: Options, reference_dict: _IndexedSeqFileDict, average_mu


def parse_single_bed(input_bed: str,
reference_dictionary: _IndexedSeqFileDict,
ref_keys_counts: dict,
process_key: tuple[str, float, bool],
) -> dict:
"""
Expand All @@ -82,12 +82,13 @@ def parse_single_bed(input_bed: str,
then we will also import the mutation data.
:param input_bed: A bed file containing the regions of interest
:param reference_dictionary: A list of chromosomes to check, along with their reads and lengths
:param ref_keys_counts: A dict of chromosomes to check (key), along with their lengths (value)
:param process_key: Indicates which bed we are processing and the factor we need to process it.
:return: a dictionary of chromosomes: [(pos1, pos2, factor), etc...]
"""
ret_dict = {x: [] for x in reference_dictionary}
ret_dict = {x: [] for x in ref_keys_counts}
in_bed_only = []
key_pool = list(ref_keys_counts.keys())

# process_key[2] is True when there is an input bed, False otherwise.
if process_key[2]:
Expand All @@ -107,7 +108,7 @@ def parse_single_bed(input_bed: str,
sys.exit(1)
# Bed file chromosome names must match the reference.
try:
assert my_chr in reference_dictionary
assert my_chr in key_pool
except AssertionError:
in_bed_only.append(my_chr)
_LOG.warning(
Expand Down Expand Up @@ -149,7 +150,7 @@ def parse_single_bed(input_bed: str,
ret_dict[my_chr].append((int(pos1), int(pos2), True))

# some validation
in_ref_only = [k for k in reference_dictionary if k not in ret_dict]
in_ref_only = [k for k in ref_keys_counts if k not in ret_dict]
if in_ref_only:
_LOG.warning(f'Warning: Reference contains sequences not found in BED file {input_bed}. '
'These chromosomes will be omitted from the outputs.')
Expand All @@ -161,13 +162,13 @@ def parse_single_bed(input_bed: str,
_LOG.debug(f'Regions ignored: {list(set(in_bed_only))}')

else:
for my_chr in reference_dictionary.keys():
ret_dict[my_chr].append((0, len(reference_dictionary[my_chr]), process_key[1]))
for my_chr in ref_keys_counts:
ret_dict[my_chr].append((0, ref_keys_counts[my_chr], process_key[1]))

return ret_dict


def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
def fill_out_bed_dict(ref_keys_counts: dict,
region_dict: dict | None,
default_factor: tuple[str, bool, bool],
) -> dict:
Expand All @@ -177,9 +178,9 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,
The input to this function is the dict for a single chromosome.
:param ref_dict: The reference dictionary for the run. Needed to calulate max values. We shouldn't need to access
the data it refers to.
:param region_dict: A dict based on the input from the bed file, with keys being all the chronmosomes
:param ref_keys_counts: A dictionary with the keys as the contigs from the reference dictionary, and the values as
the length of the contig. Needed to calculate max values.
:param region_dict: A dict based on the input from the bed file, with keys being all the chromosomes
present in the reference.
:param default_factor: This is a tuple representing the type and any extra information. The extra info is for
mutation rates. If beds were input, then these values come from the bed, else they are set to one value
Expand All @@ -191,7 +192,7 @@ def fill_out_bed_dict(ref_dict: _IndexedSeqFileDict,

for contig in region_dict:
regions = []
max_value = len(ref_dict[contig])
max_value = ref_keys_counts[contig]
start = 0

# The trivial case of no data yet for this contig the region gets filled out as 0 to len(contig_sequence)
Expand Down
5 changes: 3 additions & 2 deletions neat/read_simulator/utils/output_file_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import gzip
import re
import shutil
import time
from struct import pack
import logging
Expand Down Expand Up @@ -101,7 +102,7 @@ def __init__(self,
files_to_write.extend(self.fastq_fns)
elif self.write_fastq:
self.fastq1_fn = options.output.parent / f'{options.output.stem}.fastq.gz'
self.fastq2_fn = options.output.parent / "dummy.fastq.gz"
self.fastq2_fn = self.temporary_dir / "dummy.fastq.gz"
self.fastq_fns = [self.fastq1_fn, self.fastq2_fn]
files_to_write.extend(self.fastq_fns)
if self.write_bam:
Expand All @@ -115,7 +116,7 @@ def __init__(self,
self.files_to_write = files_to_write

# Create files as applicable
for file in [x for x in self.files_to_write if x != "dummy.fastq.gz"]:
for file in [x for x in self.files_to_write if x.name != "dummy.fastq.gz"]:
validate_output_path(file, True, options.overwrite_output)

mode = 'xt'
Expand Down
10 changes: 6 additions & 4 deletions neat/read_simulator/utils/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,10 +453,12 @@ def make_cigar(self):
cig_string, cig_count, curr_char, cig_length = self.align_seqs()
if cig_length < self.run_read_length:
_LOG.warning("Poor alignment, trying reversed")
cig_string2, cig_count2, curr_char2, cig_length = self.align_seqs()
if cig_length < self.run_read_length:
_LOG.error("Alignment still not working")
sys.exit(1)
cig_string2, cig_count2, curr_char2, cig_length2 = self.align_seqs()
if cig_length2 < cig_length:
cig_length = cig_length2
while cig_length < self.run_read_length:
cig_string += "M"
cig_length += 1

# append the final section as we return
return cig_string + str(cig_count) + curr_char
Expand Down

0 comments on commit fd00e28

Please sign in to comment.