Skip to content

Commit

Permalink
fixed bug causing crash with >1 unplaced sequence
Browse files Browse the repository at this point in the history
  • Loading branch information
Alaina Shumate committed Jun 29, 2020
1 parent d6e8169 commit 86f2b95
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 26 deletions.
14 changes: 9 additions & 5 deletions align_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ def align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
threads_per_alignment = max(1, math.floor(processes / len(ref_chroms)))
sam_files = []
pool = Pool(processes)
func = partial(align_subset, ref_chroms, target_chroms, threads_per_alignment, target_fasta, reference_fasta, minimap2_path, inter_files, map, genome_size)
for result in pool.imap_unordered(func, np.arange(0,len(ref_chroms))):
func = partial(align_subset, ref_chroms, target_chroms, threads_per_alignment, target_fasta, reference_fasta, minimap2_path, inter_files, map, genome_size, search_type)
for result in pool.imap_unordered(func, np.arange(0,len(target_chroms))):
sam_files.append(result)
pool.close()
pool.join()
Expand All @@ -45,13 +45,17 @@ def split_target_sequence(target_chroms, target_fasta_name, inter_files):



def align_subset(ref_chroms, target_chroms, threads, target_fasta_name, reference_fasta_name, minimap2_path, inter_files, map, genome_size, index):
if ref_chroms[index] == reference_fasta_name:
def align_subset(ref_chroms, target_chroms, threads, target_fasta_name, reference_fasta_name, minimap2_path, inter_files, map, genome_size, liftover_type, index):
if ref_chroms[index] == reference_fasta_name and (liftover_type == "chrm_by_chrm" or liftover_type == "copies"):
features_name = 'reference_all'
elif liftover_type == "unmapped":
features_name = "unmapped_to_expected_chrom"
elif liftover_type == "unplaced":
features_name = "unplaced"
else:
features_name = ref_chroms[index]
features_file = inter_files+ "/" + features_name + "_genes.fa"
if target_chroms[index] == target_fasta_name:
if liftover_type != "chrm_by_chrm" or target_chroms[0] == target_fasta_name:
target_file = target_fasta_name
out_file_target = "target_all"
else:
Expand Down
16 changes: 10 additions & 6 deletions extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
import os


def extract_features_to_lift(g_arg, db_arg, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files):
def extract_features_to_lift(g_arg, db_arg, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files, liftover_type):
print("extracting features")
if os.path.exists(inter_files) is False:
os.mkdir(inter_files)
feature_db, feature_db_name = create_feature_db_connections(g_arg, db_arg, infer_transcripts, infer_genes)
parent_dict, child_dict, intermediate_dict, parent_order= seperate_parents_and_children(feature_db)
get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files)
get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
return parent_dict, child_dict, intermediate_dict, feature_db, parent_order


Expand Down Expand Up @@ -107,21 +107,25 @@ def has_child(feature, feature_db):



def get_gene_sequences(parent_dict, ref_chroms, reference_fasta_name, processes, inter_files):
def get_gene_sequences(parent_dict, ref_chroms, reference_fasta_name, processes, inter_files, liftover_type):
pool = Pool(processes)
Faidx(reference_fasta_name)
func = partial(get_gene_sequences_subset, parent_dict, reference_fasta_name, inter_files)
func = partial(get_gene_sequences_subset, parent_dict, reference_fasta_name, inter_files, liftover_type)
for result in pool.imap_unordered(func, ref_chroms):
continue
pool.close()
pool.join()
return


def get_gene_sequences_subset(parent_dict, reference_fasta_name, inter_files, chrom_name):
def get_gene_sequences_subset(parent_dict, reference_fasta_name, inter_files, liftover_type, chrom_name):
reference_fasta = Fasta(reference_fasta_name)
if chrom_name == reference_fasta_name:
if chrom_name == reference_fasta_name and (liftover_type == "chrm_by_chrm" or liftover_type == "copies"):
fasta_out_name = "reference_all"
elif liftover_type == "unmapped":
fasta_out_name = "unmapped_to_expected_chrom"
elif liftover_type == "unplaced":
fasta_out_name = "unplaced"
else:
fasta_out_name = chrom_name
fasta_out = open(inter_files+"/"+fasta_out_name + "_genes.fa", 'w')
Expand Down
6 changes: 3 additions & 3 deletions fix_overlapping_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

def fix_incorrectly_overlapping_features(all_lifted_features, features_to_check, parent_dict, all_aligned_segs,
unmapped_features, threshold, intermediate_dict, children_dict, feature_db,
original_parent_order,seq_id_threshold, liftover_type):
original_parent_order,seq_id_threshold):
features_to_remap = check_homologues(all_lifted_features, features_to_check, parent_dict, original_parent_order)
resolve_overlapping_homologues(all_aligned_segs, all_lifted_features, features_to_remap, unmapped_features,
threshold, parent_dict, intermediate_dict,
children_dict, feature_db,original_parent_order, seq_id_threshold, liftover_type)
children_dict, feature_db,original_parent_order, seq_id_threshold)



Expand Down Expand Up @@ -108,7 +108,7 @@ def find_feature_to_remap(feature, overlap_feature, original_parent_order, new_p

def resolve_overlapping_homologues(all_aligned_segs, lifted_feature_list, features_to_remap, unmapped_features,
threshold, parent_dict, intermediate_dict, children_dict,
feature_db,original_parent_order,seq_id_threshold, liftover_type):
feature_db,original_parent_order,seq_id_threshold):
all_overlapping_features = {}
while len(features_to_remap) > 0:
features_to_check = {}
Expand Down
2 changes: 2 additions & 0 deletions liftoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
def parse_args():
parser = argparse.ArgumentParser(description='Lift features from one genome assembly to another')
group = parser.add_mutually_exclusive_group(required=True)
parser.add_argument("-V", "--version", help="show program version", action='version', version="v1.1.1" )
parser.add_argument('-t', required=True, help="target fasta genome to lift genes to", metavar="<target.fasta>")
parser.add_argument('-r', required=True, help = "reference fasta genome to lift genes from", metavar="<reference.fasta>")
group.add_argument('-g', help="annotation file to lift over in gff or gtf format", metavar="<ref_annotation.gff>")
Expand Down Expand Up @@ -64,6 +65,7 @@ def main():
ref_chroms=[reference_fasta]
target_chroms= [target_fasta]


#lift genes
lifted_feature_list = {}
unmapped_features = []
Expand Down
28 changes: 16 additions & 12 deletions liftover_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
def lift_original_annotation(gff, target_fasta, reference_fasta, ref_chroms, target_chroms, processes, db,
lifted_feature_list, unmapped_features, infer_transcripts, infer_genes, cov_threshold, seq_threshold,
minimap2_path, inter_files):
liftover_type = "chrm_by_chrm"
if target_chroms[0] == target_fasta:
cov_threshold, seq_threshold = 0,0
parent_dict, children_dict, intermediate_dict, feature_db, original_parent_order = extract_features.extract_features_to_lift(
gff, db, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files)
gff, db, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files, liftover_type)
aligned_segments = align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
parent_dict, children_dict, "chrm_by_chrm",
parent_dict, children_dict, liftover_type,
unmapped_features, reference_fasta, minimap2_path, inter_files, True)

print("lifting features")
Expand All @@ -22,7 +23,7 @@ def lift_original_annotation(gff, target_fasta, reference_fasta, ref_chroms, tar
fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
aligned_segments, unmapped_features,
cov_threshold, intermediate_dict, children_dict,
feature_db, original_parent_order, seq_threshold, "chrm_by_chrm")
feature_db, original_parent_order, seq_threshold)
return feature_db, parent_dict, intermediate_dict, children_dict, original_parent_order


Expand All @@ -31,43 +32,45 @@ def map_unmapped_genes_agaisnt_all(unmapped_features, target_fasta, reference_fa
children_dict, parent_order,minimap2_path,inter_files):
liftoff_utils.clear_scores(lifted_feature_list, parent_dict)
unmapped_dict = {}
liftover_type = "unmapped"
for feature in unmapped_features:
unmapped_dict[feature.id]=feature
extract_features.get_gene_sequences(unmapped_dict, ref_chroms, reference_fasta, processes, inter_files)
extract_features.get_gene_sequences(unmapped_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
unmapped_features = []
aligned_segments=align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
unmapped_dict, children_dict, "missing", unmapped_features, reference_fasta,
unmapped_dict, children_dict, liftover_type, unmapped_features, reference_fasta,
minimap2_path,inter_files, True)
print("lifting features")
lift_features.lift_all_features(aligned_segments, {}, 0.0, feature_db, unmapped_dict, children_dict,
intermediate_dict, unmapped_features, lifted_feature_list, 0.0)
fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
aligned_segments, unmapped_features, 0.0,
intermediate_dict, children_dict, feature_db,
parent_order, 0.0, "missing")
parent_order, 0.0)
return unmapped_features


def map_unplaced_genes(unmapped_features, target_fasta, reference_fasta, ref_chroms, target_chroms,
processes, lifted_feature_list, feature_db, parent_dict, intermediate_dict,
children_dict, parent_order,minimap2_path,inter_files):
liftoff_utils.clear_scores(lifted_feature_list, parent_dict)
liftover_type = "unplaced"
unplaced_dict = {}
for feature_name in parent_dict:
feature = parent_dict[feature_name]
if feature.seqid in ref_chroms:
unplaced_dict[feature.id] = feature
extract_features.get_gene_sequences(unplaced_dict, ref_chroms, reference_fasta, processes, inter_files)
extract_features.get_gene_sequences(unplaced_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
aligned_segments=align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
unplaced_dict, children_dict, "unplaced", unmapped_features,
unplaced_dict, children_dict, liftover_type, unmapped_features,
reference_fasta,minimap2_path,inter_files, True)
print("lifting features")
lift_features.lift_all_features(aligned_segments, {}, 0.0, feature_db, unplaced_dict, children_dict,
intermediate_dict, unmapped_features, lifted_feature_list, 0.0)

fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
aligned_segments, unmapped_features, 0.0,
intermediate_dict, children_dict, feature_db, parent_order, 0.0, "unplaced")
intermediate_dict, children_dict, feature_db, parent_order, 0.0)



Expand All @@ -76,9 +79,10 @@ def map_extra_copies(target_fasta, reference_fasta, ref_chroms, target_chroms, p
seq_threshold,minimap2_path,inter_files, remap):
liftoff_utils.clear_scores(lifted_feature_list, parent_dict)
unmapped_features = []
extract_features.get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files)
liftover_type = "copies"
extract_features.get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
aligned_segments=align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
parent_dict, children_dict, "copies", unmapped_features, reference_fasta,
parent_dict, children_dict, liftover_type, unmapped_features, reference_fasta,
minimap2_path,inter_files, remap
)

Expand All @@ -87,5 +91,5 @@ def map_extra_copies(target_fasta, reference_fasta, ref_chroms, target_chroms, p
intermediate_dict, unmapped_features, lifted_feature_list, seq_threshold)
fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
aligned_segments, unmapped_features, 0.0,
intermediate_dict, children_dict, feature_db, parent_order, seq_threshold, "copies")
intermediate_dict, children_dict, feature_db, parent_order, seq_threshold)

0 comments on commit 86f2b95

Please sign in to comment.