fixed bug causing crash with >1 unplaced sequence

agshumate · Jun 29, 2020 · 86f2b95 · 86f2b95
1 parent d6e8169
commit 86f2b95
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 26 deletions.
diff --git a/align_features.py b/align_features.py
@@ -20,8 +20,8 @@ def align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
     threads_per_alignment = max(1, math.floor(processes / len(ref_chroms)))
     sam_files = []
     pool = Pool(processes)
-    func = partial(align_subset, ref_chroms, target_chroms, threads_per_alignment, target_fasta, reference_fasta, minimap2_path, inter_files, map, genome_size)
-    for result in pool.imap_unordered(func, np.arange(0,len(ref_chroms))):
+    func = partial(align_subset, ref_chroms, target_chroms, threads_per_alignment, target_fasta, reference_fasta, minimap2_path, inter_files, map, genome_size, search_type)
+    for result in pool.imap_unordered(func, np.arange(0,len(target_chroms))):
         sam_files.append(result)
     pool.close()
     pool.join()
@@ -45,13 +45,17 @@ def split_target_sequence(target_chroms, target_fasta_name, inter_files):
 
 
 
-def align_subset(ref_chroms, target_chroms, threads, target_fasta_name, reference_fasta_name, minimap2_path, inter_files, map, genome_size, index):
-    if ref_chroms[index] == reference_fasta_name:
+def align_subset(ref_chroms, target_chroms, threads, target_fasta_name, reference_fasta_name, minimap2_path, inter_files, map, genome_size, liftover_type, index):
+    if ref_chroms[index] == reference_fasta_name and (liftover_type == "chrm_by_chrm" or liftover_type == "copies"):
         features_name = 'reference_all'
+    elif liftover_type == "unmapped":
+        features_name = "unmapped_to_expected_chrom"
+    elif liftover_type == "unplaced":
+        features_name = "unplaced"
     else:
         features_name = ref_chroms[index]
     features_file =  inter_files+ "/" + features_name + "_genes.fa"
-    if target_chroms[index] == target_fasta_name:
+    if liftover_type != "chrm_by_chrm" or target_chroms[0] == target_fasta_name:
         target_file = target_fasta_name
         out_file_target = "target_all"
     else:

diff --git a/extract_features.py b/extract_features.py
@@ -6,13 +6,13 @@
 import os
 
 
-def extract_features_to_lift(g_arg, db_arg, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files):
+def extract_features_to_lift(g_arg, db_arg, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files, liftover_type):
     print("extracting features")
     if os.path.exists(inter_files) is False:
         os.mkdir(inter_files)
     feature_db, feature_db_name = create_feature_db_connections(g_arg, db_arg, infer_transcripts, infer_genes)
     parent_dict, child_dict, intermediate_dict, parent_order= seperate_parents_and_children(feature_db)
-    get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files)
+    get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
     return parent_dict, child_dict, intermediate_dict, feature_db, parent_order
 
 
@@ -107,21 +107,25 @@ def has_child(feature, feature_db):
 
 
 
-def get_gene_sequences(parent_dict, ref_chroms, reference_fasta_name, processes, inter_files):
+def get_gene_sequences(parent_dict, ref_chroms, reference_fasta_name, processes, inter_files, liftover_type):
     pool = Pool(processes)
     Faidx(reference_fasta_name)
-    func = partial(get_gene_sequences_subset, parent_dict, reference_fasta_name, inter_files)
+    func = partial(get_gene_sequences_subset, parent_dict, reference_fasta_name, inter_files, liftover_type)
     for result in pool.imap_unordered(func, ref_chroms):
         continue
     pool.close()
     pool.join()
     return
 
 
-def get_gene_sequences_subset(parent_dict, reference_fasta_name,  inter_files, chrom_name):
+def get_gene_sequences_subset(parent_dict, reference_fasta_name,  inter_files, liftover_type, chrom_name):
     reference_fasta = Fasta(reference_fasta_name)
-    if chrom_name  == reference_fasta_name:
+    if chrom_name  == reference_fasta_name and (liftover_type == "chrm_by_chrm" or liftover_type == "copies"):
         fasta_out_name = "reference_all"
+    elif liftover_type == "unmapped":
+        fasta_out_name = "unmapped_to_expected_chrom"
+    elif liftover_type == "unplaced":
+        fasta_out_name = "unplaced"
     else:
         fasta_out_name = chrom_name
     fasta_out = open(inter_files+"/"+fasta_out_name + "_genes.fa", 'w')

diff --git a/fix_overlapping_features.py b/fix_overlapping_features.py
@@ -4,11 +4,11 @@
 
 def fix_incorrectly_overlapping_features(all_lifted_features, features_to_check, parent_dict, all_aligned_segs,
                                          unmapped_features, threshold, intermediate_dict, children_dict, feature_db,
-                                         original_parent_order,seq_id_threshold, liftover_type):
+                                         original_parent_order,seq_id_threshold):
     features_to_remap = check_homologues(all_lifted_features, features_to_check, parent_dict, original_parent_order)
     resolve_overlapping_homologues(all_aligned_segs, all_lifted_features, features_to_remap, unmapped_features,
                                 threshold,  parent_dict, intermediate_dict,
-                                   children_dict, feature_db,original_parent_order, seq_id_threshold, liftover_type)
+                                   children_dict, feature_db,original_parent_order, seq_id_threshold)
 
 
 
@@ -108,7 +108,7 @@ def find_feature_to_remap(feature, overlap_feature, original_parent_order, new_p
 
 def resolve_overlapping_homologues(all_aligned_segs, lifted_feature_list, features_to_remap,  unmapped_features,
                                    threshold,  parent_dict, intermediate_dict, children_dict,
-                                   feature_db,original_parent_order,seq_id_threshold, liftover_type):
+                                   feature_db,original_parent_order,seq_id_threshold):
     all_overlapping_features = {}
     while len(features_to_remap) > 0:
         features_to_check = {}

diff --git a/liftoff.py b/liftoff.py
@@ -7,6 +7,7 @@
 def parse_args():
     parser = argparse.ArgumentParser(description='Lift features from one genome assembly to another')
     group = parser.add_mutually_exclusive_group(required=True)
+    parser.add_argument("-V", "--version", help="show program version", action='version', version="v1.1.1" )
     parser.add_argument('-t', required=True, help="target fasta genome to lift genes to", metavar="<target.fasta>")
     parser.add_argument('-r', required=True, help = "reference fasta genome to lift genes from", metavar="<reference.fasta>")
     group.add_argument('-g', help="annotation file to lift over in gff or gtf format", metavar="<ref_annotation.gff>")
@@ -64,6 +65,7 @@ def main():
         ref_chroms=[reference_fasta]
         target_chroms= [target_fasta]
 
+
     #lift genes
     lifted_feature_list = {}
     unmapped_features = []

diff --git a/liftover_types.py b/liftover_types.py
@@ -8,12 +8,13 @@
 def lift_original_annotation(gff, target_fasta, reference_fasta, ref_chroms, target_chroms, processes, db,
                              lifted_feature_list, unmapped_features, infer_transcripts, infer_genes, cov_threshold, seq_threshold,
                              minimap2_path, inter_files):
+    liftover_type = "chrm_by_chrm"
     if target_chroms[0] == target_fasta:
         cov_threshold, seq_threshold = 0,0
     parent_dict, children_dict, intermediate_dict, feature_db, original_parent_order = extract_features.extract_features_to_lift(
-        gff, db, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files)
+        gff, db, ref_chroms, reference_fasta, processes, infer_transcripts, infer_genes, inter_files, liftover_type)
     aligned_segments = align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
-                                                               parent_dict, children_dict, "chrm_by_chrm",
+                                                               parent_dict, children_dict, liftover_type,
                                                                unmapped_features, reference_fasta, minimap2_path, inter_files, True)
 
     print("lifting features")
@@ -22,7 +23,7 @@ def lift_original_annotation(gff, target_fasta, reference_fasta, ref_chroms, tar
     fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
                                                                   aligned_segments, unmapped_features,
                                                                   cov_threshold, intermediate_dict, children_dict,
-                                                                  feature_db, original_parent_order, seq_threshold, "chrm_by_chrm")
+                                                                  feature_db, original_parent_order, seq_threshold)
     return feature_db, parent_dict, intermediate_dict, children_dict, original_parent_order
 
 
@@ -31,43 +32,45 @@ def map_unmapped_genes_agaisnt_all(unmapped_features, target_fasta, reference_fa
                                        children_dict, parent_order,minimap2_path,inter_files):
     liftoff_utils.clear_scores(lifted_feature_list, parent_dict)
     unmapped_dict = {}
+    liftover_type = "unmapped"
     for feature in unmapped_features:
         unmapped_dict[feature.id]=feature
-    extract_features.get_gene_sequences(unmapped_dict, ref_chroms, reference_fasta, processes, inter_files)
+    extract_features.get_gene_sequences(unmapped_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
     unmapped_features = []
     aligned_segments=align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
-                                                               unmapped_dict, children_dict, "missing", unmapped_features, reference_fasta,
+                                                               unmapped_dict, children_dict, liftover_type, unmapped_features, reference_fasta,
                                                              minimap2_path,inter_files, True)
     print("lifting features")
     lift_features.lift_all_features(aligned_segments, {}, 0.0, feature_db, unmapped_dict, children_dict,
                                     intermediate_dict, unmapped_features, lifted_feature_list, 0.0)
     fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
                                                                   aligned_segments, unmapped_features, 0.0,
                                                                   intermediate_dict, children_dict, feature_db,
-                                                                   parent_order, 0.0, "missing")
+                                                                   parent_order, 0.0)
     return unmapped_features
 
 
 def map_unplaced_genes(unmapped_features, target_fasta, reference_fasta, ref_chroms, target_chroms,
                                        processes, lifted_feature_list, feature_db, parent_dict, intermediate_dict,
                                        children_dict, parent_order,minimap2_path,inter_files):
     liftoff_utils.clear_scores(lifted_feature_list, parent_dict)
+    liftover_type = "unplaced"
     unplaced_dict = {}
     for feature_name in parent_dict:
         feature = parent_dict[feature_name]
         if feature.seqid in ref_chroms:
             unplaced_dict[feature.id] = feature
-    extract_features.get_gene_sequences(unplaced_dict, ref_chroms, reference_fasta, processes, inter_files)
+    extract_features.get_gene_sequences(unplaced_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
     aligned_segments=align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
-                                                               unplaced_dict, children_dict, "unplaced", unmapped_features,
+                                                               unplaced_dict, children_dict, liftover_type, unmapped_features,
                                                              reference_fasta,minimap2_path,inter_files, True)
     print("lifting features")
     lift_features.lift_all_features(aligned_segments, {}, 0.0, feature_db, unplaced_dict, children_dict,
                                     intermediate_dict, unmapped_features, lifted_feature_list, 0.0)
 
     fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
                                                                   aligned_segments, unmapped_features, 0.0,
-                                                                  intermediate_dict, children_dict, feature_db, parent_order, 0.0, "unplaced")
+                                                                  intermediate_dict, children_dict, feature_db, parent_order, 0.0)
 
 
 
@@ -76,9 +79,10 @@ def map_extra_copies(target_fasta, reference_fasta, ref_chroms, target_chroms, p
                      seq_threshold,minimap2_path,inter_files, remap):
     liftoff_utils.clear_scores(lifted_feature_list, parent_dict)
     unmapped_features = []
-    extract_features.get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files)
+    liftover_type = "copies"
+    extract_features.get_gene_sequences(parent_dict, ref_chroms, reference_fasta, processes, inter_files, liftover_type)
     aligned_segments=align_features.align_features_to_target(ref_chroms, target_chroms, processes, target_fasta,
-                                                               parent_dict, children_dict, "copies", unmapped_features, reference_fasta,
+                                                               parent_dict, children_dict, liftover_type, unmapped_features, reference_fasta,
                                                              minimap2_path,inter_files, remap
                                                              )
 
@@ -87,5 +91,5 @@ def map_extra_copies(target_fasta, reference_fasta, ref_chroms, target_chroms, p
                                     intermediate_dict, unmapped_features, lifted_feature_list, seq_threshold)
     fix_overlapping_features.fix_incorrectly_overlapping_features(lifted_feature_list, lifted_feature_list, parent_dict,
                                                                   aligned_segments, unmapped_features, 0.0,
-                                                                  intermediate_dict, children_dict, feature_db, parent_order, seq_threshold, "copies")
+                                                                  intermediate_dict, children_dict, feature_db, parent_order, seq_threshold)