Commit
Fix annotation intron extraction and update Splam to v1.0.10
Kuanhao-Chao committed Nov 16, 2023
1 parent 10e4e6b commit 1c0f993
Showing 8 changed files with 136 additions and 58 deletions.
2 changes: 1 addition & 1 deletion src/include/htslib
Submodule htslib updated 71 files
+1 −0 .gitignore
+16 −8 Makefile
+208 −0 annot-tsv.1
+908 −0 annot-tsv.c
+34 −36 bgzf.c
+13 −41 configure.ac
+7 −2 cram/cram_index.c
+10 −12 cram/cram_io.c
+2 −4 cram/cram_io.h
+2 −1 cram/cram_stats.c
+8 −2 hfile.c
+1 −1 hfile_libcurl.c
+29 −3 hts.c
+8 −37 hts_probe_cc.sh
+1 −1 htscodecs
+3 −0 htslib.mk
+1 −1 htslib/hfile.h
+9 −0 htslib/hts_defs.h
+4 −4 htslib/klist.h
+4 −4 htslib/kseq.h
+2 −2 m4/hts_check_compile_flags_needed.m4
+2 −0 regidx.c
+11 −6 sam.c
+3 −0 test/annot-tsv/dst.1.txt
+4 −0 test/annot-tsv/dst.10.txt
+2 −0 test/annot-tsv/dst.2.txt
+3 −0 test/annot-tsv/dst.3.txt
+3 −0 test/annot-tsv/dst.4.txt
+2 −0 test/annot-tsv/dst.5.txt
+6 −0 test/annot-tsv/dst.6.txt
+4 −0 test/annot-tsv/dst.7.txt
+2 −0 test/annot-tsv/dst.8.txt
+4 −0 test/annot-tsv/dst.9.txt
+3 −0 test/annot-tsv/out.1.1.txt
+3 −0 test/annot-tsv/out.1.2.txt
+3 −0 test/annot-tsv/out.1.3.txt
+3 −0 test/annot-tsv/out.1.4.txt
+3 −0 test/annot-tsv/out.1.5.txt
+3 −0 test/annot-tsv/out.1.6.txt
+4 −0 test/annot-tsv/out.10.1.txt
+3 −0 test/annot-tsv/out.10.2.txt
+2 −0 test/annot-tsv/out.10.3.txt
+4 −0 test/annot-tsv/out.10.4.txt
+2 −0 test/annot-tsv/out.10.5.txt
+3 −0 test/annot-tsv/out.10.6.txt
+2 −0 test/annot-tsv/out.2.1.txt
+2 −0 test/annot-tsv/out.2.2.txt
+2 −0 test/annot-tsv/out.2.3.txt
+2 −0 test/annot-tsv/out.2.4.txt
+2 −0 test/annot-tsv/out.2.5.txt
+2 −0 test/annot-tsv/out.2.6.txt
+3 −0 test/annot-tsv/out.3.1.txt
+3 −0 test/annot-tsv/out.4.1.txt
+2 −0 test/annot-tsv/out.5.1.txt
+6 −0 test/annot-tsv/out.6.1.txt
+3 −0 test/annot-tsv/out.7.1.txt
+2 −0 test/annot-tsv/out.8.1.txt
+4 −0 test/annot-tsv/out.9.1.txt
+5 −0 test/annot-tsv/src.1.txt
+3 −0 test/annot-tsv/src.10.txt
+4 −0 test/annot-tsv/src.2.txt
+5 −0 test/annot-tsv/src.3.txt
+3 −0 test/annot-tsv/src.4.txt
+2 −0 test/annot-tsv/src.5.txt
+3 −0 test/annot-tsv/src.6.txt
+3 −0 test/annot-tsv/src.7.txt
+2 −0 test/annot-tsv/src.8.txt
+3 −0 test/annot-tsv/src.9.txt
+151 −38 test/test.pl
+75 −0 test/usepublic.cpp
+11 −5 vcf.c
2 changes: 1 addition & 1 deletion src/setup.py
@@ -7,7 +7,7 @@
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext

VERSION = "1.0.9"
VERSION = "1.0.10"

# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
2 changes: 1 addition & 1 deletion src/splam/__init__.py
@@ -1 +1 @@
__version__ = '1.0.9'
__version__ = '1.0.10'
173 changes: 121 additions & 52 deletions src/splam/extract_gff.py
@@ -10,11 +10,20 @@ def write_introns(fw, intron_dict, junc_count):
trans_ls = ",".join(eles[5])
fw.write(f'{eles[0]}\t{str(eles[1])}\t{str(eles[2])}\tJUNC{junc_count:08}\t{str(eles[3])}\t{str(eles[4])}\t{trans_ls}\n')
junc_count += 1
intron_dict = {}
return junc_count, intron_dict

return junc_count
def get_parent_features_to_extract(feature_types_file):
feature_types = ["gene"]
if feature_types_file is not None:
feature_types = []
f = open(feature_types_file)
for line in f.readlines():
feature_types.append(line.rstrip())
return feature_types


def extract_introns(gff_file, gff_db, is_load_gff_db, bed_file, trans_intron_num_txt):
def extract_introns(gff_file, gff_db, feature_file, is_load_gff_db, bed_file, trans_intron_num_txt):
print("gff_file: ", gff_file)
print("gff_db: ", gff_db)
print("bed_file: ", bed_file)
@@ -26,64 +26,124 @@ def extract_introns(gff_file, gff_db, is_load_gff_db, bed_file, trans_intron_num
# Load the GFF database
db = gffutils.FeatureDB(gff_db, keep_order=True)

genes = db.features_of_type("gene")

features = get_parent_features_to_extract(feature_file)
intron_dict = {}
trans_2_intron_num = {}

junc_count = 0
with open(bed_file, 'w') as fw:

bundle_s = 0
bundle_e = 0
bundle_chr = ""
for feature in features:
for locus in db.features_of_type(feature):
bundle_s, bundle_e, bundle_chr, junc_count, intron_dict = extract_introns_itr(db, locus, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw)

for gene in genes:

# curr_s = gene.start
# curr_e = gene.end

if bundle_chr != gene[0]:
# Reconstruct the bundle
bundle_s = gene.start
bundle_e = gene.end
bundle_chr = gene[0]
# Write out all introns
junc_count = write_introns(fw, intron_dict, junc_count)
intron_dict = {}

elif bundle_chr == gene[0] and gene.start < bundle_e:
# Extend the bundle
bundle_e = gene.end

# Retrieve child features
children = db.children(gene, level=1)

# Print child features
for child in children:
chrom = child[0]
trans_id = child.id
trans_s = child.start
trans_e = child.end
trans_2_intron_num[trans_id] = 0

exons = db.children(child, featuretype='exon', order_by='start', level=1)
prev_exon = ""
for exon in exons:
if (prev_exon != ""):
key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}'
trans_2_intron_num[trans_id] += 1
if key in intron_dict.keys():
intron_dict[key]["count"] += 1
intron_dict[key]["trans"].add(trans_id)
else:
intron_dict[key] = {}
intron_dict[key]["count"] = 1
intron_dict[key]["trans"] = set([trans_id])
prev_exon = exon

# Write out all introns
junc_count = write_introns(fw, intron_dict, junc_count)
junc_count, intron_dict = write_introns(fw, intron_dict, junc_count)
intron_dict = {}

json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w'))



def extract_introns_itr(db, locus, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw):
children = list(db.children(locus, featuretype='exon', level=1, order_by='start'))
if len(children) == 0:
children = db.children(locus, level=1)
for child in list(children):
bundle_s, bundle_e, bundle_chr, junc_count, intron_dict = extract_introns_itr(db, child, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw)

else:
if bundle_chr != locus[0]:
# Reconstruct the bundle
bundle_s = locus.start
bundle_e = locus.end
bundle_chr = locus[0]
# Write out all introns
junc_count, intron_dict = write_introns(fw, intron_dict, junc_count)

elif bundle_chr == locus[0] and locus.start <= bundle_e:
# Extend the bundle
bundle_e = locus.end

elif bundle_chr == locus[0] and locus.start > bundle_e:
bundle_s = locus.start
bundle_e = locus.end
# Write out all introns
junc_count, intron_dict = write_introns(fw, intron_dict, junc_count)

# Print child features
exons = db.children(locus, featuretype='exon', level=1, order_by='start')
prev_exon = ""
for exon in exons:
if (prev_exon != ""):
key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}'
if locus.id not in trans_2_intron_num.keys():
trans_2_intron_num[locus.id] = 1
else:
trans_2_intron_num[locus.id] += 1
if key in intron_dict.keys():
intron_dict[key]["count"] += 1
intron_dict[key]["trans"].add(locus.id)
else:
intron_dict[key] = {}
intron_dict[key]["count"] = 1
intron_dict[key]["trans"] = set([locus.id])
prev_exon = exon
return bundle_s, bundle_e, bundle_chr, junc_count, intron_dict

# with open(bed_file, 'w') as fw:

# bundle_s = 0
# bundle_e = 0
# bundle_chr = ""

# for gene in genes:

# # curr_s = gene.start
# # curr_e = gene.end

# if bundle_chr != gene[0]:
# # Reconstruct the bundle
# bundle_s = gene.start
# bundle_e = gene.end
# bundle_chr = gene[0]
# # Write out all introns
# junc_count = write_introns(fw, intron_dict, junc_count)
# intron_dict = {}

# elif bundle_chr == gene[0] and gene.start < bundle_e:
# # Extend the bundle
# bundle_e = gene.end

# # Retrieve child features
# children = db.children(gene, level=1)

# # Print child features
# for child in children:
# chrom = child[0]
# trans_id = child.id
# trans_s = child.start
# trans_e = child.end
# trans_2_intron_num[trans_id] = 0

# exons = db.children(child, featuretype='exon', order_by='start', level=1)
# prev_exon = ""
# for exon in exons:
# if (prev_exon != ""):
# key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}'
# trans_2_intron_num[trans_id] += 1
# if key in intron_dict.keys():
# intron_dict[key]["count"] += 1
# intron_dict[key]["trans"].add(trans_id)
# else:
# intron_dict[key] = {}
# intron_dict[key]["count"] = 1
# intron_dict[key]["trans"] = set([trans_id])
# prev_exon = exon

# # Write out all introns
# junc_count = write_introns(fw, intron_dict, junc_count)
# intron_dict = {}

json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w'))
# json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w'))
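
The heart of the fix is the bundling logic: write_introns now hands back a reset intron_dict together with the junction counter, and extract_introns_itr flushes the accumulated introns whenever the chromosome changes or the next locus starts beyond the current bundle end, recursing through child features until it reaches ones that carry exons. Below is a minimal sketch of that flush decision, illustrative only and not the committed code; flush_introns and update_bundle are invented names, and the record handling is simplified relative to the real write_introns.

def flush_introns(fw, intron_dict, junc_count):
    # Simplified stand-in for write_introns(): emit one BED line per junction,
    # then hand back a fresh, empty dict (the behaviour the fix introduces).
    for key, eles in intron_dict.items():
        chrom, start, end, strand = key.split("\t")
        trans_ls = ",".join(sorted(eles["trans"]))
        fw.write(f"{chrom}\t{start}\t{end}\tJUNC{junc_count:08}\t{eles['count']}\t{strand}\t{trans_ls}\n")
        junc_count += 1
    return junc_count, {}

def update_bundle(locus_chr, locus_start, locus_end,
                  bundle_chr, bundle_s, bundle_e,
                  fw, intron_dict, junc_count):
    # Mirrors the branches in extract_introns_itr():
    if bundle_chr != locus_chr or locus_start > bundle_e:
        # New chromosome, or a locus past the bundle end: flush and restart.
        junc_count, intron_dict = flush_introns(fw, intron_dict, junc_count)
        bundle_chr, bundle_s, bundle_e = locus_chr, locus_start, locus_end
    else:
        # Overlapping locus on the same chromosome: extend the bundle.
        bundle_e = max(bundle_e, locus_end)
    return bundle_chr, bundle_s, bundle_e, intron_dict, junc_count
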
2 changes: 1 addition & 1 deletion src/splam/header.py
@@ -8,4 +8,4 @@
__license__ = "MIT"
__maintainer__ = "developer"
__status__ = "Kuan-Hao Chao"
__version__ = "1.0.9"
__version__ = "1.0.10"
3 changes: 2 additions & 1 deletion src/splam/main.py
@@ -31,6 +31,7 @@ def parse_args(args):
parser_extract.add_argument('-V', '--verbose',
action='store_true',
help='running Splam in verbose mode.') # on/off flag
parser_extract.add_argument('-F', '--features', metavar='TYPES', default=None, help='list of feature types to extract introns')
parser_extract.add_argument('-P', '--paired',
action='store_true',
help='bundling alignments in "paired-end" mode.') # on/off flag
@@ -178,7 +179,7 @@ def main(argv=None):

if not os.path.exists(outdir):
os.makedirs(outdir, exist_ok=True)
extract_gff.extract_introns(input, gff_db, is_load_gff_db, junction_bed, trans_intron_num_txt)
extract_gff.extract_introns(input, gff_db, args.features, is_load_gff_db, junction_bed, trans_intron_num_txt)

elif file_format == "BAM" or file_format == "bam":
argv_extract = sys.argv
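
The new -F/--features option feeds get_parent_features_to_extract, which reads one parent feature type per line and falls back to "gene" when no file is given. A small usage sketch follows; the single "transcript" entry is an assumed content for feature_transcript.txt, matching how the test scripts below use it, not a file shipped in this commit.

# Sketch of the feature-type file consumed via -F/--features.
# One parent feature type per line; omit the file to default to "gene".

def get_parent_features_to_extract(feature_types_file):
    # Same logic as the helper added to extract_gff.py above.
    feature_types = ["gene"]
    if feature_types_file is not None:
        with open(feature_types_file) as f:
            feature_types = [line.rstrip() for line in f]
    return feature_types

# Example: a file selecting transcript-level parents, as used by
# test/script_annotation_lncRNA.sh (assumed contents).
with open("feature_transcript.txt", "w") as f:
    f.write("transcript\n")

print(get_parent_features_to_extract("feature_transcript.txt"))  # ['transcript']
print(get_parent_features_to_extract(None))                      # ['gene']
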
2 changes: 1 addition & 1 deletion test/script_annotation.sh
@@ -1,5 +1,5 @@
# Step 1: extract introns in the annotation
splam extract refseq_110_GRCh38_chr_fixed.gff -o tmp_out_annotation
splam extract refseq_110_GRCh38_chr_fixed.gff -o tmp_out_annotation -F feature_gene.txt

# Step 2: score introns in the annotation
splam score -G chr9_subset.fa -m ../model/splam_script.pt -o tmp_out_annotation tmp_out_annotation/junction.bed
8 changes: 8 additions & 0 deletions test/script_annotation_lncRNA.sh
@@ -0,0 +1,8 @@
# Step 1: extract introns in the annotation
splam extract lncs_and_exons.gff -o tmp_out_annotation_lncRNA -F feature_transcript.txt

# Step 2: score introns in the annotation
splam score -G ~/Documents/Projects/ref_genome/homo_sapiens/NCBI_Refseq_chr_fixed/GCF_000001405.40_GRCh38.p14_genomic.fna -m ../model/splam_script.pt -o tmp_out_annotation_lncRNA tmp_out_annotation_lncRNA/junction.bed

# Step 3: output statistics of each transcript
splam clean -o tmp_out_annotation_lncRNA -t 0.8
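
The extract step writes its junctions to junction.bed inside the output directory, in the seven-column layout produced by write_introns: chromosome, start, end, junction name, count, strand, and a comma-separated transcript list (the count/strand positions are inferred from the intron_dict layout above rather than documented output). A quick sanity-check sketch:

# Sketch: skim the extracted junction.bed and report each junction.
# Column meanings follow write_introns() above; count/strand order is inferred.
import csv

with open("tmp_out_annotation_lncRNA/junction.bed") as f:
    for chrom, start, end, name, count, strand, trans in csv.reader(f, delimiter="\t"):
        n_trans = len(trans.split(","))
        print(f"{name} {chrom}:{start}-{end} ({strand}) support count {count}, {n_trans} transcript(s)")
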
