Commit
Fix annotation intron extraction and update Splam to v1.0.10
Kuanhao-Chao committed Nov 16, 2023
1 parent 10e4e6b commit 1c0f993
Showing 8 changed files with 136 additions and 58 deletions.
2 changes: 1 addition & 1 deletion src/include/htslib
Submodule htslib updated 71 files
+1 −0 .gitignore
+16 −8 Makefile
+208 −0 annot-tsv.1
+908 −0 annot-tsv.c
+34 −36 bgzf.c
+13 −41 configure.ac
+7 −2 cram/cram_index.c
+10 −12 cram/cram_io.c
+2 −4 cram/cram_io.h
+2 −1 cram/cram_stats.c
+8 −2 hfile.c
+1 −1 hfile_libcurl.c
+29 −3 hts.c
+8 −37 hts_probe_cc.sh
+1 −1 htscodecs
+3 −0 htslib.mk
+1 −1 htslib/hfile.h
+9 −0 htslib/hts_defs.h
+4 −4 htslib/klist.h
+4 −4 htslib/kseq.h
+2 −2 m4/hts_check_compile_flags_needed.m4
+2 −0 regidx.c
+11 −6 sam.c
+3 −0 test/annot-tsv/dst.1.txt
+4 −0 test/annot-tsv/dst.10.txt
+2 −0 test/annot-tsv/dst.2.txt
+3 −0 test/annot-tsv/dst.3.txt
+3 −0 test/annot-tsv/dst.4.txt
+2 −0 test/annot-tsv/dst.5.txt
+6 −0 test/annot-tsv/dst.6.txt
+4 −0 test/annot-tsv/dst.7.txt
+2 −0 test/annot-tsv/dst.8.txt
+4 −0 test/annot-tsv/dst.9.txt
+3 −0 test/annot-tsv/out.1.1.txt
+3 −0 test/annot-tsv/out.1.2.txt
+3 −0 test/annot-tsv/out.1.3.txt
+3 −0 test/annot-tsv/out.1.4.txt
+3 −0 test/annot-tsv/out.1.5.txt
+3 −0 test/annot-tsv/out.1.6.txt
+4 −0 test/annot-tsv/out.10.1.txt
+3 −0 test/annot-tsv/out.10.2.txt
+2 −0 test/annot-tsv/out.10.3.txt
+4 −0 test/annot-tsv/out.10.4.txt
+2 −0 test/annot-tsv/out.10.5.txt
+3 −0 test/annot-tsv/out.10.6.txt
+2 −0 test/annot-tsv/out.2.1.txt
+2 −0 test/annot-tsv/out.2.2.txt
+2 −0 test/annot-tsv/out.2.3.txt
+2 −0 test/annot-tsv/out.2.4.txt
+2 −0 test/annot-tsv/out.2.5.txt
+2 −0 test/annot-tsv/out.2.6.txt
+3 −0 test/annot-tsv/out.3.1.txt
+3 −0 test/annot-tsv/out.4.1.txt
+2 −0 test/annot-tsv/out.5.1.txt
+6 −0 test/annot-tsv/out.6.1.txt
+3 −0 test/annot-tsv/out.7.1.txt
+2 −0 test/annot-tsv/out.8.1.txt
+4 −0 test/annot-tsv/out.9.1.txt
+5 −0 test/annot-tsv/src.1.txt
+3 −0 test/annot-tsv/src.10.txt
+4 −0 test/annot-tsv/src.2.txt
+5 −0 test/annot-tsv/src.3.txt
+3 −0 test/annot-tsv/src.4.txt
+2 −0 test/annot-tsv/src.5.txt
+3 −0 test/annot-tsv/src.6.txt
+3 −0 test/annot-tsv/src.7.txt
+2 −0 test/annot-tsv/src.8.txt
+3 −0 test/annot-tsv/src.9.txt
+151 −38 test/test.pl
+75 −0 test/usepublic.cpp
+11 −5 vcf.c
2 changes: 1 addition & 1 deletion src/setup.py
@@ -7,7 +7,7 @@
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext

VERSION = "1.0.9"
VERSION = "1.0.10"

# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
2 changes: 1 addition & 1 deletion src/splam/__init__.py
@@ -1 +1 @@
__version__ = '1.0.9'
__version__ = '1.0.10'
173 changes: 121 additions & 52 deletions src/splam/extract_gff.py
@@ -10,11 +10,20 @@ def write_introns(fw, intron_dict, junc_count):
trans_ls = ",".join(eles[5])
fw.write(f'{eles[0]}\t{str(eles[1])}\t{str(eles[2])}\tJUNC{junc_count:08}\t{str(eles[3])}\t{str(eles[4])}\t{trans_ls}\n')
junc_count += 1
intron_dict = {}
return junc_count, intron_dict

return junc_count
def get_parent_features_to_extract(feature_types_file):
feature_types = ["gene"]
if feature_types_file is not None:
feature_types = []
f = open(feature_types_file)
for line in f.readlines():
feature_types.append(line.rstrip())
return feature_types


def extract_introns(gff_file, gff_db, is_load_gff_db, bed_file, trans_intron_num_txt):
def extract_introns(gff_file, gff_db, feature_file, is_load_gff_db, bed_file, trans_intron_num_txt):
print("gff_file: ", gff_file)
print("gff_db: ", gff_db)
print("bed_file: ", bed_file)
@@ -26,64 +26,124 @@ def extract_introns(gff_file, gff_db, is_load_gff_db, bed_file, trans_intron_num
# Load the GFF database
db = gffutils.FeatureDB(gff_db, keep_order=True)

genes = db.features_of_type("gene")

features = get_parent_features_to_extract(feature_file)
intron_dict = {}
trans_2_intron_num = {}

junc_count = 0
with open(bed_file, 'w') as fw:

bundle_s = 0
bundle_e = 0
bundle_chr = ""
for feature in features:
for locus in db.features_of_type(feature):
bundle_s, bundle_e, bundle_chr, junc_count, intron_dict = extract_introns_itr(db, locus, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw)

for gene in genes:

# curr_s = gene.start
# curr_e = gene.end

if bundle_chr != gene[0]:
# Reconstruct the bundle
bundle_s = gene.start
bundle_e = gene.end
bundle_chr = gene[0]
# Write out all introns
junc_count = write_introns(fw, intron_dict, junc_count)
intron_dict = {}

elif bundle_chr == gene[0] and gene.start < bundle_e:
# Extend the bundle
bundle_e = gene.end

# Retrieve child features
children = db.children(gene, level=1)

# Print child features
for child in children:
chrom = child[0]
trans_id = child.id
trans_s = child.start
trans_e = child.end
trans_2_intron_num[trans_id] = 0

exons = db.children(child, featuretype='exon', order_by='start', level=1)
prev_exon = ""
for exon in exons:
if (prev_exon != ""):
key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}'
trans_2_intron_num[trans_id] += 1
if key in intron_dict.keys():
intron_dict[key]["count"] += 1
intron_dict[key]["trans"].add(trans_id)
else:
intron_dict[key] = {}
intron_dict[key]["count"] = 1
intron_dict[key]["trans"] = set([trans_id])
prev_exon = exon

# Write out all introns
junc_count = write_introns(fw, intron_dict, junc_count)
junc_count, intron_dict = write_introns(fw, intron_dict, junc_count)
intron_dict = {}

json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w'))



def extract_introns_itr(db, locus, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw):
children = list(db.children(locus, featuretype='exon', level=1, order_by='start'))
if len(children) == 0:
children = db.children(locus, level=1)
for child in list(children):
bundle_s, bundle_e, bundle_chr, junc_count, intron_dict = extract_introns_itr(db, child, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw)

else:
if bundle_chr != locus[0]:
# Reconstruct the bundle
bundle_s = locus.start
bundle_e = locus.end
bundle_chr = locus[0]
# Write out all introns
junc_count, intron_dict = write_introns(fw, intron_dict, junc_count)

elif bundle_chr == locus[0] and locus.start <= bundle_e:
# Extend the bundle
bundle_e = locus.end

elif bundle_chr == locus[0] and locus.start > bundle_e:
bundle_s = locus.start
bundle_e = locus.end
# Write out all introns
junc_count, intron_dict = write_introns(fw, intron_dict, junc_count)

# Print child features
exons = db.children(locus, featuretype='exon', level=1, order_by='start')
prev_exon = ""
for exon in exons:
if (prev_exon != ""):
key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}'
if locus.id not in trans_2_intron_num.keys():
trans_2_intron_num[locus.id] = 1
else:
trans_2_intron_num[locus.id] += 1
if key in intron_dict.keys():
intron_dict[key]["count"] += 1
intron_dict[key]["trans"].add(locus.id)
else:
intron_dict[key] = {}
intron_dict[key]["count"] = 1
intron_dict[key]["trans"] = set([locus.id])
prev_exon = exon
return bundle_s, bundle_e, bundle_chr, junc_count, intron_dict

# with open(bed_file, 'w') as fw:

# bundle_s = 0
# bundle_e = 0
# bundle_chr = ""

# for gene in genes:

# # curr_s = gene.start
# # curr_e = gene.end

# if bundle_chr != gene[0]:
# # Reconstruct the bundle
# bundle_s = gene.start
# bundle_e = gene.end
# bundle_chr = gene[0]
# # Write out all introns
# junc_count = write_introns(fw, intron_dict, junc_count)
# intron_dict = {}

# elif bundle_chr == gene[0] and gene.start < bundle_e:
# # Extend the bundle
# bundle_e = gene.end

# # Retrieve child features
# children = db.children(gene, level=1)

# # Print child features
# for child in children:
# chrom = child[0]
# trans_id = child.id
# trans_s = child.start
# trans_e = child.end
# trans_2_intron_num[trans_id] = 0

# exons = db.children(child, featuretype='exon', order_by='start', level=1)
# prev_exon = ""
# for exon in exons:
# if (prev_exon != ""):
# key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}'
# trans_2_intron_num[trans_id] += 1
# if key in intron_dict.keys():
# intron_dict[key]["count"] += 1
# intron_dict[key]["trans"].add(trans_id)
# else:
# intron_dict[key] = {}
# intron_dict[key]["count"] = 1
# intron_dict[key]["trans"] = set([trans_id])
# prev_exon = exon

# # Write out all introns
# junc_count = write_introns(fw, intron_dict, junc_count)
# intron_dict = {}

json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w'))
# json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w'))
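
The heart of the fix is the bundling logic: write_introns now hands back a reset intron_dict together with the junction counter, and extract_introns_itr flushes the accumulated introns whenever the chromosome changes or the next locus starts beyond the current bundle end, recursing through child features until it reaches ones that carry exons. Below is a minimal sketch of that flush decision, illustrative only and not the committed code; flush_introns and update_bundle are invented names, and the record handling is simplified relative to the real write_introns.

def flush_introns(fw, intron_dict, junc_count):
    # Simplified stand-in for write_introns(): emit one BED line per junction,
    # then hand back a fresh, empty dict (the behaviour the fix introduces).
    for key, eles in intron_dict.items():
        chrom, start, end, strand = key.split("\t")
        trans_ls = ",".join(sorted(eles["trans"]))
        fw.write(f"{chrom}\t{start}\t{end}\tJUNC{junc_count:08}\t{eles['count']}\t{strand}\t{trans_ls}\n")
        junc_count += 1
    return junc_count, {}

def update_bundle(locus_chr, locus_start, locus_end,
                  bundle_chr, bundle_s, bundle_e,
                  fw, intron_dict, junc_count):
    # Mirrors the branches in extract_introns_itr():
    if bundle_chr != locus_chr or locus_start > bundle_e:
        # New chromosome, or a locus past the bundle end: flush and restart.
        junc_count, intron_dict = flush_introns(fw, intron_dict, junc_count)
        bundle_chr, bundle_s, bundle_e = locus_chr, locus_start, locus_end
    else:
        # Overlapping locus on the same chromosome: extend the bundle.
        bundle_e = max(bundle_e, locus_end)
    return bundle_chr, bundle_s, bundle_e, intron_dict, junc_count
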
2 changes: 1 addition & 1 deletion src/splam/header.py
@@ -8,4 +8,4 @@
__license__ = "MIT"
__maintainer__ = "developer"
__status__ = "Kuan-Hao Chao"
__version__ = "1.0.9"
__version__ = "1.0.10"
3 changes: 2 additions & 1 deletion src/splam/main.py
@@ -31,6 +31,7 @@ def parse_args(args):
parser_extract.add_argument('-V', '--verbose',
action='store_true',
help='running Splam in verbose mode.') # on/off flag
parser_extract.add_argument('-F', '--features', metavar='TYPES', default=None, help='list of feature types to extract introns')
parser_extract.add_argument('-P', '--paired',
action='store_true',
help='bundling alignments in "paired-end" mode.') # on/off flag
@@ -178,7 +179,7 @@ def main(argv=None):

if not os.path.exists(outdir):
os.makedirs(outdir, exist_ok=True)
extract_gff.extract_introns(input, gff_db, is_load_gff_db, junction_bed, trans_intron_num_txt)
extract_gff.extract_introns(input, gff_db, args.features, is_load_gff_db, junction_bed, trans_intron_num_txt)

elif file_format == "BAM" or file_format == "bam":
argv_extract = sys.argv
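
The new -F/--features option feeds get_parent_features_to_extract, which reads one parent feature type per line and falls back to "gene" when no file is given. A small usage sketch follows; the single "transcript" entry is an assumed content for feature_transcript.txt, matching how the test scripts below use it, not a file shipped in this commit.

# Sketch of the feature-type file consumed via -F/--features.
# One parent feature type per line; omit the file to default to "gene".

def get_parent_features_to_extract(feature_types_file):
    # Same logic as the helper added to extract_gff.py above.
    feature_types = ["gene"]
    if feature_types_file is not None:
        with open(feature_types_file) as f:
            feature_types = [line.rstrip() for line in f]
    return feature_types

# Example: a file selecting transcript-level parents, as used by
# test/script_annotation_lncRNA.sh (assumed contents).
with open("feature_transcript.txt", "w") as f:
    f.write("transcript\n")

print(get_parent_features_to_extract("feature_transcript.txt"))  # ['transcript']
print(get_parent_features_to_extract(None))                      # ['gene']
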
2 changes: 1 addition & 1 deletion test/script_annotation.sh
@@ -1,5 +1,5 @@
# Step 1: extract introns in the annotation
splam extract refseq_110_GRCh38_chr_fixed.gff -o tmp_out_annotation
splam extract refseq_110_GRCh38_chr_fixed.gff -o tmp_out_annotation -F feature_gene.txt

# Step 2: score introns in the annotation
splam score -G chr9_subset.fa -m ../model/splam_script.pt -o tmp_out_annotation tmp_out_annotation/junction.bed
8 changes: 8 additions & 0 deletions test/script_annotation_lncRNA.sh
@@ -0,0 +1,8 @@
# Step 1: extract introns in the annotation
splam extract lncs_and_exons.gff -o tmp_out_annotation_lncRNA -F feature_transcript.txt

# Step 2: score introns in the annotation
splam score -G ~/Documents/Projects/ref_genome/homo_sapiens/NCBI_Refseq_chr_fixed/GCF_000001405.40_GRCh38.p14_genomic.fna -m ../model/splam_script.pt -o tmp_out_annotation_lncRNA tmp_out_annotation_lncRNA/junction.bed

# Step 3: output statistics of each transcript
splam clean -o tmp_out_annotation_lncRNA -t 0.8
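
The extract step writes its junctions to junction.bed inside the output directory, in the seven-column layout produced by write_introns: chromosome, start, end, junction name, count, strand, and a comma-separated transcript list (the count/strand positions are inferred from the intron_dict layout above rather than documented output). A quick sanity-check sketch:

# Sketch: skim the extracted junction.bed and report each junction.
# Column meanings follow write_introns() above; count/strand order is inferred.
import csv

with open("tmp_out_annotation_lncRNA/junction.bed") as f:
    for chrom, start, end, name, count, strand, trans in csv.reader(f, delimiter="\t"):
        n_trans = len(trans.split(","))
        print(f"{name} {chrom}:{start}-{end} ({strand}) support count {count}, {n_trans} transcript(s)")
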
