diff --git a/src/include/htslib b/src/include/htslib index 7018ee7..2140d03 160000 --- a/src/include/htslib +++ b/src/include/htslib @@ -1 +1 @@ -Subproject commit 7018ee74f87932e89b33c5647e8c9ab716e4e187 +Subproject commit 2140d03e964563e9539295a337852bbb75b7ec92 diff --git a/src/setup.py b/src/setup.py index 7c9d7a7..fc760f4 100644 --- a/src/setup.py +++ b/src/setup.py @@ -7,7 +7,7 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -VERSION = "1.0.9" +VERSION = "1.0.10" # Convert distutils Windows platform specifiers to CMake -A arguments PLAT_TO_CMAKE = { diff --git a/src/splam/__init__.py b/src/splam/__init__.py index c1a7902..36f0696 100644 --- a/src/splam/__init__.py +++ b/src/splam/__init__.py @@ -1 +1 @@ -__version__ = '1.0.9' \ No newline at end of file +__version__ = '1.0.10' \ No newline at end of file diff --git a/src/splam/extract_gff.py b/src/splam/extract_gff.py index 40b7c1e..2d144e0 100644 --- a/src/splam/extract_gff.py +++ b/src/splam/extract_gff.py @@ -10,11 +10,20 @@ def write_introns(fw, intron_dict, junc_count): trans_ls = ",".join(eles[5]) fw.write(f'{eles[0]}\t{str(eles[1])}\t{str(eles[2])}\tJUNC{junc_count:08}\t{str(eles[3])}\t{str(eles[4])}\t{trans_ls}\n') junc_count += 1 + intron_dict = {} + return junc_count, intron_dict - return junc_count +def get_parent_features_to_extract(feature_types_file): + feature_types = ["gene"] + if feature_types_file is not None: + feature_types = [] + f = open(feature_types_file) + for line in f.readlines(): + feature_types.append(line.rstrip()) + return feature_types -def extract_introns(gff_file, gff_db, is_load_gff_db, bed_file, trans_intron_num_txt): +def extract_introns(gff_file, gff_db, feature_file, is_load_gff_db, bed_file, trans_intron_num_txt): print("gff_file: ", gff_file) print("gff_db: ", gff_db) print("bed_file: ", bed_file) @@ -26,64 +35,124 @@ def extract_introns(gff_file, gff_db, is_load_gff_db, bed_file, trans_intron_num # Load the GFF database db = gffutils.FeatureDB(gff_db, keep_order=True) - genes = db.features_of_type("gene") - + features = get_parent_features_to_extract(feature_file) intron_dict = {} trans_2_intron_num = {} - junc_count = 0 with open(bed_file, 'w') as fw: - bundle_s = 0 bundle_e = 0 bundle_chr = "" + for feature in features: + for locus in db.features_of_type(feature): + bundle_s, bundle_e, bundle_chr, junc_count, intron_dict= extract_introns_itr(db, locus, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw) - for gene in genes: - - # curr_s = gene.start - # curr_e = gene.end - - if bundle_chr != gene[0]: - # Reconstruct the bundle - bundle_s = gene.start - bundle_e = gene.end - bundle_chr = gene[0] - # Write out all introns - junc_count = write_introns(fw, intron_dict, junc_count) - intron_dict = {} - - elif bundle_chr == gene[0] and gene.start < bundle_e: - # Extend the bundle - bundle_e = gene.end - - # Retrieve child features - children = db.children(gene, level=1) - - # Print child features - for child in children: - chrom = child[0] - trans_id = child.id - trans_s = child.start - trans_e = child.end - trans_2_intron_num[trans_id] = 0 - - exons = db.children(child, featuretype='exon', order_by='start', level=1) - prev_exon = "" - for exon in exons: - if (prev_exon != ""): - key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}' - trans_2_intron_num[trans_id] += 1 - if key in intron_dict.keys(): - intron_dict[key]["count"] += 1 - intron_dict[key]["trans"].add(trans_id) - else: - intron_dict[key] = {} - intron_dict[key]["count"] = 1 - intron_dict[key]["trans"] = set([trans_id]) - prev_exon = exon - # Write out all introns - junc_count = write_introns(fw, intron_dict, junc_count) + junc_count, intron_dict = write_introns(fw, intron_dict, junc_count) intron_dict = {} + + json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w')) + + + +def extract_introns_itr(db, locus, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw): + children = list(db.children(locus, featuretype='exon', level=1, order_by='start')) + if len(children) == 0: + children = db.children(locus, level=1) + for child in list(children): + bundle_s, bundle_e, bundle_chr, junc_count, intron_dict = extract_introns_itr(db, child, bundle_s, bundle_e, bundle_chr, intron_dict, trans_2_intron_num, junc_count, fw) + + else: + if bundle_chr != locus[0]: + # Reconstruct the bundle + bundle_s = locus.start + bundle_e = locus.end + bundle_chr = locus[0] + # Write out all introns + junc_count, intron_dict = write_introns(fw, intron_dict, junc_count) + + elif bundle_chr == locus[0] and locus.start <= bundle_e: + # Extend the bundle + bundle_e = locus.end + + elif bundle_chr == locus[0] and locus.start > bundle_e: + bundle_s = locus.start + bundle_e = locus.end + # Write out all introns + junc_count, intron_dict = write_introns(fw, intron_dict, junc_count) + + # Print child features + exons = db.children(locus, featuretype='exon', level=1, order_by='start') + prev_exon = "" + for exon in exons: + if (prev_exon != ""): + key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}' + if locus.id not in trans_2_intron_num.keys(): + trans_2_intron_num[locus.id] = 1 + else: + trans_2_intron_num[locus.id] += 1 + if key in intron_dict.keys(): + intron_dict[key]["count"] += 1 + intron_dict[key]["trans"].add(locus.id) + else: + intron_dict[key] = {} + intron_dict[key]["count"] = 1 + intron_dict[key]["trans"] = set([locus.id]) + prev_exon = exon + return bundle_s, bundle_e, bundle_chr, junc_count, intron_dict + + # with open(bed_file, 'w') as fw: + + # bundle_s = 0 + # bundle_e = 0 + # bundle_chr = "" + + # for gene in genes: + + # # curr_s = gene.start + # # curr_e = gene.end + + # if bundle_chr != gene[0]: + # # Reconstruct the bundle + # bundle_s = gene.start + # bundle_e = gene.end + # bundle_chr = gene[0] + # # Write out all introns + # junc_count = write_introns(fw, intron_dict, junc_count) + # intron_dict = {} + + # elif bundle_chr == gene[0] and gene.start < bundle_e: + # # Extend the bundle + # bundle_e = gene.end + + # # Retrieve child features + # children = db.children(gene, level=1) + + # # Print child features + # for child in children: + # chrom = child[0] + # trans_id = child.id + # trans_s = child.start + # trans_e = child.end + # trans_2_intron_num[trans_id] = 0 + + # exons = db.children(child, featuretype='exon', order_by='start', level=1) + # prev_exon = "" + # for exon in exons: + # if (prev_exon != ""): + # key = f'{exon[0]}\t{str(prev_exon.end-1)}\t{str(exon.start)}\t{exon.strand}' + # trans_2_intron_num[trans_id] += 1 + # if key in intron_dict.keys(): + # intron_dict[key]["count"] += 1 + # intron_dict[key]["trans"].add(trans_id) + # else: + # intron_dict[key] = {} + # intron_dict[key]["count"] = 1 + # intron_dict[key]["trans"] = set([trans_id]) + # prev_exon = exon + + # # Write out all introns + # junc_count = write_introns(fw, intron_dict, junc_count) + # intron_dict = {} - json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w')) \ No newline at end of file + # json.dump(trans_2_intron_num, open(trans_intron_num_txt,'w')) \ No newline at end of file diff --git a/src/splam/header.py b/src/splam/header.py index a6aa632..c6ce52a 100644 --- a/src/splam/header.py +++ b/src/splam/header.py @@ -8,4 +8,4 @@ __license__ = "MIT" __maintainer__ = "developer" __status__ = "Kuan-Hao Chao" -__version__ = "1.0.9" \ No newline at end of file +__version__ = "1.0.10" \ No newline at end of file diff --git a/src/splam/main.py b/src/splam/main.py index 7603a3e..3b63aa3 100644 --- a/src/splam/main.py +++ b/src/splam/main.py @@ -31,6 +31,7 @@ def parse_args(args): parser_extract.add_argument('-V', '--verbose', action='store_true', help='running Splam in verbose mode.') # on/off flag + parser_extract.add_argument('-F', '--features', metavar='TYPES', default=None, help='list of feature types to extract introns') parser_extract.add_argument('-P', '--paired', action='store_true', help='bundling alignments in "paired-end" mode.') # on/off flag @@ -178,7 +179,7 @@ def main(argv=None): if not os.path.exists(outdir): os.makedirs(outdir, exist_ok=True) - extract_gff.extract_introns(input, gff_db, is_load_gff_db, junction_bed, trans_intron_num_txt) + extract_gff.extract_introns(input, gff_db, args.features,is_load_gff_db, junction_bed, trans_intron_num_txt) elif file_format == "BAM" or file_format == "bam": argv_extract = sys.argv diff --git a/test/script_annotation.sh b/test/script_annotation.sh index 727cdbc..3821cd4 100755 --- a/test/script_annotation.sh +++ b/test/script_annotation.sh @@ -1,5 +1,5 @@ # Step 1: extract introns in the annotation -splam extract refseq_110_GRCh38_chr_fixed.gff -o tmp_out_annotation +splam extract refseq_110_GRCh38_chr_fixed.gff -o tmp_out_annotation -F feature_gene.txt # Step 2: score introns in the annotation splam score -G chr9_subset.fa -m ../model/splam_script.pt -o tmp_out_annotation tmp_out_annotation/junction.bed diff --git a/test/script_annotation_lncRNA.sh b/test/script_annotation_lncRNA.sh new file mode 100755 index 0000000..10e1ca7 --- /dev/null +++ b/test/script_annotation_lncRNA.sh @@ -0,0 +1,8 @@ +# Step 1: extract introns in the annotation +splam extract lncs_and_exons.gff -o tmp_out_annotation_lncRNA -F feature_transcript.txt + +# Step 2: score introns in the annotation +splam score -G ~/Documents/Projects/ref_genome/homo_sapiens/NCBI_Refseq_chr_fixed/GCF_000001405.40_GRCh38.p14_genomic.fna -m ../model/splam_script.pt -o tmp_out_annotation_lncRNA tmp_out_annotation_lncRNA/junction.bed + +#Step 3: output statistics of each transcript +splam clean -o tmp_out_annotation_lncRNA -t 0.8