forked from Oshlack/Lace
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMobius.py
executable file
·116 lines (92 loc) · 5.7 KB
/
Mobius.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#Author: Anthony Hawkins
#A little script to construct an annotation from an SJ.out.tab file, a standard STAR output.
# This creates the dynamic block annotation
import argparse
import pandas as pd
import os, sys
def Mobius(sjfile,gfile,ft,flat_ann,read_thresh):
#First read in the SJ.out.tab as sys.argv[1]
sj = pd.read_csv(sjfile,sep='\t',header=None,names=['Gene','Start','End','strand','intron motif','Annotated','Unique','Multi','Overhang'])
#Read in SuperTranscript fasta file
sf = open(sys.argv[2],'r')
glength = {} #A dictionary holding the SuperTranscript gene lengths
gene=''
for line in sf:
if('>' in line):
gene= (line.split()[0]).split('>')[1]
glength[gene] = ''
else:
glength[gene] = glength[gene] + line.split('\n')[0].split('\r')[0]
#Create gtf file
gtf = open('Spliced.gtf','w')
slist = {}
#Make a dictionary for each gene, which holds a list of splice junction start and end points
#For each row
for i in range(0,len(sj['Gene'])):
curr_gene = sj.iloc[i,0]
if(curr_gene not in slist.keys()): slist[curr_gene] = [1]
if((sj.iloc[i,6] + sj.iloc[i,7]) > read_thresh): #More than 5 reads (either unique or multi spanning junction)
slist[curr_gene].append(int(sj.iloc[i,1])) #This is actually the intron start part (i.e one base over)
slist[curr_gene].append(int(sj.iloc[i,2])+1) #This is the end of the exon-exon junction, so need to add one for the actual exon start place
#If forcing transcript start and ends
if(ft):
#Make a dictionary with the transcript list per gene
igtf = pd.read_csv(flat_ann,sep='\t',header=None,names=['Gene','Source','Type','Start','End','v1','v2','v3','v4'],skiprows=2)
prev_gene =''
prev_trans=''
for i in range(0,len(igtf)):
curr_gene = igtf.iloc[i,0]
curr_trans = igtf.iloc[i,8].split(';')[1].split('"')[1]
#If gene not already in spliced dict, then make a key with an empty list value
if (curr_gene not in slist.keys()): slist[curr_gene]=[]
#Switch transcripts in gene
if(curr_trans != prev_trans and curr_gene==prev_gene): #Just switched from one transcript in gene to another one in same gene
slist[curr_gene].append(prev_end) #End of previous transcript
slist[curr_gene].append(igtf.iloc[i,3]) #Start of new transcript
#Switch gene
if(curr_gene != prev_gene): #Beginning a whole new gene
slist[curr_gene].append(igtf.iloc[i,3]) #Start site of first transcript in new gene
if(i != 0): slist[prev_gene].append(prev_end) #Make sure add in the end point of the last transcript on the last gene
prev_gene = curr_gene
prev_trans = curr_trans
prev_end = igtf.iloc[i,4]
#Now sort each list for each gene, only keep unique elements
for key in glength.keys():
exon_counter = 1
if(key in slist.keys()):
slist[key] = list(set(slist[key]))
slist[key].sort()
#Now for each coord in list make a block
for i in range(1,len(slist[key])):
ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(slist[key][i-1]) + '\t' + str(slist[key][i]-1) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + 'gene_id "' +str(key) +'"; transcript_id "' + str(key) + '"; exon_number ' + str(exon_counter) + '; exon_id "' + str(key)+':'+str(exon_counter)+ '"' + '\n' #Note: need to minus one off end to account for the fact that the exon ends before the exon-exon boundary exists
exon_counter +=1
gtf.write(ann)
#For the list splice junnction, we need to make a block from the last sj to the end of the ST
if(key not in slist): last = 1
else: last = slist[key][-1]
#If not forcing transcript start and ends
if(not ft):
if(last != len(glength[key])):
ann = str(key) + '\t' + 'SuperTranscript' + '\t' + 'exon' + '\t' + str(last) + '\t' + str(len(glength[key])) + '\t' + '.' + '\t' + '.' + '\t' + '0' + '\t' + 'gene_id "' +str(key) +'"; transcript_id "' + str(key) + '"; exon_number ' + str(exon_counter) + '; exon_id "' + str(key)+':'+str(exon_counter)+ '"' + '\n'
gtf.write(ann)
if __name__ == '__main__':
#ASCII art
print(" ___ ___ ___ ____ ____ __ __ _____")
print("| | | / \ | \ | || | |/ __/")
print("| _ _ || || o ) | | | | ( \_ ")
print("| \_/ || o || | | | | | |\__ |")
print("| | || || O | | | | : |/ \ |")
print("| | || || | | | | |\ |")
print("|_______| \___/ |_____||____| \__,_| \___|")
#Make argument parser
parser = argparse.ArgumentParser()
#Add Arguments
parser.add_argument("SpliceJunctions",help="The name of the Splice Junctions tab file (in the format of the one STAR produces)")
parser.add_argument("GenomeFasta",help="A fasta file containing the sequence for all genes in genome")
parser.add_argument("-forceTrans","-ft",help="Force blocks where annotated transcripts start and end",action='store_true')
parser.add_argument("-AnnoTrans","-a",help="Flattened SuperTranscript annotation file",default="")
parser.add_argument("-readThresh","-reads",help="The minimum number of reads in all combined samples required to support a splice junction (default=5)",default=5)
args= parser.parse_args()
print('Constructing Dynamic Blocks')
Mobius(args.SpliceJunctions,args.GenomeFasta,args.forceTrans,args.AnnoTrans,args.readThresh)
print('Done')