-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFindHomologs.py
134 lines (98 loc) · 5.24 KB
/
FindHomologs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
## Command line program with input usage: "python FindHomologs.py refseq-file blast-file output-file"
## Program parses a file containing specific genes of interest within a given reference genome.
## After performing a BLAST analysis on the reference genome and the query genome, the resulting file is
## parsed for the genes of interest which are reported in a tab delimited format.
import sys
import re
class BlastHomologs(object):
def __init__(self, reference_file, blast_file, output_file):
self.reference_file = reference_file
self.blast_file = blast_file
self.output_file = output_file
self.refseq = []
def find_interest_genes(self):
# parse through file containing the wanted genes and appends them
# to a list called refseq
rfh = open(self.reference_file)
for line in rfh:
line = line.strip()
self.refseq.append(line)
rfh.close()
def find_homologs(self):
bfh = open(blast_file)
ofh = open(output, 'w')
header = "Mouse_Refseq\tHuman_Refseq\tQuery_Length\tSearch_Length\tAlignment\tMatch\tBit_Score\tE_Score"
print >>ofh, header
#RegEx patterns for items of interest
Signif_search = re.compile("^ref\|(.+)\|")
M_id_search = re.compile("Query=.+ref\|(.+)\|")
H_id_search = re.compile(">ref\|(.+)\|")
M_len_search = re.compile("\((\d+)\s*letters\)")
H_len_search = re.compile("Length\s*=\s*(\d+)")
Score_search = re.compile("Score\s*=\s*(.+)\s*bits.+Expect\s*=\s*(.+)")
Align_match_search = re.compile("Identities\s*=\s*(\d+)\/(\d+)")
data = m_name = h_name = m_len = h_len = e_val = bit_score = align = match = None
signif_align = [] # keeps a list of the significant alignments per gene
for line in bfh:
line = line.strip()
if line.startswith("Query="): # start of a new mouse gene end of the previous gene
id_match = M_id_search.search(line)
if match: # prints the previous gene
data = data = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (m_name, h_name, m_len, h_len, align, match, bit_score, e_val)
print >>ofh, data
data = m_name = h_name = m_len = h_len = e_val = bit_score = align = match = None
signif_align = []
if id_match:
option = id_match.group(1)
# check to see if mouse gene is a gene of interest
if option in self.refseq:
m_name = option
elif m_name:
if "letters" in line:
len_match = M_len_search.search(line)
if len_match:
m_len = len_match.group(1)
elif line.startswith("ref"):
significant_match = Signif_search.search(line)
if significant_match: # add all significant alignments to a list to reference later
option = significant_match.group(1)
signif_align.append(option)
elif line.startswith(">"): # looks for the start of a new alignment
id_match = H_id_search.search(line)
if match: # print previous result
data = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (m_name, h_name, m_len, h_len, align, match, bit_score, e_val)
print >>ofh, data
data = h_name = h_len = e_val = bit_score = align = match = None
if id_match:
option = id_match.group(1)
if option in signif_align:
h_name = option
elif h_name: # checks to see if significant alignment gene and procedes with >ref
if "Length" in line:
len_match = H_len_search.search(line)
if len_match:
h_len = len_match.group(1)
elif "Score" in line:
score_match = Score_search.search(line)
if score_match:
score = float(score_match.group(1))
# if no bit score has been logged set current to the bit score
# for each alignment after, only procede if bit score is higher than current bit score
if not bit_score or score > bit_score:
bit_score = score
e_val = score_match.group(2)
elif e_val and "Identities" in line:
align_match = Align_match_search.search(line)
if align_match:
align = align_match.group(2)
match = align_match.group(1)
bfh.close()
ofh.close()
if ofh:
print "File parse complete. See results in %s" % self.output_file
if len(sys.argv) < 4:
print "USAGE: python %s refseq-file blast-file output-file" % (sys.argv[0])
sys.exit()
refseq_file = sys.argv[1]
blast_file = sys.argv[2]
output = sys.argv[3]