-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdeepmito.py
executable file
·151 lines (142 loc) · 6.38 KB
/
deepmito.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
import sys
import logging
import os
if 'DEEPMITO_ROOT' in os.environ:
sys.path.append(os.environ['DEEPMITO_ROOT'])
else:
logging.error("DEEPMITO_ROOT environment varible is not set")
logging.error("Please, set and export DEEPMITO_ROOT to point to deepmito root folder")
sys.exit(1)
import argparse
import numpy
from Bio import SeqIO
import deepmitolib.deepmitoconfig as cfg
from deepmitolib.cnn import MultiCNNWrapper
from deepmitolib.utils import encode, annotToText, check_sequence_pssm_match, printDate, write_gff_output, write_json_output, get_data_cache
from deepmitolib.workenv import TemporaryEnv
from deepmitolib.blast import runPsiBlast
def run_multifasta(ns):
multiModel = MultiCNNWrapper(cfg.MODELS)
annotation = {}
data_cache = get_data_cache(ns.cache_dir)
try:
for record in SeqIO.parse(ns.fasta, 'fasta'):
workEnv = TemporaryEnv()
prefix = record.id.replace("|","_")
fastaSeq = workEnv.createFile(prefix+".", ".fasta")
printDate("Processing sequence %s" % record.id)
SeqIO.write([record], fastaSeq, 'fasta')
printDate("Running PSIBLAST")
pssmFile = runPsiBlast(prefix, ns.dbfile, fastaSeq, workEnv, data_cache=data_cache,
num_alignments=ns.pbnalign, num_iterations=ns.pbniter, evalue=ns.pbeval,
threads=ns.threads)
printDate("Predicting sumbitochondrial localization")
acc, X = encode(fastaSeq, cfg.AAIDX10, pssmFile)
pred = multiModel.predict(X)
cc = numpy.argmax(pred)
score = round(numpy.max(pred),2)
annotation[record.id] = {'sequence': str(record.seq), 'loc': cc, 'score': score}
workEnv.destroy()
#annotToText(annotation, ns.outf)
ofs = open(ns.outf, 'w')
if ns.outfmt == "gff3":
write_gff_output(annotation, ofs)
else:
write_json_output(annotation, ofs)
ofs.close()
except:
logging.exception("Errors occurred:")
sys.exit(1)
else:
sys.exit(0)
def run_pssm(ns):
workEnv = TemporaryEnv()
multiModel = MultiCNNWrapper(cfg.MODELS)
annotation = {}
try:
record = SeqIO.read(ns.fasta, "fasta")
except:
logging.exception("Error reading FASTA: file is not FASTA or more than one sequence is present")
sys.exit(1)
else:
printDate("Processing sequence %s" % record.id)
printDate("Using user-provided PSSM file, skipping PSIBLAST")
pssmFile = ns.psiblast_pssm
try:
check_sequence_pssm_match(str(record.seq), pssmFile)
except:
logging.exception("Error in PSSM: sequence and provided PSSM do not match.")
sys.exit(1)
else:
try:
prefix = record.id.replace("|","_")
fastaSeq = workEnv.createFile(prefix+".", ".fasta")
SeqIO.write([record], fastaSeq, 'fasta')
printDate("Predicting sumbitochondrial localization")
acc, X = encode(fastaSeq, cfg.AAIDX10, pssmFile)
pred = multiModel.predict(X)
cc = numpy.argmax(pred)
score = round(numpy.max(pred),2)
annotation[record.id] = {'sequence': str(record.seq), 'loc': cc, 'score': score}
ofs = open(ns.outf, 'w')
if ns.outfmt == "gff3":
write_gff_output(annotation, ofs)
else:
write_json_output(annotation, ofs)
ofs.close()
except:
logging.exception("Errors occurred:")
sys.exit(1)
else:
workEnv.destroy()
sys.exit(0)
def main():
DESC = "DeepMito: Predictor of protein submitochondrial localization"
parser = argparse.ArgumentParser(description=DESC, prog="deepmito.py")
subparsers = parser.add_subparsers(title = "subcommands",
description = "valid subcommands",
help = "additional help")
multifasta = subparsers.add_parser("multi-fasta",
help = "Multi-FASTA input module",
description = "DeepMito: Multi-FASTA input module.")
pssm = subparsers.add_parser("pssm", help = "PSSM input module (one sequence at a time)",
description = "DeepMito: PSSM input module.")
multifasta.add_argument("-f", "--fasta",
help = "The input multi-FASTA file name",
dest = "fasta", required = True)
multifasta.add_argument("-d", "--dbfile",
help = "The PSIBLAST DB file",
dest = "dbfile", required= True)
multifasta.add_argument("-o", "--outf",
help = "The output file",
dest = "outf", required = True)
multifasta.add_argument("-m", "--outfmt",
help = "The output format: json or gff3 (default)",
choices=['json', 'gff3'], required = False, default = "gff3")
multifasta.add_argument("-c", "--cache-dir", help="Cache dir for alignemnts", dest="cache_dir", required=False, default=None)
multifasta.add_argument("-a", "--threads", help="Number of threads (default 1)", dest="threads", required=False, default=1, type=int)
multifasta.add_argument("-j", "--psiblast-iter", help="Number of PSIBLAST iterations (default 3)", dest="pbniter", required=False, default=3, type=int)
multifasta.add_argument("-n", "--psiblast-nalign", help="PSIBLAST num_alignments parameter (default 5000)", dest="pbnalign", required=False, default=5000, type=int)
multifasta.add_argument("-e", "--psiblast-evalue", help="PSIBLAST evalue parameter (default 0.001)", dest="pbeval", required=False, default=0.001, type=float)
multifasta.set_defaults(func=run_multifasta)
pssm.add_argument("-f", "--fasta",
help = "The input FASTA file name (one sequence)",
dest = "fasta", required = True)
pssm.add_argument("-p", "--pssm",
help = "The PSIBLAST PSSM file",
dest = "psiblast_pssm", required= True)
pssm.add_argument("-o", "--outf",
help = "The output file",
dest = "outf", required = True)
pssm.add_argument("-m", "--outfmt",
help = "The output format: json or gff3 (default)",
choices=['json', 'gff3'], required = False, default = "gff3")
pssm.set_defaults(func=run_pssm)
if len(sys.argv) == 1:
parser.print_help()
else:
ns = parser.parse_args()
ns.func(ns)
if __name__ == "__main__":
main()