-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_genbank_file4
executable file
·203 lines (158 loc) · 4.62 KB
/
get_genbank_file4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/usr/bin/env python
"""
Description:
AccNum.txt is a file of accession numbers
Usage:
GenbankDownload pathToThegenomeFasta file
"""
#********************* MODULES**********************
from __future__ import division
from __future__ import print_function
import sys
import os
import os.path
import sys
import re #regular expressions
import time
# Use the correct module based on the python version.
if sys.version_info[0] == 2:
import httplib as httplib
else: # More modern python
import http.client as httplib
# XML parsing used below is common to 2.7 and 3.x
import xml.etree.ElementTree as ET
# Entrez Utils global variable (hostname for NIH EUtils)
eutilsHost = "eutils.ncbi.nlm.nih.gov"
#******************* FUNCTION DEFINITIONS *******************
def get(hostname, uri):
GET="GET"
connection = httplib.HTTPSConnection(hostname)
connection.request(GET, uri)
response = connection.getresponse()
data = response.read()
connection.close()
return (response.status, response.reason, data)
def EFetch(acc, retmode="text", rettype="fasta", db="nuccore",
email="[email protected]"):
efetchPrefix = "/entrez/eutils/efetch.fcgi?"
argSeparator = "&"
arguments={
"tool": "kSNP4",
"email": email,
"db": db,
"rettype": rettype,
"retmode": retmode,
"id": acc,
}
argumentsList = [ "=".join(item) for item in arguments.items()]
argumentsString = argSeparator.join(argumentsList)
URI = efetchPrefix + argumentsString
# Sleep for 1/3 second; enforce a rate limit of 3 requests per second
time.sleep(0.34)
(status, reason, fetched) = get(eutilsHost,URI)
if status != 200:
sys.stderr.write("HTTP Error %d: %s\n" % (status, reason))
sys.exit(1)
return fetched
def ESearch(searchTerm, db="nuccore", email="[email protected]", retmax=100):
idListParent='IdList'
idListChildren='Id'
efetchPrefix = "/entrez/eutils/esearch.fcgi?"
argSeparator = "&"
arguments={
"tool": "kSNP4",
"email": email,
"db": db,
"rettype": 'uilist',
"retmode": 'xml',
"term": searchTerm,
"idtype": "acc",
"retmax": str(retmax),
}
URI = efetchPrefix + argSeparator.join([
"=".join(item) for item in arguments.items()
])
# Sleep for 1/3 of a second here to enforce a rate limit of 3
time.sleep(0.34)
(status, reason, searchData) = get(eutilsHost,URI)
if status != 200:
sys.stderr.write("HTTP Error %d: %s\n" % (status, reason))
sys.exit(1)
searchRoot = ET.fromstring(searchData)
elementsWithIds = searchRoot.find(idListParent).findall(idListChildren)
ids = [ element.text for element in elementsWithIds ]
return ids
#******************* FUNCTION DEFINITIONS *******************
def extractAccNum(line):
separator = '|'
prefix = '>'
genbank = 'gb'
refseq = 'ref'
# Fails for text: 'gi|1002344102|gb|LPTZ01001107.1|' output as accnum
temp = line.split()
ident = temp[0]
ident = ident.lstrip(prefix)
if separator not in ident:
# This is a basic, accession-number-only identifier
return(ident)
else:
# This is an NCBI standard identifier per
# https://en.wikipedia.org/wiki/FASTA_format
# so we assume it has a 'gb' reference.
idParts = ident.split(separator)
if genbank in idParts:
gbAcc = idParts.index(genbank) + 1
return idParts[gbAcc]
elif refseq in idParts:
refAcc = idParts.index(refseq) + 1
return idParts[refAcc]
else:
return None
#********************** Main *************************
#variables
fileExist = ''
pathToFastaGenome = sys.argv[1]
accnum = ''
GenomeFile = ''
seqData = False # et to True iuf line is sequence data
AccNumList = []
startTime = time.time()
#extract the accession number from the sequence file header
INFILE = open(pathToFastaGenome, 'r')
for line in INFILE:
if line.startswith('>'):
AccNumList.append(extractAccNum(line))
print(AccNumList[-1])
INFILE.close()
OUTFILE = open('genbank_from_NCBI.gbk', 'ab',)
for accnum in AccNumList: #for each accession number
#find the NCBI ID for the accnum
EntrezEmail = '[email protected]'
IDs = ESearch(accnum, db='nuccore', email=EntrezEmail, retmax=100)
print("Found %d entries"%(len(IDs)))
if len(IDs) > 0:
ID = str(IDs[0])
#print("The ID is ", ID)
try:
text = EFetch(db="nuccore", acc=accnum,
rettype="gb", retmode="text")
print("Fetched %s"%(accnum))
except:
continue
temp = text.split(b'\n')
for j in range(len(temp)):
if temp[j].startswith(b"ORIGIN"):
OUTFILE.write(temp[j])
OUTFILE.write(b'\n')
OUTFILE.write(b'//\n\n')
break
else:
# download the accnum file
OUTFILE.write(temp[j])
OUTFILE.write(b'\n')
OUTFILE.write(b'//\n\n')
time.sleep(1)
OUTFILE.close()
endTime = time.time()
elapsedTime = endTime-startTime
print("Used ",elapsedTime, "seconds.")