_desktop_pull_data_Entrez_OPTIMIZED_AGG.py
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import pandas as pd
import numpy as np
import datetime
import json
import sys
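# -- -- -- -- -- -- -- -- -- -- Overview -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
# This script crawls the NCBI Entrez E-utilities starting from a seed PubMed publication:
#   1. esummary.fcgi (or esearch.fcgi for a title search) retrieves metadata for the seed ID.
#   2. elink.fcgi retrieves the IDs of referenced papers (backward) and citing papers (forward).
#   3. esummary.fcgi is called again, in batches, to pull the author lists of those linked papers.
# Everything is aggregated into a pandas dataframe and written to a CSV plus an info text file.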
# Spider that drives the Entrez crawl
class ChamberSpider(scrapy.Spider):
    DOWNLOAD_DELAY = 1
    name = "chamber"
    start_urls = ["http://cm.pschamber.com/list/"]
    layer_to_stop = 1

    # First callback: parse the esummary response for the seed publication
    def parse(self, response):
        global search_id
        global _pub_year
        global _pub_id
        global _pub_name
        global _pub_doi
        global _pub_la
        global _pub_fa
        global _pub_authors
        global alert_name
        global _update_new_id
        artinfo = json.loads(response.text)
        result_set = artinfo["result"]
        found_pub_date = False
        for uid in result_set:
            # Only the entry whose UID matches the requested search ID is processed
            if uid == str(search_id):
                _update_new_id = search_id
                _pub_id = _update_new_id
                for above_element in result_set[str(uid)]:
                    if above_element == "authors":
                        for author_element in result_set[str(uid)]["authors"]:
                            df_author = author_element["name"]
                            # Add an entry to the dataframe for each author of the seed publication
                            add_data(_update_new_id, df_author, 0, 0, 0, 0, 'title')
                            _pub_authors.append(df_author)
                    elif above_element == "pubdate":
                        found_pub_date = True
                        _pub_year = result_set[str(uid)]["pubdate"].split()[0]
                    elif not found_pub_date and above_element == "epubdate":
                        _pub_year = result_set[str(uid)]["epubdate"].split()[0]
                    elif above_element == "title":
                        _pub_name = result_set[str(uid)]["title"]
                    elif above_element == "articleids":
                        for item in result_set[str(uid)]["articleids"]:
                            if item['idtype'] == "doi":
                                _pub_doi = item['value']
                                break
                    elif above_element == "sortfirstauthor":
                        _pub_fa = result_set[str(uid)]["sortfirstauthor"]
                    elif above_element == "lastauthor":
                        _pub_la = result_set[str(uid)]["lastauthor"]
                # Follow up with an elink request for the seed publication's references and citations
                references_link = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id=" + str(_update_new_id) + "&retmode=json"
                print(references_link)
                yield scrapy.Request(references_link,
                                     callback=self.parse_references,
                                     meta={
                                         'id': _update_new_id,
                                         'layer': 1,
                                         'fb': 0,
                                     })
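    # Request meta fields used by the callbacks below:
    #   'layer'  - crawl depth, incremented each time links are followed
    #   'parent' - ID of the publication the links were followed from (currently passed as 0)
    #   'fb'     - link direction: -1 for backward references, +1 for forward citations, 0 for the seed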
    def parse_base_article_string(self, response):
        # Parse the JSON response from the batched esummary request
        pp_refs = json.loads(response.text)
        if "result" in pp_refs:
            result_set = pp_refs["result"]
            # Record the authors of each returned UID
            for uid in result_set:
                # Skip the "uids" index entry; the remaining keys are individual article IDs
                if uid != "uids":
                    df_uid = uid
                    df_date = "0"
                    print("uid", uid)
                    # Extract all authors
                    for above_element in result_set[str(uid)]:
                        if above_element == "authors":
                            for author_element in result_set[str(uid)]["authors"]:
                                df_author = author_element["name"]
                                # Add an entry to the dataframe for each author
                                if response.meta["layer"] == 0:
                                    add_data(df_uid, df_author, response.meta["layer"], df_date, df_uid, response.meta["fb"], result_set[str(uid)]["title"])
                                else:
                                    add_data(df_uid, df_author, response.meta["layer"], df_date, response.meta["parent"], response.meta["fb"], result_set[str(uid)]["title"])
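    # Abridged sketch of the esummary JSON consumed by parse() and parse_base_article_string()
    # (only the fields that are read here are shown):
    # {
    #   "result": {
    #     "uids": ["9862982"],
    #     "9862982": {
    #       "pubdate": "...", "epubdate": "...",
    #       "title": "...", "sortfirstauthor": "...", "lastauthor": "...",
    #       "authors": [{"name": "..."}, ...],
    #       "articleids": [{"idtype": "doi", "value": "..."}, ...]
    #     }
    #   }
    # }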
    def parse_references(self, response):
        # Parse the JSON elink response
        pp_refs = json.loads(response.text)
        linkset = pp_refs["linksets"]
        link_dict = {}
        found_refs = False
        found_citedin = False
        # Extract publication IDs for the references (papers cited by this publication)
        for linksetdb in linkset:
            linksetdbs = linksetdb["linksetdbs"]
            for item in linksetdbs:
                if item["linkname"] == "pubmed_pubmed_refs" or item["linkname"] == "pubmed_pubmed":  # Backwards
                    print(linksetdb)
                    pubmed_refs = item["links"]
                    found_refs = True
                    for link in pubmed_refs:
                        print(link)
                        link_dict[link] = -1
                    break
        # Extract publication IDs for the cited-in field (papers that cite this publication)
        for linksetdb in linkset:
            linksetdbs = linksetdb["linksetdbs"]
            for item in linksetdbs:
                print(item["linkname"])
                if item["linkname"] == "pubmed_pubmed_citedin":  # Forwards
                    found_citedin = True
                    pubmed_citedin = item["links"]
                    for link in pubmed_citedin:
                        link_dict[link] = 1
                    break
        keys = link_dict.keys()
        split = 20
        key_string_forwards = {0: ""}
        key_string_backwards = {0: ""}
        counter_forwards = 0
        counter_backwards = 0
        print("keydict", keys)
        for key in keys:
            print(link_dict[key] == 1)
            if link_dict[key] == -1:
                if len(key_string_backwards[counter_backwards] + str(key) + ",") + 100 >= 1024:
                    counter_backwards += 1
                    key_string_backwards[counter_backwards] = ""
                key_string_backwards[counter_backwards] += str(key) + ","
            elif link_dict[key] == 1:
                if len(key_string_forwards[counter_forwards] + str(key) + ",") + 100 >= 1024:
                    counter_forwards += 1
                    key_string_forwards[counter_forwards] = ""
                key_string_forwards[counter_forwards] += str(key) + ","
        print("ksb", key_string_backwards)
        print("ksf", key_string_forwards)
        # Request summaries for the backward (reference) IDs, one batch per accumulated string
        if len(key_string_backwards[0]) > 4:
            for keyb in key_string_backwards.keys():
                yield scrapy.Request(
                    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=" + key_string_backwards[keyb] + "&retmode=json",
                    callback=self.parse_base_article_string,
                    meta={
                        'layer': response.meta["layer"] + 1,
                        'parent': 0,  # response.meta["id"],
                        'fb': -1,
                    }
                )
        # Request summaries for the forward (cited-in) IDs
        if len(key_string_forwards[0]) > 4:
            for keyf in key_string_forwards.keys():
                print("keyf", keyf)
                yield scrapy.Request(
                    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=" + key_string_forwards[keyf] + "&retmode=json",
                    callback=self.parse_base_article_string,
                    meta={
                        'layer': response.meta["layer"] + 1,
                        'parent': 0,  # response.meta["id"],
                        'fb': 1,
                    }
                )
# -- -- -- -- -- -- -- -- -- -- Helper methods for data handling -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
# Tracks the IDs that were dropped due to a lack of available citation data
dropped_rows = list()

# Add a row to the overall dataframe
def add_data(id, author, layer, date, parent, f_b, tti):
    author = author.lower()
    # Create a temporary one-row dataframe holding the function's inputs
    temp_df = pd.DataFrame(data=[date, id, author, layer, parent, f_b, tti]).transpose()
    temp_df.columns = ['date', 'id', 'name', 'layer', 'parent', 'fb', 'title']
    # Append the temporary dataframe to the global dataframe "gdf"
    # (DataFrame.append was removed in pandas 2.0, so pd.concat is used instead)
    global gdf
    gdf = pd.concat([gdf, temp_df])
    # Update the user on progress
    # print("Added new data point [" + str(id) + "; layer = " + str(layer) + "]")

# Drop a given ID's rows from the dataframe
def drop_rows(id):
    dropped_rows.append(id)
    global gdf
    gdf = gdf[gdf.id != id]

# Format a search term (replaces all spaces with "%20"), e.g. "machine learning" -> "machine%20learning"
def format_search(word):
    return word.replace(" ", "%20")

# Return True if the string starts with a decimal digit
def represents_int(s):
    for i in range(10):
        if s.startswith(str(i)):
            return True
    return False
# -- -- -- -- -- -- -- -- -- -- Script initiation code -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
first_arg = sys.argv[1]    # seed PubMed ID or title search terms, e.g. "9862982"
second_arg = sys.argv[2]   # e.g. "100"
third_arg = sys.argv[3]    # "True" if the first argument is a title search string, e.g. "False"
output_name = sys.argv[4]  # label used in the output file names for title searches, e.g. "Me"
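# Example invocations (a sketch; "9862982", "100", "False", and "Me" are the sample values noted above):
#   python _desktop_pull_data_Entrez_OPTIMIZED_AGG.py 9862982 100 False Me
#   python _desktop_pull_data_Entrez_OPTIMIZED_AGG.py some_search_terms 100 True Me
# In the second form, underscores in the search term are replaced with "+" before querying esearch.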
search_id = first_arg
to_id = second_arg
represents_string = "True" == third_arg
alert_name = False
alert_name_permanent = False
_update_new_id = "0"
if represents_string:
    search_id = search_id.replace("_", "+")
    alert_name = True
    alert_name_permanent = True
_pub_name = "::name not found::"
_pub_year = "::year not found::"
_pub_doi = "::doi not found::"
_pub_la = "::la not found::"
_pub_fa = "::fa not found::"
_pub_id = "::id::"
_pub_authors = list()
# Global dataframe which contains all data retrieved on parent and child publications
gdf = pd.DataFrame(columns=['date', 'id', 'name', 'layer', 'parent', 'fb', 'title'])
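# Column meanings: 'date' is a placeholder ("0"), 'id' is the PubMed ID of the row's article, 'name' is a
# lowercased author name, 'layer' is crawl depth, 'parent' is the linking/seed ID, 'fb' is the link
# direction (-1 reference, +1 cited-in, 0 seed), and 'title' is the article title.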
# Create a spider to perform the web crawling process
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=" + str(search_id) + "&retmode=json"
if alert_name:
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=" + search_id + "[title]&retmax=100&retmode=json"
ChamberSpider.start_urls = [ url ]
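# The crawl is seeded with an esummary request for a numeric PubMed ID,
# or with an esearch request when searching by title.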
process.crawl(ChamberSpider)
x = process.start()
if alert_name_permanent:
    search_id = output_name
# Output the contents of the dataframe into a csv file
gdf.to_csv(path_or_buf="/Users/mkorovkin/Desktop/citations_" + str(search_id) + ".csv", encoding='utf-8')
# Notify the user once data collection is finished
print("Finished data collection on the publication history of ID \"" + str(search_id) + "\"")
print("Dropped " + str(len(dropped_rows)) + " ids.")
f = open("/Users/mkorovkin/Desktop/citations_" + str(search_id) + "_info.txt","w+")
f.write("name:" + _pub_name + "\n")
f.write("year:" + _pub_year + "\n")
f.write("doi:" + _pub_doi + "\n")
f.write("id:" + _pub_id + "\n")
f.write("fa:" + _pub_fa + "\n")
f.write("la:" + _pub_la + "\n")
for _author in _pub_authors:
    f.write("gauthor:" + _author + "\n")
f.close()