journal.py
import re
import shutil
import tarfile
from pathlib import Path
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
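
# Overview (inferred from the code below): this script searches PubMed Central (PMC)
# for a fixed keyword query ("NIR-II imaging"), collects PMC IDs from the result
# pages, and downloads each open-access article through the PMC OA web service,
# either as a single PDF or as a .tar.gz package extracted into a local folder
# named after the query.
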
def save_existing_url(doi):
    """Save a DOI string to doi.txt."""
    with open('doi.txt', 'w') as f:
        f.write(doi)


def download(response, url, size):
    """Write a downloaded PDF to disk, deriving the file name from the URL when possible."""
    filename = re.search(r'/([\w-]+\.pdf)$', url)
    if filename is None:
        name = f"{size}.pdf"
    else:
        name = filename.group(1)  # file name without the leading slash
    with open(name, 'wb') as f:
        f.write(response.content)


def get_all_documents():
    """Return the number of subdirectories in the working directory (one .txt glob list per directory)."""
    base = Path('.')
    subdirs = [x for x in base.iterdir() if x.is_dir()]
    txt_lists = [list(d.glob('*.txt')) for d in subdirs]
    return len(txt_lists)


def get_all_pmid(soup):
    """Extract PMC IDs from a PMC search-results page (the 'rprtid' definition lists)."""
    pmids = soup.find_all('dl', class_='rprtid')
    ids = []
    for pmid in pmids:
        dd = pmid.find('dd')
        if dd is not None:
            # Strip the leading "PMC" prefix, keeping only the numeric part.
            ids.append(dd.contents[0][3:])
    return ids


def save_pdfs(ftp_url, file_name, dest):
    """Stream a single PDF from an OA service link into the destination directory."""
    with urlopen(ftp_url) as resp, open(dest / file_name, 'wb') as f:
        shutil.copyfileobj(resp, f)


def save_tgz(ftp_url, dest):
    """Download a .tar.gz package from an OA service link and extract it into dest."""
    with urlopen(ftp_url) as resp:
        with tarfile.open(fileobj=resp, mode='r|gz') as tar:
            tar.extractall(path=dest)


def fetch_apis(id_list, dest_path):
    """Query the PMC Open Access (OA) web service for each PMC ID and download the article.

    The OA response contains <link> elements: a link with format="pdf" points at a
    single PDF; otherwise the href is a .tar.gz package.
    """
    base_api = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"
    for pmc_id in id_list:
        url = f"{base_api}?id=PMC{pmc_id}"
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        records = soup.find('records')
        if records is None:
            # Article is not in the open-access subset; skip it.
            continue
        for link in records.find_all('link'):
            if link.get('format') == 'pdf':
                save_pdfs(link.get('href'), f"PMC{pmc_id}.pdf", dest_path)
                break  # PDF found; move on to the next ID rather than leaving the function
            else:
                save_tgz(link.get('href'), dest_path)


def main():
    # search_input = input("Please enter search keywords, separated by space: ")
    # search_word = search_input.split()
    # size = input("Please enter the number of pages of articles (20 articles per page): ")
    search_word = ['NIR-II', 'imaging']
    query_string = '+'.join(search_word)
    path_string = '_'.join(search_word)

    base_url = "https://www.ncbi.nlm.nih.gov/pmc/"
    final_url = f"{base_url}?term={query_string}"

    # The first results page can be fetched with plain requests.
    req = requests.get(final_url)
    soup = BeautifulSoup(req.content, 'html.parser')
    id_curr_page = get_all_pmid(soup)

    dest = Path('.') / path_string
    if not dest.exists():
        dest.mkdir()
    fetch_apis(id_curr_page, dest)

    # Starting from the 2nd page, Selenium is needed instead of BeautifulSoup,
    # because pagination is driven by the page-number form field.
    driver_path = '/usr/local/bin/chromedriver'
    driver = webdriver.Chrome(service=Service(driver_path))
    driver.get(final_url)
    for page in range(2, 5):
        page_box = driver.find_element(By.ID, 'pageno')
        page_box.clear()
        page_box.send_keys(str(page))
        page_box.send_keys(Keys.ENTER)
        id_elements = driver.find_elements(By.XPATH, "//dl[@class='rprtid']")
        id_list = []
        for element in id_elements:
            # Take the first run of digits in the report-ID block.
            numbers = re.findall(r"[0-9]+", element.text)
            id_list.append(numbers[0])
        fetch_apis(id_list, dest)
    driver.quit()


if __name__ == "__main__":
    main()
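
# Example invocation (assumptions: Google Chrome plus a matching chromedriver at
# /usr/local/bin/chromedriver, Selenium 4, requests, and beautifulsoup4 installed):
#
#   python journal.py
#
# Downloads land in a ./NIR-II_imaging/ directory created next to the script.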