jameswebb_archiver.py
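
"""Archive James Webb Space Telescope NIRCam observation images from webbtelescope.org.

For each image in the gallery listing, the script downloads the full-resolution TIF
file and writes a companion .txt file containing the release date, the
"About This Image" description, and the facts table from the image page.

Requires the third-party packages `requests` and `beautifulsoup4`.
"""
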
import os
import re
import html
import argparse

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://webbtelescope.org"
# Gallery listing filtered to NIRCam observation images.
CATEGORY_URL = "https://webbtelescope.org/resource-gallery/images?itemsPerPage=15&Type=Observations&keyword=NIRCam&"
# Captures the "About This Image" body, up to the instrument-credit paragraph.
ABOUT_IMAGE_REGEX = r'<h4>About This Image</h4>\s+(.+)<p><em>(NIRCam was built|MIRI was contributed).+</p>\s+<footer>'
CHUNK_SIZE = 4096


def remove_html_tags(text):
    """Strip HTML tags from a string, keeping only the text."""
    clean = re.compile('<[^<]+?>')
    return re.sub(clean, '', text)


def parse_facts_table(soup):
    """Parse the facts table into a flat list of section headers and [label, value] pairs."""
    data = []
    table = soup.find('table')
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        header = row.find('th')
        if header:
            data.append(header.text)
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        if len(cols) < 2 or len(cols[1]) == 0:
            continue
        data.append([ele for ele in cols if ele])  # Drop empty values
    return data


def download_file(url, filename):
    """Stream a remote file to disk, preserving the original file extension."""
    file_extension = url.split('/')[-1].split('.')[-1]
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(f"{filename}.{file_extension}", 'wb') as f:
            for chunk in r.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)
    return filename


def save_facts_to_file(data, filename, release_date, about_image):
    """Write the release date, description, and facts table to <filename>.txt."""
    with open(f'{filename}.txt', 'w') as fp:
        fp.write(f'{release_date}\n\n')
        fp.write(f'About this image:\n{about_image}\n')
        for fact in data:
            if isinstance(fact, str):
                # Section header from the facts table
                fp.write(f"\n{fact}\n{'-'*20}\n")
            else:
                fp.write(f"{fact[0]}: {fact[1]}\n")


def get_one_image(url):
    """Download one image's full-resolution TIF and save its metadata alongside it."""
    res = requests.get(f"{BASE_URL}{url}")
    soup = BeautifulSoup(res.text, 'html.parser')
    title_tag = soup.find("meta", property="og:title")
    about_image_match = re.search(ABOUT_IMAGE_REGEX, res.text, re.DOTALL)
    if not title_tag:
        print(f"[-] No title found: {BASE_URL}{url}")
        return
    title = html.unescape(title_tag["content"])
    # Build a filesystem-friendly name; also strip non-breaking spaces that appear in some titles.
    filename = title.replace(' ', '_').replace('\xa0', '')
    if os.path.exists(f"{filename}.txt"):
        # Already archived, skip.
        return
    print(f"[*] Downloading image: {title}")
    resource_gallery = soup.find('div', attrs={'class': 'resource-gallery-detail'})
    release_date_tag = resource_gallery.find("strong", text="Release Date:")
    if release_date_tag:
        release_date = release_date_tag.parent.text
    else:
        release_date = ""
    if about_image_match:
        about_image = about_image_match.group(1)
        about_image = remove_html_tags(about_image).replace('\n', '\n\n').replace('\xa0', '')
    else:
        about_image = ""
    links_list = resource_gallery.find("div", attrs={'class': 'media-library-links-list'}).find_all("a")
    for link in links_list:
        # Only download the full-resolution TIF version.
        if 'Full Res' not in link.text or 'TIF' not in link.text:
            continue
        download_url = f"https:{link['href']}"
        download_file(download_url, filename)
    facts = parse_facts_table(soup)
    save_facts_to_file(facts, filename, release_date, about_image)


def run():
    """Fetch the NIRCam observations gallery page and archive every listed image."""
    print("[*] Getting observations webpage...")
    res = requests.get(CATEGORY_URL)
    print("[*] Got observations! Processing results.")
    soup = BeautifulSoup(res.text, 'html.parser')
    images = soup.find_all('div', attrs={'class': "ad-research-box"})
    for image in images:
        get_one_image(image.find('a')['href'])
    print("[*] Done.")


def main():
    parser = argparse.ArgumentParser(
        description="Archives James Webb Space Telescope Near Infrared Camera (NIRCam) images, including metadata.")
    parser.add_argument('-o', '--output', help="Folder to save images and metadata into (created if it does not exist).")
    args = parser.parse_args()
    if args.output is not None:
        print(f"[*] Using output folder: {args.output}")
        if not os.path.exists(args.output):
            print("[*] Folder does not exist. Creating it...")
            os.makedirs(args.output)
        os.chdir(args.output)
    run()


if __name__ == "__main__":
    main()
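
# Example usage (assumes requests and beautifulsoup4 are installed; "webb_images" is just an example folder name):
#   python jameswebb_archiver.py                  # archive into the current directory
#   python jameswebb_archiver.py -o webb_images   # archive into ./webb_images, creating it if missing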