-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_script.py
73 lines (62 loc) · 2.61 KB
/
load_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import json
import time
import os
import sys
import requests
def _load_saved_pages(file_path):
    """Return the page payloads already saved at *file_path* ([] if none)."""
    try:
        with open(file_path, "r") as file:
            return json.load(file)
    except FileNotFoundError:
        return []


def _save_pages(file_path, pages):
    """Atomically write *pages* to *file_path* as JSON.

    The data goes to a sibling temporary file that is then moved into place,
    so an interrupted run never leaves a truncated checkpoint that would
    break the next resume.
    """
    tmp_path = f"{file_path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(pages, f)
    os.replace(tmp_path, file_path)


def fetch_digital_commonwealth():
    """Download Digital Commonwealth search result pages into a JSON file.

    Reads two command-line arguments: ``sys.argv[1]`` is the first page to
    fetch and ``sys.argv[2]`` is the page to stop before (exclusive). Each
    fetched page payload is appended to a list that is saved to
    ``./out<start>_<end>.json`` after every page, so an interrupted run can
    be resumed by re-running with the same arguments.

    Returns:
        None.

    Raises:
        SystemExit: if fewer than two command-line arguments are supplied.
        ValueError: if either argument is not an integer.
    """
    if len(sys.argv) < 3:
        raise SystemExit("usage: python load_script.py START_PAGE END_PAGE")

    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection hangs forever
    MAX_RETRIES = 5  # consecutive failures tolerated for a single page
    MAX_BACKOFF = 30  # seconds; upper bound for the wait between retries

    first_page_arg, end_page_arg = sys.argv[1], sys.argv[2]
    first_page = int(first_page_arg)
    END_PAGE = int(end_page_arg)

    # The file name uses the raw argument strings so that checkpoints written
    # by earlier runs are still found.
    file_path = f"./out{first_page_arg}_{end_page_arg}.json"
    output = _load_saved_pages(file_path)

    # The checkpoint holds the pages first_page, first_page + 1, ... in order,
    # so the next page to fetch is first_page + len(output). (Using
    # len(output) + 1 is only correct when first_page == 1 and otherwise
    # re-downloads pages and appends duplicates.)
    PAGE = first_page + len(output)
    if PAGE >= END_PAGE:
        return None

    print(f'Reading page {PAGE} up to page {END_PAGE}')
    retries = 0
    completed = False
    while True:
        # Keep the try body limited to the calls that can fail on the network
        # or on a malformed response body.
        try:
            response = requests.get(f"{BASE_URL}&page={PAGE}", timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decode failures on requests versions
            # whose decode error is not a RequestException.
            retries += 1
            print(f"An error occurred: {e}")
            print(
                f"Failed to fetch page {PAGE} (attempt {retries} of {MAX_RETRIES}). "
                f"Total pages saved: {len(output)}"
            )
            if retries >= MAX_RETRIES:
                break
            # Back off before retrying instead of hammering the server.
            time.sleep(min(2 ** retries, MAX_BACKOFF))
            continue

        retries = 0
        output.append(data)
        # Save after every page so progress survives an interruption.
        _save_pages(file_path, output)

        # A falsy next_page is treated as "this was the last page"; reaching
        # END_PAGE also stops the run because END_PAGE itself is not fetched.
        next_page = data['meta']['pages']['next_page']
        if not next_page or int(next_page) >= END_PAGE:
            print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
            completed = True
            break

        print(f"finished page {PAGE}")
        PAGE = next_page

    end = time.time()
    print(f"Timer: {end - start}")
    if completed:
        print(f"Finished processing all pages. Total pages saved: {len(output)}")
    else:
        print(f"Stopped early at page {PAGE} after repeated errors. Total pages saved: {len(output)}")
# Script entry point: start the harvest only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    fetch_digital_commonwealth()