-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebscrape.py
167 lines (135 loc) · 5.37 KB
/
webscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Web scraping script
# imports
from bs4 import BeautifulSoup
import requests
import re
import json
import subprocess
def getHTMLContent(url):
    """
    Download a catalog page and drop everything after its "Graduate" header.

    Args:
        url (string): url to the webpage to be scraped.
    Returns:
        BeautifulSoup: parsed HTML of the webpage with every tag that
        follows the <h2>Graduate</h2> header removed. When the page has no
        such header the parsed page is returned unchanged.
    Raises:
        requests.RequestException: if the request fails, times out, or the
        server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status() stops us from parsing an error page.
    site = requests.get(url, timeout=30)
    site.raise_for_status()
    htmlContent = BeautifulSoup(site.content, 'html.parser')

    # find the header that starts the graduate course listing
    header = htmlContent.find('h2', string='Graduate')
    if header is None:
        # no graduate section on this page, so there is nothing to trim
        return htmlContent

    # remove every tag that comes after the header (the header itself stays)
    for tag in header.find_all_next():
        tag.decompose()
    return htmlContent
def getCourseCatalog(htmlContent):
    """
    Fill a dictionary with keys = courses, values = parsed prerequisites.

    Every <p class="course-name"> element is treated as a course title. The
    text that follows the first <em>/<span>/<strong> marker found after the
    title's next sibling is handed to getPrereqs().

    Args:
        htmlContent (BeautifulSoup): html text from getHTMLContent() function
    Returns:
        dict: keys = course title text up to its first ".", values = the
        structure returned by getPrereqs() (an empty list when no
        prerequisite text could be located).
    """
    markerTags = ['em', 'span', 'strong']
    courseCatalog = {}

    for course in htmlContent.find_all('p', {'class': 'course-name'}):
        prerequisites = []

        # element right after the title; expected to hold the description
        courseDesc = course.find_next_sibling()

        # find the marker tag ("Prerequisites:") that begins the list.
        # NOTE(review): find_next() keeps searching past courseDesc, so a
        # course without its own marker may pick up a later course's one.
        preq_tag = None
        if courseDesc is not None:
            preq_tag = courseDesc.find_next(markerTags)

        # Skip a marker whose text starts with a lowercase 'p'.
        # NOTE(review): the reason for this rule is not visible in this
        # file; it is preserved as-is. text[:1] avoids an IndexError on
        # empty marker text.
        if preq_tag is not None and preq_tag.text[:1] == 'p':
            preq_tag = preq_tag.find_next(markerTags)

        if preq_tag is not None:
            # the prerequisite sentence is the text node right after the
            # marker; it can be missing or be another tag, so check first
            following = preq_tag.next_sibling
            if isinstance(following, str):
                prerequisites = getPrereqs(following.strip())

        # keep only the part before the first "." (drops the units, which
        # caused unicode incompatibilities); partition() falls back to the
        # whole title when there is no "."
        courseId = course.text.partition(".")[0]
        courseCatalog[courseId] = prerequisites

    return courseCatalog
def getPrereqs(prerequisitesText):
    """
    Parse a prerequisite sentence into groups of course codes.

    Only the text between the first course code and the first period is
    considered (to the end of the text when there is no period). Courses
    joined by "or" end up in the same inner list (alternatives); "and" or
    a bare comma starts a new inner list.

    Args:
        prerequisitesText (string): prerequisites sentence scraped from
            getHTMLContent() and getCourseCatalog(),
            ex: "CSE 15L and CSE 20 or MATH 109."
    Returns:
        list: list of lists of course codes,
            ex: [["CSE 15L"], ["CSE 20", "MATH 109"]].
            Empty list when the text contains no course code.
    """
    # Pattern for a single course code, "DEPT 123(H/R/AH)"
    aCourse = r"[A-Z]{3,4}\s+\d{1,3}[A-Z]{0,2}"

    # without any course code there is nothing to extract
    match = re.search(aCourse, prerequisitesText)
    if match is None:
        return []

    # cut at the first period; str.find() (unlike str.index()) does not
    # raise when the sentence has no period, so fall back to the full text
    end = prerequisitesText.find(".")
    if end == -1:
        end = len(prerequisitesText)
    prerequisitesText = prerequisitesText[match.start():end]

    # drop punctuation that would get in the way of the separators below
    prerequisitesText = prerequisitesText.replace('\u2013', '-')
    prerequisitesText = prerequisitesText.replace("(", "").replace(")", "")

    # normalise separators: ", or" -> " or", ", and" -> " and", and a bare
    # comma is treated as " and"
    prerequisitesText = re.sub(r',+\s+or+\s', r' or ', prerequisitesText)
    prerequisitesText = re.sub(r',+\s+and+\s', r' and ', prerequisitesText)
    prerequisitesText = re.sub(r',+\s', r' and ', prerequisitesText)

    # keep only course codes together with the " or "/" and " that
    # directly precedes them; every other word is discarded
    pattern = re.compile(rf'( or {aCourse}| and {aCourse}|{aCourse})')
    kept = ''.join(pattern.findall(prerequisitesText))
    if not kept:
        return []

    # separate all prerequisites
    # ex: "CSE 15L and CSE 20 or MATH 109"
    #     -> [["CSE 15L"], ["CSE 20", "MATH 109"]]
    return [req.split(" or ") for req in kept.split(" and ")]
def webScrape(dept):
    """
    Output two .json's of the upper and lower division requirements to be read
    by the web app.

    Writes "lowerDivs.json" and "upperDivs.json" into the current working
    directory, each holding the dictionary built by getCourseCatalog().

    Args:
        dept (string): 3-4 letter department code
    Raises:
        ValueError: if the catalog page has no "Upper Division" header to
        split the course list on.
    """
    # obtain link through given dept
    url = f'https://catalog.ucsd.edu/courses/{dept}.html'
    # get BeautifulSoup of course catalog (graduate section already removed)
    lower = getHTMLContent(url)

    # find upper division cut off point
    division = lower.find('h2', string='Upper Division')
    if division is None:
        raise ValueError(
            f'no "Upper Division" header found in the catalog page for {dept}'
        )

    # snapshot the siblings first: moving nodes below changes the tree
    target_content = list(division.next_siblings)

    # Build a separate document for the upper division courses. Appending
    # a node that already lives in another tree moves it, so the header
    # and its tag siblings leave ``lower`` here.
    upper = BeautifulSoup('', 'html.parser')
    upper.append(division)
    for sibling in target_content:
        if sibling.name:
            upper.append(sibling)

    # construct dictionary of upper divs
    upperDivs = getCourseCatalog(upper)

    # discard every tag that still follows the header now that the upper
    # division courses have been read
    for tag in division.find_all_next():
        tag.decompose()

    # construct dictionary of lower divs
    lowerDivs = getCourseCatalog(lower)

    # convert dictionaries to json and write them out
    with open("lowerDivs.json", "w", encoding="utf-8") as outfile:
        outfile.write(json.dumps(lowerDivs, indent=4))
    with open("upperDivs.json", "w", encoding="utf-8") as outfile:
        outfile.write(json.dumps(upperDivs, indent=4))
# Scrape only when run as a script, so importing this module for its
# helper functions does not trigger network requests or file writes.
if __name__ == "__main__":
    webScrape("CSE")