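"""crawler.py

A small Selenium + BeautifulSoup crawler for Canvas (canvas.sydney.edu.au).
It reuses cookies.json to log in when possible, then recursively follows
course/group page links up to depth_limit, saving each page's HTML under
web-cache/ and recording a filename-to-URL map in web-cache/web-cache.csv.
"""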
import json
import os
import re
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Configure browser options
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # run without a visible browser window
options.add_argument('--disable-gpu')
# enable javascript
options.add_argument('--enable-javascript')
# block downloads, images, and videos
prefs = {
    "download.default_directory": "/dev/null",
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    # "safebrowsing.enabled": True,
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.videos": 2
}
options.add_experimental_option("prefs", prefs)
# options.add_argument('--no-sandbox')
# Install ChromeDriver using ChromeDriverManager and wrap the path with Service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
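
# NOTE: a single module-level WebDriver instance is shared by every Crawler
# method below; it is only closed by driver.quit() at the end of main().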


class Crawler:
    def __init__(self, start_url, course_url, group_url, depth_limit):
        self.course_url = course_url
        self.group_url = group_url
        self.start_url = start_url
        self.depth_limit = depth_limit
        self.visited = set()

    def find_links(self):
        # Get the page source
        page_source = driver.page_source
        # with open("test.html", "w") as f:
        #     f.write(page_source)
        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')
        # Find all <a> elements that are not descendants of the <header> with
        # id 'header' and do not contain 'quiz' or 'assignment' in their href
        links = [
            a for a in soup.find_all('a', href=True)
            if (
                'quiz' not in a['href'].lower() and
                'assignment' not in a['href'].lower() and
                not a.find_parent('header', id='header')
            )
        ]
        # Keep only unvisited course/group links that are not in-page anchors
        # or download links
        filtered_links = []
        for link in links:
            href = link.get('href')
            pattern = r"/download\?.*$"  # download links
            if (href
                    and (href.startswith(self.course_url) or href.startswith(self.group_url))
                    and "#" not in href
                    and href not in self.visited
                    and not re.search(pattern, href)):
                filtered_links.append(link)
        return filtered_links
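
    # For example, with the URLs configured in main(), a link such as
    # https://canvas.sydney.edu.au/groups/624156/pages/some-page passes the
    # filter, while one ending in "#section" or "/download?verifier=..." is
    # skipped. (The "some-page" slug is illustrative, not a real page.)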

    def deep_crawl(self, url, depth):
        if depth == 0 or url in self.visited:
            return
        driver.get(url)
        # wait for the page to load
        time.sleep(1)
        # save the page source into the web-cache dir, named by the page title
        filename = driver.title.replace('/', '_')
        # skip pages whose title starts with BUS or HUM
        if filename.startswith("BUS") or filename.startswith("HUM"):
            return
        # create the web-cache dir if it does not exist yet
        if not os.path.exists("web-cache"):
            os.mkdir("web-cache")
        with open(f"web-cache/{filename}.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        # record [filename, url] in web-cache.csv, skipping duplicates
        if not os.path.exists("./web-cache/web-cache.csv"):
            with open('./web-cache/web-cache.csv', 'w') as f:
                f.write("filename\turl\n")
        with open("./web-cache/web-cache.csv", "r+", encoding="utf-8") as f:
            line = "\t".join([filename, url]) + "\n"
            lines = f.readlines()  # leaves the position at EOF, so write() appends
            if line not in lines:
                f.write(line)
        print(f"Depth {depth}: {driver.title} - {url}")
        self.visited.add(url)
        links = self.find_links()
        for link in links:
            href = link.get('href')
            self.deep_crawl(href, depth - 1)
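
    # Note: the recursion nests at most one deep_crawl frame per depth level,
    # so even with depth_limit = 99 the call stack stays far below CPython's
    # default recursion limit of 1000.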

    def login(self):
        driver.get("https://canvas.sydney.edu.au/")
        # load saved cookies, if any
        if os.path.exists("cookies.json"):
            with open("cookies.json", "r") as file:
                cookies = json.load(file)
            for cookie in cookies:
                cookie['domain'] = '.sydney.edu.au'
                driver.add_cookie(cookie)
        # wait for the page to finish loading
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        print("Page is fully loaded!")
        # Check whether we are already logged in: if navigating to start_url
        # does not redirect away from it, the session is still valid
        driver.get(self.start_url)
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        if driver.current_url == self.start_url:
            print("Already logged in!")
            return
        # otherwise, log in manually
        driver.delete_all_cookies()  # clear stale cookies
        driver.get("https://canvas.sydney.edu.au/")
        input("Log in first, then press Enter to continue...")
        # save the post-login cookies for the next run
        with open("cookies.json", "w") as f:
            cookies = driver.get_cookies()
            json.dump(cookies, f)
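
    # Why login() rewrites each cookie's domain: Selenium only accepts a
    # cookie whose domain matches the page currently loaded, so the script
    # visits canvas.sydney.edu.au first and tags every saved cookie with
    # '.sydney.edu.au' before adding it.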

    def start(self):
        # log in, then crawl from the start URL
        self.login()
        self.deep_crawl(self.start_url, self.depth_limit)


def main():
    course_url = "https://canvas.sydney.edu.au/courses/61585"
    group_url = "https://canvas.sydney.edu.au/groups/624156/pages"
    start_url = "https://canvas.sydney.edu.au/groups/624156/pages"
    depth_limit = 99  # maximum crawl depth
    crawler = Crawler(start_url, course_url, group_url, depth_limit)
    crawler.start()
    input("Press Enter to close the browser...")
    # close the browser
    driver.quit()


if __name__ == '__main__':
    main()
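
# Typical run (a sketch; the exact flow depends on your Canvas session):
#   $ python crawler.py
#   - a Chrome window opens; log in to Canvas if prompted, then press Enter
#   - crawled pages are saved as web-cache/<page title>.html
#   - web-cache/web-cache.csv maps each saved filename to its source URL
#   - cookies.json is written so the next run can skip the manual login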