-
Notifications
You must be signed in to change notification settings - Fork 356
/
Copy pathmiddlewares.py
151 lines (118 loc) · 5.54 KB
/
middlewares.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
from importlib import import_module
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait
from .http import SeleniumRequest
class SeleniumMiddleware:
"""Scrapy middleware handling the requests using selenium"""
def __init__(self, driver_name, driver_executable_path,
browser_executable_path, command_executor, driver_arguments):
"""Initialize the selenium webdriver
Parameters
----------
driver_name: str
The selenium ``WebDriver`` to use
driver_executable_path: str
The path of the executable binary of the driver
driver_arguments: list
A list of arguments to initialize the driver
browser_executable_path: str
The path of the executable binary of the browser
command_executor: str
Selenium remote server endpoint
"""
webdriver_base_path = f'selenium.webdriver.{driver_name}'
driver_klass_module = import_module(f'{webdriver_base_path}.webdriver')
driver_klass = getattr(driver_klass_module, 'WebDriver')
driver_options_module = import_module(f'{webdriver_base_path}.options')
driver_options_klass = getattr(driver_options_module, 'Options')
driver_options = driver_options_klass()
if browser_executable_path:
driver_options.binary_location = browser_executable_path
for argument in driver_arguments:
driver_options.add_argument(argument)
driver_kwargs = {
'executable_path': driver_executable_path,
f'{driver_name}_options': driver_options
}
# locally installed driver
if driver_executable_path is not None:
driver_kwargs = {
'executable_path': driver_executable_path,
f'{driver_name}_options': driver_options
}
self.driver = driver_klass(**driver_kwargs)
# remote driver
elif command_executor is not None:
from selenium import webdriver
capabilities = driver_options.to_capabilities()
self.driver = webdriver.Remote(command_executor=command_executor,
desired_capabilities=capabilities)
# webdriver-manager
else:
# selenium4+ & webdriver-manager
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
if driver_name and driver_name.lower() == 'chrome':
# options = webdriver.ChromeOptions()
# options.add_argument(o)
self.driver = webdriver.Chrome(options=driver_options,
service=ChromeService(ChromeDriverManager().install()))
@classmethod
def from_crawler(cls, crawler):
"""Initialize the middleware with the crawler settings"""
driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
command_executor = crawler.settings.get('SELENIUM_COMMAND_EXECUTOR')
driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')
if driver_name is None:
raise NotConfigured('SELENIUM_DRIVER_NAME must be set')
# let's use webdriver-manager when nothing specified instead | RN just for Chrome
if (driver_name.lower() != 'chrome') and (driver_executable_path is None and command_executor is None):
raise NotConfigured('Either SELENIUM_DRIVER_EXECUTABLE_PATH '
'or SELENIUM_COMMAND_EXECUTOR must be set')
middleware = cls(
driver_name=driver_name,
driver_executable_path=driver_executable_path,
browser_executable_path=browser_executable_path,
command_executor=command_executor,
driver_arguments=driver_arguments
)
crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
return middleware
def process_request(self, request, spider):
"""Process a request using the selenium driver if applicable"""
if not isinstance(request, SeleniumRequest):
return None
self.driver.get(request.url)
for cookie_name, cookie_value in request.cookies.items():
self.driver.add_cookie(
{
'name': cookie_name,
'value': cookie_value
}
)
if request.wait_until:
WebDriverWait(self.driver, request.wait_time).until(
request.wait_until
)
if request.screenshot:
request.meta['screenshot'] = self.driver.get_screenshot_as_png()
if request.script:
self.driver.execute_script(request.script)
body = str.encode(self.driver.page_source)
# Expose the driver via the "meta" attribute
request.meta.update({'driver': self.driver})
return HtmlResponse(
self.driver.current_url,
body=body,
encoding='utf-8',
request=request
)
def spider_closed(self):
"""Shutdown the driver when spider is closed"""
self.driver.quit()