-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler_pyspider_rota_concursos_ciencia_computacao_questions.py
259 lines (220 loc) · 10.7 KB
/
crawler_pyspider_rota_concursos_ciencia_computacao_questions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""Crawler to extract IT Computer Science questions from Rota dos Concursos website.
Created at: 2018-07-21 19:26:12
Project: rota_concursos_ciencia_computacao
Framework used: PySpider
Classes:
" >>> Misc(object)
" >>> Accents(object)
" >>> Handler(BaseHandler)
Misc class instance methods:
" >>> get_nested_pyspider_elements(html_doc, mid_tags, el_tag, tag_attr, method_name, save_attr, parent_name)
" >>> get_first_main_theme(sub_main_theme, theme_list)
Accents class static methods:
" >>> remove_accents(s)
Handler class methods (in execution sequence flow):
" >>> on_start()
" >>> index_page(response)
" >>> detail_page(response)
todo: accept URL JSON config file
returns: JSON from each crawled page in the web pyspider application
"""
import re
import unicodedata
from pyspider.libs.base_handler import *
# Base site URL and the theme slug being crawled.
url = 'https://questoes.grancursosonline.com.br/'
theme = 'ciencia-da-computacao'
# Landing page for the theme — the crawl entry point used by Handler.on_start.
theme_crawled = url + theme
# Common prefix shared by every per-topic question-list URL on the site.
url_pat = url + 'questoes-de-concursos/' + theme + '-'
# Regex with two groups: (1) the common prefix, (2) the topic id suffix.
regex_url_rule = '(' + url_pat + ')(' + '.+$)'
class Misc(object):
    """Helpers for extracting and resolving the nested theme tree scraped with PySpider."""

    def get_nested_pyspider_elements(self, html_doc, mid_tags, el_tag, tag_attr, method_name, save_attr, parent_name):
        """Recursively walk each ``el_tag`` nested inside ``mid_tags`` and collect one attribute per element.

        e.g. collect the 'href' attribute of every 'a' tag found under 'ul > li'.

        :param html_doc: pyquery document (or sub-document) whose children are walked
        :param mid_tags: nesting path of container tags, e.g. 'ul > li'
        :param el_tag: tag whose attribute is extracted, e.g. 'a'
        :param tag_attr: attribute name read from el_tag, e.g. 'href'
        :param method_name: pyquery method used to read the element label, e.g. 'text'
        :param save_attr: dict key under which parent_name is stored, e.g. 'main_theme'
        :param parent_name: id of the enclosing theme ('' at the top level)
        :return: flat list of dicts, e.g.
            [{'id_name': 'mysql', 'name': 'MySQL', 'main_theme': 'banco-de-dados', 'href': 'https://...'}]
        """
        nested_list = []
        mid_tags_list = mid_tags.replace(' ', '').split('>')
        # Each child corresponds to one <li>-level node of the theme tree.
        for item in html_doc.children().items():
            inner_el_tag_name = getattr(item.children(el_tag), method_name)()
            inner_el_tag_attr_value = item.children(el_tag).attr[tag_attr]
            if inner_el_tag_attr_value is None:
                # Container node without the target attribute: nothing to record.
                continue
            # The id is the URL suffix after the common theme prefix
            # (group 2 of regex_url_rule), e.g. 'mysql'.
            id_name = re.match(regex_url_rule, inner_el_tag_attr_value).group(2)
            nested_list.append({'id_name': id_name, 'name': inner_el_tag_name,
                                save_attr: parent_name, tag_attr: inner_el_tag_attr_value})
            # Recurse into a nested container (e.g. an inner <ul>), if any,
            # tagging its children with this element's id as their parent.
            inner_html_doc = item.children(mid_tags_list[0])
            if inner_html_doc.html() is not None:
                nested_list += self.get_nested_pyspider_elements(inner_html_doc, mid_tags, el_tag, tag_attr,
                                                                 method_name, save_attr, id_name)
        return nested_list

    def get_first_main_theme(self, sub_main_theme, theme_list):
        """Follow the parent chain in ``theme_list`` up to the outermost (root) main theme.

        :param sub_main_theme: id_name of the theme to resolve
        :param theme_list: list of dicts with 'id_name', 'name' and 'main_theme' keys
        :return: display name of the root theme, or ``sub_main_theme`` itself
            when it is not found in ``theme_list``
        """
        try:
            pos = [theme['id_name'] for theme in theme_list].index(sub_main_theme)
        except (ValueError, KeyError):
            # Theme not found in the list (or a malformed entry without 'id_name'):
            # fall back to returning the id passed in as argument.
            return sub_main_theme
        if theme_list[pos]['main_theme'] != '':
            # Not a root theme yet: keep climbing towards the outermost parent.
            return self.get_first_main_theme(theme_list[pos]['main_theme'], theme_list)
        return theme_list[pos]['name']
class Accents(object):
    """Small helper for stripping diacritics from text."""

    @staticmethod
    def remove_accents(s):
        """Strip accents from ``s``.

        Decomposes the string (NFD) so each accent becomes a separate
        combining mark, then drops every non-ASCII code point.

        :param s: unicode string, possibly containing accented characters
        :return: ASCII-only byte string with all accents removed
        """
        decomposed = unicodedata.normalize('NFD', s)
        return decomposed.encode('ascii', 'ignore')
class Handler(BaseHandler):
    """PySpider main handler: crawls the theme index page, then each theme's question page.

    Execution flow: on_start -> index_page -> detail_page.
    :param BaseHandler (class): PySpider base crawler class
    """

    @every(minutes=60)
    def on_start(self):
        """PySpider lifecycle starting method; re-queued every 60 minutes by @every.

        Crawls the theme landing page (module-level ``theme_crawled``).
        Next crawling step: self.index_page
        :return:
        """
        self.crawl(theme_crawled, callback=self.index_page)

    @config(age=120)
    def index_page(self, response):
        """PySpider lifecycle index page.

        Gets the nested theme list from the page and schedules a crawl of each
        theme's question page, passing the theme dict along via ``save=``.
        Next crawling step: self.detail_page
        :param response: PySpider response for the theme landing page
        :return:
        """
        # Get URL list from main themes
        url_crawling_list = []
        for each_html in response.doc('ul.list-group').items():
            misc = Misc()
            # NOTE(review): this REASSIGNS (does not extend) on every
            # 'ul.list-group' match — presumably the page has exactly one such
            # list; confirm, otherwise only the last list is kept.
            url_crawling_list = misc.get_nested_pyspider_elements(each_html, 'ul > li', 'a', 'href', 'text',
                                                                  'main_theme', '')
        # Crawl each theme page from list
        # for each in response.doc('ul.list-group > li > a.list-group-item').items():
        for crawl_obj in url_crawling_list:
            # Transform inner main themes to most outer one
            misc = Misc()
            crawl_obj['main_theme'] = misc.get_first_main_theme(crawl_obj['main_theme'], url_crawling_list)
            # If theme match regex pattern for IT cases, crawl page with JS fn
            # when opening the page to load all questions' answers
            if re.match(url_pat + '.+$', crawl_obj['href']):
                # The injected JS clicks the "load more" button every 500 ms for
                # 15 s, then clicks every answer button so answers are rendered
                # in the DOM before detail_page scrapes it.
                self.crawl(crawl_obj['href'], callback=self.detail_page, save=crawl_obj,
                           fetch_type='js', js_script='''
                           function() {
                               var count = 0;
                               var id = setInterval(function() {
                                   console.log('starting loop', count);
                                   var panel = document.getElementsByClassName('panel panel-default loading-wrapper text-center');
                                   var loadMoreBtn = panel[0].childNodes[1];
                                   var noMoreDiv = panel[0].childNodes[3];
                                   if (loadMoreBtn.attributes.style && loadMoreBtn.attributes.style.nodeValue === "display: none;") {
                                       // No more questions to load
                                   } else {
                                       console.log('clicking new button');
                                       loadMoreBtn.click();
                                   }
                                   count++;
                               }, 500);
                               setTimeout(function() {
                                   clearInterval(id);
                                   var list = document.getElementsByClassName('btn btn-black btn-lg');
                                   for (var i in list) {
                                       list[i].click();
                                   }
                               }, 15000);
                           }
                           ''')

    @config(priority=2)
    def detail_page(self, response):
        """Detail questions page.

        Scrapes every question block's text, images and answer options and
        appends them to ``ext_quest_list`` of the output extraction object.
        :param response: PySpider response; ``response.save`` carries the theme
            dict built in index_page ('name' and 'main_theme' keys are read)
        :return [object] extraction:
        """
        # count tracks how many questions were scraped (diagnostic only — its
        # value is never returned).
        count = 0
        extraction = {
            # 'theme': response.url.split(url_pat)[1],
            'theme': response.save['name'],
            'ext_quest_list': [],
            'main_theme': response.save['main_theme']
        }
        # Iterate through all div questions
        for question in response.doc('HTML>BODY>DIV.container>DIV.row>SECTION.prova-detalhes>DIV.questao').items():
            ext_quest = {
                "subthemes": [],
                'question_imgs': [],
                "options": []
            }
            # Get header test name — only kept when the 4th header <p> mentions
            # 'Superior' (i.e. a higher-education exam).
            header_p_list = question('header div.row div.col-a').children()('p')
            if len(header_p_list) == 4:
                # NOTE(review): lxml's .text can be None for an element with
                # child markup — this 'in' test would then raise; confirm the
                # page always renders plain text here.
                if 'Superior' in header_p_list[3].text:
                    ext_quest['test_name'] = header_p_list[0].text
            # Get test question text
            body_question = question('div.panel-body div.panel-questao div.panel-heading')
            body_question_inner_p = body_question.children()('p')
            for img_el in body_question_inner_p.items('img'):
                ext_quest['question_imgs'].append(img_el.attr('src'))
            if len(body_question_inner_p) == 0:
                # No <p> children: take the heading's whole text; the regex adds
                # a space after any dot run not followed by a digit (keeps
                # decimals like "1.5" intact).
                ext_quest['question_text'] = re.sub(r'[.]+(?![0-9])', r'. ', body_question.text())
            else:
                # Concatenate every <p>: plain .text for leaf paragraphs,
                # .text_content() when the paragraph has nested elements.
                question_text = ''
                for body_question_p in body_question_inner_p:
                    if body_question_p.text is not None:
                        if len(body_question_p.getchildren()) == 0:
                            question_text = question_text + body_question_p.text
                        else:
                            question_text = question_text + body_question_p.text_content()
                ext_quest['question_text'] = re.sub(r'[.]+(?![0-9])', r'. ', question_text.replace('\n', ' '))
                if ext_quest['question_text'] == '':
                    # All <p> were empty: fall back to the heading's full text.
                    ext_quest['question_text'] = body_question.text()
            # Get test question options
            body_question_options = question('div.panel-body div.panel-questao div.panel-body ul.list-group')
            inner_options = body_question_options.children()('li')
            for img_el in inner_options.items('img'):
                ext_quest['question_imgs'].append(img_el.attr('src'))
            for question_option_li in inner_options:
                ext_quest_option = {
                    "correct": False,
                    # NOTE(review): the second replace strips spaces from the
                    # option text — confirm this is intended and not a mangled
                    # double-space replace.
                    "text": question_option_li.text_content().replace('\n', ' ').replace(' ', '')
                }
                # An option is correct when its class attribute carries the
                # 'resposta-correta' CSS class (scanned via attribute values).
                for classe in question_option_li.values():
                    if "resposta-correta" in classe:
                        ext_quest_option['correct'] = True
                ext_quest['options'].append(ext_quest_option)
            # print ext_quest
            count = count + 1
            extraction['ext_quest_list'].append(ext_quest)
        return extraction