-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathquestion_parser.py
236 lines (208 loc) · 9.32 KB
/
question_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# coding=utf-8
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from datautil import dump_question_into_db, dump_answer_into_db
import re
import time
def get_answer_views_upvotes(cnt):
    '''
    Convert an abbreviated counter string into an integer.

    A trailing "k"/"K" multiplies the number by 1,000 and a trailing
    "m"/"M" by 1,000,000; thousands separators (",") and surrounding
    whitespace are ignored. An empty string is treated as zero.

    :type cnt: str
    :rtype: int
    :return: the expanded count, eg, "54.8K" gives 54800
    :raises ValueError: if the text is not a number with an optional k/m suffix
    '''
    text = cnt.strip().replace(',', '')
    if not text:
        return 0
    multipliers = {'k': 1000, 'm': 1000000}
    suffix = text[-1].lower()
    if suffix in multipliers:
        # round() instead of plain truncation so that binary floating point
        # error in the product cannot drop the result by one.
        return int(round(float(text[:-1]) * multipliers[suffix]))
    return int(text)
def get_question_followers_views(follower):
    '''
    Extract the leading, comma-grouped number from a statistics label.

    Only the first whitespace-separated token is used. It is split on ","
    and the groups are weighted by increasing powers of 1000, starting
    from the right-most group.

    :type follower: str
    :rtype: int
    :return: eg, "9 Followers" returns 9 and "1,234 Views" returns 1234
    '''
    leading_token = follower.strip().split()[0]
    groups = leading_token.split(',')
    # Walk the groups from least to most significant.
    groups.reverse()
    return sum(int(group) * 1000 ** position
               for position, group in enumerate(groups))
import calendar
import datetime
def get_write_date(date_str, now=None):
    '''
    Turn a relative or partial date label into a "YYYY-MM-DD HH:MM:SS" string.

    The text before the first comma is inspected token by token:

    * last token is "ago" (or there are no tokens): the current time is
      returned;
    * the last two tokens are "<month abbr> <day>": midnight of that date
      is returned. The year is the text after the last comma when there is
      one; otherwise it is the current year, or the previous year when that
      month/day would lie in the future;
    * otherwise the last token is looked up as a weekday abbreviation and
      mapped to the most recent such day within the previous seven days
      (keeping the current time of day). Unknown tokens fall back to the
      current time.

    :type date_str: str
    :param now: reference "current" time; defaults to datetime.datetime.now()
    :type now: datetime.datetime
    :rtype: str
    :raises ValueError: if the year has to be inferred and the day token is
        not an integer
    '''
    if now is None:
        now = datetime.datetime.now()
    timestamp_fmt = "%Y-%m-%d %H:%M:%S"

    parts = date_str.strip().split(',')
    tokens = parts[0].split()
    if not tokens or tokens[-1] == 'ago':
        return now.strftime(timestamp_fmt)

    # month_abbr[0] is an empty placeholder, so it is skipped.
    months = {abbr: idx for idx, abbr in enumerate(calendar.month_abbr) if abbr}

    if len(tokens) < 2 or tokens[-2] not in months:
        # Not a "<month> <day>" label: try to read it as a weekday name.
        for days_back in range(1, 8):
            candidate = now - datetime.timedelta(days=days_back)
            if calendar.day_abbr[candidate.weekday()] == tokens[-1]:
                return candidate.strftime(timestamp_fmt)
        return now.strftime(timestamp_fmt)

    month = months[tokens[-2]]
    day = tokens[-1]
    if len(parts) > 1:
        # Text after the comma carries a leading space that must not leak
        # into the formatted result.
        year = parts[-1].strip()
    else:
        # No year given: a date later than today must belong to last year.
        in_future = (month, int(day)) > (now.month, now.day)
        year = str(now.year - 1 if in_future else now.year)
    return "{:s}-{:02d}-{:s} 00:00:00".format(year, month, day.zfill(2))
def parse_by_selenium(driver, user_elems, question_url):
    '''
    Read a question page through a Selenium driver and store the result.

    Question-level fields (answer count, title, details, followers, views,
    asked date, topic tags) are located with XPath queries on ``driver``;
    per-answer fields (author, text, date, views, upvotes) are located
    relative to each element of ``user_elems``. Every value is printed as
    it is found. Finally the question row is passed to
    ``dump_question_into_db(..., 'question')`` and, only when that call
    returns a truthy value, the collected answers are passed to
    ``dump_answer_into_db(..., 'answer')``.

    Missing author link, view counter or upvote counter are tolerated
    (anonymous name / 0); any other missing element lets
    NoSuchElementException propagate.

    NOTE(review): the UTF-8 encoded values are formatted with ``{:s}``,
    which presumably relies on Python 2 byte strings - confirm before
    running on Python 3.

    :param driver: object exposing find_element(s)_by_xpath for the page
    :param user_elems: iterable of answer elements exposing find_element_by_xpath
    :type question_url: str
    :return: None
    '''
    # Answer count: first token of the counter text; when it does not end in
    # a digit, the single trailing character is dropped before conversion.
    cnt_answers = driver.find_element_by_xpath("//div[@class='answer_count']").text.encode('utf-8').strip().split()[0]
    if cnt_answers[-1].isdigit():
        cnt_answers = int(cnt_answers)
    else:
        cnt_answers = int(cnt_answers[:-1])

    # Question title
    title = driver.find_element_by_xpath("//div[@class='header']//span[@class='rendered_qtext']").text.encode(
        'utf-8').strip()
    print("问题:{:s}".format(title))

    # Detailed description of the question
    question_details = driver.find_element_by_xpath("//div[@class='question_details']").text.encode('utf-8')
    print("问题详细描述:{:s}".format(question_details))
    print("答案数目:{:d}".format(cnt_answers))

    # Followers, views and asked date are the first three spans of the stats box.
    stats_elem = driver.find_element_by_xpath("//div[@class='QuestionStats']")
    followers = get_question_followers_views(stats_elem.find_element_by_xpath("./span[1]").text.encode('utf-8'))
    print("问题关注量:{:d}".format(followers))
    views = get_question_followers_views(stats_elem.find_element_by_xpath("./span[2]").text.encode('utf-8'))
    print("问题浏览量:{:d}".format(views))
    asked_date = get_write_date(stats_elem.find_element_by_xpath("./span[3]").text.encode('utf-8'))
    print("提问时间:{:s}".format(asked_date))

    # Topic tags, comma separated (empty string when there are none).
    tag_elems = driver.find_elements_by_xpath("//span[contains(@class, 'TopicNameSpan')]")
    tags = ','.join([elem.text.encode('utf-8') for elem in tag_elems])
    print("问题标签:{:s}".format(tags))

    ans_info = []
    for elem in user_elems:
        # Author name; fall back to the anonymous-user label when there is
        # no profile link.
        try:
            link_elem = elem.find_element_by_xpath(".//a[starts-with(@class,'user')]")
            user_name = link_elem.text.encode('utf-8').strip()
            user_url = link_elem.get_attribute("href").strip()
            print('name:{:s},link:{:s}'.format(user_name, user_url))
        except NoSuchElementException:
            user_name = elem.find_element_by_xpath(".//span[contains(@class,'anon_user')]").text.encode('utf-8')
            user_url = ''
            print('name:{:s}'.format(user_name))

        # Answer body (printed with whitespace collapsed, stored as-is)
        text = elem.find_element_by_xpath(".//span[@class='rendered_qtext']").text.encode('utf-8')
        print("text:{:s}".format(' '.join(map(lambda x: x.strip(), text.split()))))

        # Date shown on the answer permalink
        write_date = get_write_date(elem.find_element_by_xpath(".//a[@class='answer_permalink']").text.encode('utf-8'))
        print("date:{:s}".format(write_date))

        # View count (0 when the counter is absent)
        try:
            ans_views = get_answer_views_upvotes(
                elem.find_element_by_xpath(".//span[@class='meta_num']").text.encode('utf-8'))
            print("views:{:d}".format(ans_views))
        except NoSuchElementException:
            ans_views = 0
            print("views:{:d}".format(ans_views))

        # Upvote count (0 when the counter is absent)
        try:
            upvote_span = elem.find_element_by_xpath(".//a[@action_click='AnswerUpvote']//span[@class='count']")
            ans_up = get_answer_views_upvotes(upvote_span.text.encode('utf-8'))
            print("upvote:{:d}".format(ans_up))
        except NoSuchElementException:
            ans_up = 0
            print("upvote:0")

        ans_info.append((title, user_name, user_url, write_date, str(ans_views), str(ans_up), text))
        print("我是分割线------------------我是分割线")

    # The stored answer count is the number of answers actually parsed,
    # not the counter shown on the page.
    question_info = (question_url.strip(), title, asked_date, str(len(ans_info)),
                     str(views), str(followers), tags, question_details)
    if dump_question_into_db(question_info, 'question'):
        dump_answer_into_db(ans_info, 'answer')
def parse_by_bs(source, question_url):
    '''
    Parse a question page from its HTML source with BeautifulSoup and store it.

    Question-level fields (title, followers, views, asked date, topic tags,
    details) and per-answer fields (author, text, date, views, upvotes) are
    extracted and printed. The question row is then passed to
    ``dump_question_into_db(..., 'question_user')`` and, only when that call
    returns a truthy value, the answers are passed to
    ``dump_answer_into_db(..., 'answer_user')``.

    A missing view counter or upvote counter yields 0; other missing
    elements are not guarded and raise (AttributeError / IndexError).

    :param source: HTML of the question page
    :type source: str
    :type question_url: str
    :return: None
    '''
    soup = BeautifulSoup(source, "html.parser")

    # Title
    title = soup.find("div", {"class": "header"}).find("span", {"class": "rendered_qtext"}).text.encode("utf-8")
    print("标题:{:s}".format(title))

    # Answer count (printed only)
    print("答案数目:{:s}".format(soup.find("div", {"class": "answer_count"}).text))

    # Followers, views and asked date are read from spans 0, 3 and 5 of the
    # stats box (find_all also returns nested spans).
    stats = soup.find("div", {"class": "QuestionStats"}).find_all("span")
    followers = get_question_followers_views(stats[0].text)
    views = get_question_followers_views(stats[3].text)
    asked_date = get_write_date(stats[5].text)
    print(followers, views, asked_date)

    # Topic tags, comma separated
    tags = soup.find_all("span", {"class": "TopicNameSpan TopicName qserif-bold"})
    tags = ','.join([tag.text for tag in tags])
    print(tags)

    # Question details
    question_details = soup.find("div", {"class": "question_details"}).text.encode("utf-8")
    print(question_details)

    # Each answer is the node following an <a name="answer_..."> anchor.
    # NOTE(review): `\d*?` is lazy and may match zero digits, so the pattern
    # effectively only requires "answer_" - confirm that is intended.
    answers = soup.find_all("a", {"name": re.compile(r"answer_\d*?")})
    ans_info = []
    for answer in answers:
        answer = answer.next_sibling

        # Author: the second link of the answer; an href of "#" marks an
        # anonymous author, whose label is read from the anon_user span.
        name_elem = answer.find_all("a")[1]
        user_name = name_elem.text.encode("utf-8")
        user_url = name_elem.attrs["href"].encode("utf-8")
        if user_url.strip() == '#':
            user_name = answer.find("span", {"class": "anon_user qserif"}).text.encode("utf-8")
            user_url = ""
        else:
            user_url = "https://www.quora.com" + user_url
        print("name:{:s}, link:{:s}".format(user_name, user_url))

        # Answer body (printed with whitespace collapsed, stored as-is)
        text = answer.find("span", {"class": "rendered_qtext"}).text.encode("utf-8")
        print("text:{:s}".format(' '.join(map(lambda x: x.strip(), text.split()))))

        # Date shown on the answer permalink
        write_date = get_write_date(answer.find("a", {"class": "answer_permalink"}).text.encode("utf-8"))
        print("date:{:s}".format(write_date))

        # View count (0 when the counter is absent)
        views_elem = answer.find("span", {"class": "meta_num"})
        if views_elem:
            ans_views = get_answer_views_upvotes(views_elem.text.encode("utf-8"))
        else:
            ans_views = 0
        print("views:{:d}".format(ans_views))

        # Upvote count. find() returns None when nothing matches, so the
        # upvote link is checked before searching inside it; a missing link
        # or a missing counter both count as 0 upvotes.
        upvote_link = answer.find("a", {"action_click": "AnswerUpvote"})
        upvote_elem = None
        if upvote_link is not None:
            upvote_elem = upvote_link.find("span", {"class": "count"})
        if upvote_elem:
            ans_up = get_answer_views_upvotes(upvote_elem.text.encode("utf-8"))
        else:
            ans_up = 0
        print("upvote:{:d}".format(ans_up))
        print("我是分割线------------------我是分割线")

        ans_info.append((title, user_name, user_url, write_date, str(ans_views), str(ans_up), text))

    # The stored answer count is the number of answers actually parsed.
    question_info = (question_url.strip(), title, asked_date, str(len(ans_info)),
                     str(views), str(followers), tags, question_details)
    if dump_question_into_db(question_info, 'question_user'):
        dump_answer_into_db(ans_info, 'answer_user')
if __name__ == '__main__':
    # Manual run: parse a previously saved copy of a question page.
    question_url = "https://www.quora.com/How-can-I-destroy-my-ego-self"
    with open("data/question/page_source") as fr:
        source = fr.read()
    parse_by_bs(source, question_url)