-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathContentExtractor.py
102 lines (90 loc) · 3.41 KB
/
ContentExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# encoding: utf-8
import requests
from BeautifulSoup import BeautifulSoup
import re
class ContentExtractor(object):
def __init__(self, url='', html='', threshold=173):
self.url = url
self.threshold = threshold
self.plain_text = ''
self.html_text = ''
self.html = html
self.content = ''
self.blocks = []
self.lines = ()
self.len_per_lines = []
self.title = ''
def _pre_process(self):
if self.html == '':
response = requests.get(self.url)
self.html = response.text
css_soup = BeautifulSoup(self.html)
if css_soup.find('h1') is not None:
self.title = css_soup.find('h1').text
elif css_soup.find('title') is not None:
self.title = css_soup.find('title').text
else:
return
plain_text, html_text = self._clean_html(self.html)
self.plain_text = plain_text
self.html_text = html_text
def _extract(self, threshold=173):
if self.plain_text is '':
return
self.lines = tuple(self.plain_text.split('\n'))
for i in range(len(self.lines)):
length = len(re.sub(r'\s*', '', self.lines[i]))
self.len_per_lines.append(length)
self._caculate_block(3)
start = 0
end = 0
while True:
start = self._find_surge(end, threshold)
if start < 0:
break
end = self._find_dive(start)
self.content += '\n'.join(self.lines[start:end])
self.content = re.sub(r'\n', '[crlf]', self.content)
self.content = re.sub(r'\s*', '', self.content)
self.content = re.sub(r'(?:\[crlf\])+', '\n', self.content)
if len(self.content) == 0:
self.title = ''
def _find_surge(self, start, threshold):
for i in range(start, len(self.blocks) - 2):
if self.blocks[i] > threshold\
and self.blocks[i+1] > threshold\
and self.blocks[i+2] > threshold:
return i
return -1
def _find_dive(self, surge):
for i in range(surge+1, len(self.blocks)-1):
if self.blocks[i] == 0 and self.blocks[i+1] == 0:
return i
return len(self.blocks) - 1
def _caculate_block(self, blockWidth):
self.blocks[:] = []
for i in range(len(self.len_per_lines) - blockWidth + 1):
sumLen = sum(self.len_per_lines[i+j] for j in range(blockWidth))
self.blocks.append(sumLen)
def _clean_html(self, html):
regex = re.compile(
r'(?:<!DOCTYPE.*?>)|' #doctype
r'(?:<head[\S\s]*?>[\S\s]*?</head>)|'
r'(?:<!--[\S\s]*?-->)|' #comment
r'(?:<script[\S\s]*?>[\S\s]*?</script>)|' # js...
r'(?:<style[\S\s]*?>[\S\s]*?</style>)', re.IGNORECASE) # css
html_text = regex.sub('', html)
html_text = re.sub(r"</p>|<br.*?/>", '[crlf]', html_text)
plain_text = re.sub(r"<[\s\S]*?>", '', html_text)
return plain_text, html_text
def execute(self):
self._pre_process()
self._extract()
if __name__ == '__main__':
# ce = ContentExtractor('http://www.csdn.net/article/2015-02-26/2824020')
ce = ContentExtractor('http://beiyuu.com/religion-basic/')
# ce = ContentExtractor('http://beiyuu.com')
ce.execute()
print ce.title
print ce.content