forked from Griffintaur/News-At-Command-Line
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExtractor.py
142 lines (109 loc) · 4.07 KB
/
Extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
"""
Created on Jul 24 23:05:13 2016-2017
@author: Ankit Singh
"""
from bs4 import BeautifulSoup
class Extractor(object):
    """Base class for site-specific article extractors.

    Subclasses override ``ExtractionAlgo``; most of them delegate to
    ``TextExtractionAlgo`` with the tag name and class filter that select
    the article body on their site.
    """

    def ExtractionAlgo(self, text):
        """Extract article data from the HTML in *text*.

        The base implementation does nothing and returns ``None``.
        """
        return None

    def TextExtractionAlgo(self, text, htmlelement, classname):
        """Return the page title and the text of all matching elements.

        Args:
            text: HTML markup of the page.
            htmlelement: Tag name of the elements to collect text from.
            classname: Value passed to ``find_all`` as the ``class_`` filter.

        Returns:
            A ``(title, body)`` tuple. ``title`` is the string of the
            ``<title>`` element, or ``None`` when the page has no
            ``<title>``. ``body`` is the concatenated text of every
            matching element (an empty string when nothing matches).
        """
        soup = BeautifulSoup(text, 'html.parser')
        # ``soup.title`` is None for markup without a <title>; guard so a
        # missing title does not abort extraction with AttributeError.
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None

        pieces = []
        for content in soup.find_all(htmlelement, class_=classname):
            # Detach these elements first so their contents are not part
            # of the collected text.
            for unwanted in content.find_all(
                    ["script", "br", "figure", "image"]):
                unwanted.extract()
            pieces.append(content.text)
        return (title, ''.join(pieces))
class HuffingtonPost(Extractor):
    """Extractor for Huffington Post article pages."""

    def __init__(self):
        super(HuffingtonPost, self).__init__()

    def ExtractionAlgo(self, text):
        """Return ``(title, body)`` for the Huffington Post HTML in *text*.

        Delegates to ``Extractor.TextExtractionAlgo``, selecting ``div``
        elements with the class filter "content-list-component text".
        """
        tag_name = "div"
        css_class = "content-list-component text"
        return Extractor.TextExtractionAlgo(self, text, tag_name, css_class)
class NYT(Extractor):
    """Extractor for New York Times article pages."""

    def __init__(self):
        super(NYT, self).__init__()

    def ExtractionAlgo(self, text):
        """Return ``(title, body)`` for the New York Times HTML in *text*.

        Delegates to ``Extractor.TextExtractionAlgo``, selecting ``p``
        elements with the class filter "story-body-text story-content".
        """
        tag_name = "p"
        css_class = "story-body-text story-content"
        return Extractor.TextExtractionAlgo(self, text, tag_name, css_class)
class BBC(Extractor):
    """Extractor for BBC News article pages."""

    def __init__(self):
        super(BBC, self).__init__()

    def ExtractionAlgo(self, text):
        """Return ``(title, body)`` for the BBC News HTML in *text*.

        Delegates to ``Extractor.TextExtractionAlgo``, selecting ``div``
        elements with the class filter "story-body__inner".
        """
        tag_name = "div"
        css_class = "story-body__inner"
        return Extractor.TextExtractionAlgo(self, text, tag_name, css_class)
class BloomBerg(Extractor):
    """Extractor for Bloomberg article pages."""

    def __init__(self):
        super(BloomBerg, self).__init__()

    def ExtractionAlgo(self, text):
        """Return ``(title, body)`` for the Bloomberg HTML in *text*.

        Delegates to ``Extractor.TextExtractionAlgo``, selecting ``div``
        elements with the class filter "body-copy".
        """
        tag_name = "div"
        css_class = "body-copy"
        return Extractor.TextExtractionAlgo(self, text, tag_name, css_class)
class Guardian(Extractor):
    """Extractor for Guardian article pages."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return the page title and the article's paragraph text.

        Args:
            text: HTML markup of the page.

        Returns:
            A ``(title, body)`` tuple. ``title`` is the string of the
            ``<title>`` element, or ``None`` when the page has no
            ``<title>``. ``body`` is the concatenated text of every ``<p>``
            found inside the article-body ``div`` elements (an empty string
            when nothing matches).
        """
        soup = BeautifulSoup(text, 'html.parser')
        # ``soup.title`` is None for markup without a <title>; guard so a
        # missing title does not abort extraction with AttributeError.
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None

        paragraphs = []
        containers = soup.find_all(
            "div",
            class_="content__article-body from-content-api js-article__body")
        for container in containers:
            # Detach these elements first so paragraphs nested inside them
            # (e.g. within a <figure>) are not collected below.
            for unwanted in container.find_all(
                    ["script", "br", "figure", "image"]):
                unwanted.extract()
            paragraphs.extend(p.text for p in container.find_all("p"))
        return (title, ''.join(paragraphs))
class TheHindu(Extractor):
    """class for The Hindu parsing"""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return the page title and the article's paragraph text.

        Args:
            text: HTML markup of the page.

        Returns:
            A ``(title, body)`` tuple. ``title`` is the string of the
            ``<title>`` element, or ``None`` when the page has no
            ``<title>``. ``body`` is the concatenated text of every ``<p>``
            found inside ``div`` elements matched by the class filter
            "article" (an empty string when nothing matches).
        """
        soup = BeautifulSoup(text, 'html.parser')
        # ``soup.title`` is None for markup without a <title>; guard so a
        # missing title does not abort extraction with AttributeError.
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None

        paragraphs = []
        for container in soup.find_all("div", class_="article"):
            # Detach these elements first; unlike the other extractors this
            # one also drops <span> elements, so span text inside
            # paragraphs is excluded from the result.
            for unwanted in container.find_all(
                    ["script", "br", "figure", "image", "span"]):
                unwanted.extract()
            paragraphs.extend(p.text for p in container.find_all("p"))
        return (title, ''.join(paragraphs))
class TimesOfIndia(Extractor):
    """class for Times of India parsing"""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return the page title and the article text.

        Args:
            text: HTML markup of the page.

        Returns:
            A ``(title, body)`` tuple. ``title`` is the string of the
            ``<title>`` element, or ``None`` when the page has no
            ``<title>``. ``body`` is the concatenated text of every ``div``
            matched by the class filter "Normal" (an empty string when
            nothing matches). No child elements are removed beforehand.
        """
        soup = BeautifulSoup(text, 'html.parser')
        # ``soup.title`` is None for markup without a <title>; guard so a
        # missing title does not abort extraction with AttributeError.
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None

        sections = soup.find_all("div", class_="Normal")
        body = ''.join(section.text for section in sections)
        return (title, body)