-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextConverter.py
82 lines (63 loc) · 2.38 KB
/
TextConverter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from bs4.element import Comment
from bs4 import BeautifulSoup
import requests
import json
import os
def getFilePaths(DIRPATH):
"""
This function returns the relative paths of the files present in the directory
pointed by the DIRPATH variable.
:param DIRPATH: The path to the directory where files are needed to be searched
:return: returns a list of string type. This list contains the paths to the files
present in the directory.
"""
filelist = []
for dir_, _, files in os.walk(DIRPATH):
for file_name in files:
if file_name.split(".")[1] == "json":
rel_file = os.path.join(file_name)
filelist.append(rel_file)
return filelist
def tag_visible(element):
"""
This function acts as a filter to exclude HTML formatters. It would skip the
elements mentioned in the below list and parse only the visible text.
:param element: The tag which is to be parsed
:return: returns a boolean type
"""
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
return False
if isinstance(element, Comment):
return False
return True
def getHTMLData(file):
"""
This function fetches HTML page for the URL mentioned in the file and parses its tags
to mine text data from them.
:param file: The JSON file which contains the URL which points to an HTML page
:return: returns textdata of string type. This textdata is the visible text on the HTML page
"""
textdata = ""
with open(file, 'r', encoding="utf-8") as jsonFile:
data = json.loads(jsonFile.read())
for element in data:
url = element["url"]
response = requests.get(url=url)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
texts = soup.findAll(text=True)
visible_texts = filter(tag_visible, texts)
textdata += u" ".join(t.strip() for t in visible_texts)
return textdata
def handler():
"""
This function acts as the entry point to the program
:return: nothing
"""
DIRPATH = ".\\Data\\"
files = getFilePaths(DIRPATH)
for file in files:
filename = file.split(".")[0] + ".txt"
text = getHTMLData(file)
with open(filename,'w', encoding="utf-8") as textFile:
textFile.write(text+"\n\n")