linkParser.py
# Import Libraries
from bs4 import BeautifulSoup
import requests
import re

# Remove all tags in the region to finally get the text in the region
def processRegion(region):
    X = re.findall(r"<.*?>", region)
    for c in X:
        region = region.replace(c, "")
    return region
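
# Illustrative behavior (hypothetical snippet, not from the repo's tests):
# processRegion("<li><a href='/a'>Home</a> page</li>") -> "Home page"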

# Find the region using the text in the immediate parent
def findRegion(link):
    parent = link.parent
    s = str(parent)
    return processRegion(s)

# Utility function: strip newlines and spaces so regions can be compared
def cleaner(s):
    s = s.replace("\n", "")
    s = s.replace(" ", "")
    return s
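
# Illustrative behavior (hypothetical input): cleaner("Home \n page") -> "Homepage"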

def parseLink(url=None, file=None):
    # Read HTML from a URL or from a downloaded file
    # Include dead-end checker here - for links outside the domain
    if url:
        try:
            result = requests.get(url).text
            doc = BeautifulSoup(result, "html.parser")
        except Exception:
            return {}
    elif file:
        try:
            with open(file, "r") as f:
                doc = BeautifulSoup(f, "html.parser")
        except Exception:
            return {}
    else:
        return {}
    # Extract all links from the page
    links = doc.find_all("a")
    # Keep only the <html> element
    doc = doc.find("html")
    # Convert the HTML structure to a string
    doc2 = str(doc)
    # Remove the head tag and everything before it
    while doc2.find("<head>") != -1:
        s2 = doc2.find("</head>")
        doc2 = doc2[s2 + 7:]
    # Remove images
    while doc2.find("<img") != -1:
        s = doc2.find("<img")
        for i in range(s, len(doc2)):
            if doc2[i] == ">":
                break
        doc2 = doc2[:s] + doc2[i + 1:]
    # Remove JavaScript
    while doc2.find("<script") != -1:
        s1, s2 = doc2.find("<script"), doc2.find("</script>")
        doc2 = doc2[:s1] + doc2[s2 + 9:]
    # Find the preceding text info for a link
    def findPrev(link, head):
        # Find the index of the link in the document
        ind = doc2.find(link)
        # Extract the region between head and the obtained index,
        # and append the link at the end
        region = doc2[head:ind] + link
        # Process the region to get rid of the tags
        region = processRegion(region)
        # Point the head just past the </a> tag of the present link
        for i in range(ind, len(doc2) - 3):
            if doc2[i] == "<" and doc2[i + 1] == "/" and doc2[i + 2] == "a":
                break
        head = i + 4
        # Return the region and the new head
        return region, head
    # Nodes of the page
    nodes = {}
    # Head pointer pointing to the start of the document
    head = 0
    # Parse the extracted links
    for l in range(len(links)):
        link = links[l]
        # If there is no alias then this is None
        alias = link.string
        # Find the text preceding the link and update the head
        prev, head = findPrev(str(link), head)
        # Find the region in the link area
        reg = findRegion(link)
        # If the link is a list item, fetch the prev region from the previous list item
        if l > 0 and link.parent.name == "li" and links[l - 1].parent.name == "li" and cleaner(prev) == cleaner(reg):
            k = str(l) + " " + str(links[l - 1]['href'])
            if k in nodes:
                prev = nodes[k]['prev']
        # Generate a key for the graph - a counter is attached to deal with multiple occurrences of the same link
        key = str(l + 1) + " " + str(link['href'])
        # Store the alias, preceding region, and current area as the values of the node
        values = {'alias': alias, 'prev': prev, 'region': reg}
        # Add the node
        if key not in nodes:
            nodes[key] = values
    return nodes
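
# Sketch of a returned node for a hypothetical anchor <a href="/about">About us</a>:
#   nodes["1 /about"] == {"alias": "About us", "prev": "...text before the link...About us", "region": "...parent text..."}
# (key format: "<counter> <href>"; the exact text depends on the surrounding markup)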

def txtForm(url=None, file=None):
    if url and file:
        return ""
    if url:
        nodes = parseLink(url=url)
    elif file:
        nodes = parseLink(file=file)
    else:
        print("Empty Parameters")
        return ""
    with open("Links.txt", "w") as f:
        f.write("------Source------\n")
        f.write(f"Link: {url or file}\n")
        f.write("-----Destinations-------\n")
        cnt = 1
        for n in nodes:
            f.write("\n------------\n")
            f.write(f"Destination {cnt}:\n")
            cnt += 1
            f.write(f"\nKey: {n}")
            f.write(f"\nLink Label: {nodes[n]['alias']}")
            f.write(f"\nPreceding Text: {nodes[n]['prev']}")
            f.write(f"\nRegion: {nodes[n]['region']}")
            f.write("\n------------\n")
    return "PASSED"
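
# Example (hypothetical): txtForm(file="page.html") writes Links.txt with one
# "Destination" block per parsed link and returns "PASSED" on success.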

def csvForm(url=None, file=None):
    if url and file:
        return ""
    if url:
        nodes = parseLink(url=url)
    elif file:
        nodes = parseLink(file=file)
    else:
        print("Empty Parameters")
        return ""
    with open("Links.csv", "w") as f:
        f.write("S.No, Link, Link Label, Preceding, Region\n")
        cnt = 1
        for n in nodes:
            link = n[n.find(" ") + 1:]
            alias = nodes[n]['alias']
            prev = nodes[n]['prev']
            region = nodes[n]['region']
            # Strip newlines and commas so each field stays within one CSV cell
            if alias:
                alias = alias.replace("\n", " ").replace(",", " ")
            if prev:
                prev = prev.replace("\n", " ").replace(",", " ")
            if region:
                region = region.replace("\n", " ").replace(",", " ")
            f.write(f"{cnt}, {link}, {alias}, {prev}, {region}\n")
            f.write("\n")
            cnt += 1
    return "PASSED"
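
# Example (hypothetical): csvForm(file="page.html") would write rows like
#   1, /about, About us, text before the link, parent text
# into Links.csv, with a blank line between rows.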

if __name__ == "__main__":
    src_file = "Upload/index3.html"
    src_url = ""
    # txtForm(file=src_file)
    # csvForm(file=src_file)
    # print("Done")
    nodes = parseLink(file=src_file)
    # Guard against an empty result (e.g., unreadable file) before indexing
    if nodes:
        x = list(nodes.keys())
        print(nodes[x[0]])