-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: AID.py
195 lines (166 loc) · 8.03 KB
/
AID.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
from CheckRules import CheckRules
from gensim.summarization import keywords
import re
import spacy
from collections import Counter
import textParser
from bs4 import BeautifulSoup
import addBanner
# Remove all tags in the region to finally get texts in the region
def processRegion(region):
    """Strip every HTML tag from *region* and return the trimmed text.

    region: string of raw HTML markup.
    Returns the text with all ``<...>`` tags removed and leading/trailing
    whitespace stripped.
    """
    # One non-greedy substitution replaces the original findall+replace
    # loop: each "<...>" span is deleted in a single pass.
    return re.sub(r"<.*?>", "", region).strip()
# Find the region using the text in the immediate parent
def findRegion(link):
    """Return the tag-free text of *link*'s immediate parent element."""
    return processRegion(str(link.parent))
# This file runs each of the rules for each URL in the input/output file
def LinkParser(file, emlinks):
    """Check Rule 3 (links must be labelled) for a saved HTML page.

    file: path of the HTML file to scan.
    emlinks: iterable of embedded <a> tags harvested elsewhere (may be
        falsy when there are none).
    Returns [highlighted_html, matched_links, embedded_report]; when the
    page cannot be read or parsed, returns the sentinel ["", -1, ""] that
    callers test with ``result == -1``.
    """
    # Guard: a falsy filename previously fell through past the try/except
    # and crashed later; treat it like an unreadable page instead.
    if not file:
        return ["", -1, ""]
    try:
        with open(file, "r") as f:
            doc = BeautifulSoup(f, "html.parser")
    except Exception:  # narrowed from a bare except (keeps Ctrl-C alive)
        return ["", -1, ""]
    # Extract all links from page
    links = doc.find_all("a")
    doc2 = str(doc)
    style = ' STYLE="background-color: rgb(255,255,0)" '
    res = []
    res2 = []
    emres = ""
    # URL-shaped text: a link whose label/title is itself a raw URL counts
    # as unlabelled for Rule 3.
    regex = r"(?i)\b((?:http[s]?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    for link in links:
        alias = link.string
        title = link.get("title")
        if alias:
            alias = alias.strip()
        if title:
            title = title.strip()
        if (alias and re.match(regex, alias)) or (title and re.match(regex, title)):
            # Inject the highlight style right after the opening "<a" so the
            # violating link is rendered on a yellow background.
            x = str(link)
            aug_link = x[:2] + style + x[2:]
            doc2 = doc2.replace(x, aug_link)
            res.append(link)
    if emlinks:
        # Embedded Links
        for link in emlinks:
            alias = link.string
            title = link.get("title")
            if alias:
                alias = alias.strip()
            if title:
                title = title.strip()
            if (alias and re.match(regex, alias)) or (title and re.match(regex, title)):
                res2.append(link.get("href"))
            # BUG FIX: the original compared against the literal string
            # "None", but link.string / link.get() yield the None object,
            # so unlabelled embedded links were never reported.
            if alias is None and title is None:
                res2.append(link.get("href"))
    if res2:
        emres = "The following embedded links also violate Rule 3\n"
        for i, href in enumerate(res2, start=1):
            emres += f"{i}. {href}\n"
    return [doc2, res, emres]
def addColor(text, color):
    """Wrap *text* in a paragraph element whose CSS color is *color*."""
    return '<p style="color: ' + color + '">' + text + '</p>'
def MainProcess(usecase, subgoal, action, filename, var):
    """Run heuristic Rules 1-3 against a saved webpage and build a report.

    usecase: name of the use case being evaluated (currently only carried
        through; not inspected here).
    subgoal, action: natural-language persona steps; their proper nouns,
        nouns and adjectives become the keywords the rules look for.
    filename: path of the HTML page under evaluation.
    var: step index; 2 enables Rule 2 and selects the output file name
        static/changed{var}.html.
    Returns (report_html, flag, flags) where flag is 1 when any rule is
    violated and flags lists the per-rule verdicts in order (Rule 1, 2, 3).
    Side effect: writes the highlighted page to static/changed{var}.html
    and calls addBanner.augment with the report.
    """
    report = ""
    flag = 0
    flags = []
    # Loading the English model for spaCy
    nlp = spacy.load('en_core_web_sm')
    C = CheckRules()
    # List of DOM words to exclude from keywords
    DOM_words = ['window', 'document', 'header', 'form', 'link', 'field', 'tab', 'button', 'checkbox', 'icon', 'data', 'information', 'webpage', 'page', 'website', 'abi', 'clicks', 'finds', 'click', 'find', 'visits', "goto", "links"]
    txt, emlinks = textParser.textParse(filename)
    # Keywords = proper nouns / nouns / adjectives that are not DOM jargon.
    keywords_S = [token.text for token in nlp(subgoal)
                  if token.pos_ in ('PROPN', 'NOUN', 'ADJ') and str(token) not in DOM_words]
    keywords_A = [token.text for token in nlp(action)
                  if token.pos_ in ('PROPN', 'NOUN', 'ADJ') and str(token) not in DOM_words]
    # Rule 1: every subgoal/action keyword must appear on the page.
    result_1_S = C.checkRule1(keywords_S, txt)
    result_1_A = C.checkRule1(keywords_A, txt)
    if (result_1_S == 1 and result_1_A == 1) or (keywords_A == [] and keywords_S == []):
        flags.append("Not Violated")
        report = report + addColor("\nRule 1 (described below) not violated. The tool found all the keywords Abi was looking for.\n", "green")
    else:
        flag = 1
        flags.append("Violated")
        report = report + addColor("\nRule 1 (described below) is violated: The tool did not find all the keywords Abi was looking for, on the webpage.\n", "orange")
        report = report + f"The subgoal keywords for this instance were: {keywords_S}, and the action keywords were: {keywords_A}.\n"
    print("Rule 1")
    # Rule 2: only meaningful once an action has been taken (var == 2).
    if var == 2:
        result_2 = C.checkRule2(txt, keywords_A)
        if result_2 == 1:
            flag = 1
            flags.append("Violated")
            report = report + addColor("\n Rule 2 (described below) is violated: The tool did not find the keywords from the previous link-label on the current page.", "orange")
        else:
            flags.append("Not Violated")
            report = report + addColor("\n Rule 2 (described below) not violated. Abi knows she is on the right webpage.", "green")
    else:
        flags.append("Not Applicable")
        report = report + addColor("\n Rule 2 (described below) not applicable since Abi did not take any action yet.", "green")
    # Rule 3: link labels; LinkParser returns the page with violating
    # links highlighted, or -1 as a "no page / no links" sentinel.
    document, result_3, emres = LinkParser(filename, emlinks)
    print("Rule 2")
    if result_3 == -1:
        flags.append("No Links on page")
        report = report + "\nRule 3 (described below) not violated. The tool did not detect any links on the webpage.\n"
    elif len(result_3) > 0:
        flag = 1
        flags.append("Violated")
        report = report + addColor("\nRule 3 (described below) violated: Some links are not labelled. Please refer to the right side to see the highlighted links (in yellow).\n", "orange")
        if emres:
            report = report + addColor(emres, "orange")
    else:
        flags.append("Not Violated")
        report = report + addColor("\nRule 3 (described below) is not violated. All links are labelled.\n", "green")
    print("Rule 3")
    report = report + "\n\nRead below for descriptions of each rule: \n\nRule 1: Keywords from subgoals and associated actions should be present on the webpage. \nThe wording of the subgoal serves as the information that Abi seeks, and the words from actions serve as cues to direct Abi to a UI action. Without such cues, Abi would face difficulty finding all the information they need. \n"
    report = report + "\nRule 2: Linked pages should contain keywords from link labels. \n On clicking a link, the destination page should offer cues to help Abi’s understand that they have reached the right place. If a project page fails to use words similar to what a link label hinted at, Abi could get confused. \n"
    report = report + "\nRule 3: Links should be labeled with a keyword or phrase. Abi clicks on a link only after gathering enough information and planning their next step. Labeled links provide Abi with information about the webpage they are supposed to visit.\n"
    # BUG FIX: use a context manager so the output file is closed even if
    # addBanner.augment or the write raises.
    with open(f"static/changed{var}.html", "w", encoding="utf-8") as f:
        addBanner.augment(subgoal, action, report, var)
        f.write(document)
    return report, flag, flags
# MainProcess("usecase", "subgoal", "action", "a.html")
# LinkParser("a.html")