# gramatical_similarity_filter2.py
import json
import math
with open("blacklist.json", "r") as jsonfile:
blacklist = json.load(jsonfile)["words"]
with open("dataset.json", "r") as file:
dataset = json.load(file)
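
# Expected file shapes, inferred from the usage below:
#   blacklist.json -> {"words": ["badword1", "badword2", ...]}
#   dataset.json   -> {"pairs": [["someword", 1], ["cleanword", 0], ...]}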
class HashmapFilter:
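    """Filters blacklisted words by character n-gram overlap.

    Every banned word is split into overlapping character groups (n-grams)
    that are stored in a set; an incoming word is scored by the fraction of
    its own n-grams found in that set and compared against a trained bias.
    """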
    key_dict = {
        # Leetspeak substitutions and punctuation to strip during formatting.
        # Extend this map with any symbol you want replaced or removed from words.
        "4" : "a", "0" : "o", "3" : "e", "7" : "t",
        "-" : "", "_" : "", "." : "", "," : "",
        "?" : "", "!" : "", "@" : "", "#" : "",
        "&" : ""
    }
    lr = 0.001  # Learning rate: controls how fast the bias adjusts; small values like this one are recommended.
    bias = 0  # Decision threshold; start at zero and let train() fit it for the best performance.
    def __init__(self, k):
        self.word_list = set()  # Set of character n-grams taken from the banned words.
        self.max_size = k       # Maximum n-gram length.
    def add_word(self, word):
        """Stores every character n-gram of a banned word"""
        group_size = min(int(math.sqrt(len(word))) + 1, self.max_size)
        if len(word) >= group_size:
            for i in range(len(word) - group_size + 1):
                self.word_list.add(word[i : i + group_size])
        else:
            print(f"Word {word} is too short.")
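
    # A minimal worked example of add_word, assuming max_size = 3:
    # for "idiot", group_size = min(int(sqrt(5)) + 1, 3) = 3, so the stored
    # n-grams are "idi", "dio" and "iot".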
    def get_probability(self, word):
        """Returns the word's similarity score against the stored n-grams, dtype -> float"""
        probability = 0
        word = self.format(word)
        group_size = min(int(math.sqrt(len(word))) + 1, self.max_size)
        if len(word) >= group_size:
            for i in range(len(word) - group_size + 1):
                group = word[i : i + group_size]
                probability += int(group in self.word_list)  # Count n-grams that appear in the blacklist.
            return math.sqrt(probability / (len(word) - group_size + 1))
        else:
            return 0
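
    # Continuing the example above: get_probability("idi0t") formats the word
    # to "idiot", finds all 3 of its n-grams in the stored set, and returns
    # sqrt(3 / 3) = 1.0, while an unrelated word scores near 0.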
def format(self, word):
"""Remove junk characters and replaces others with their equivalents, dtype -> str"""
new_word = word.lower()
for key in self.key_dict:
new_word = new_word.replace(key, self.key_dict[key])
return new_word
    @staticmethod
    def filter_word(word):
        """Returns a mask of asterisks with the same length as the word, dtype -> str"""
        return "*" * len(word)
    def train(self, dataset, epochs = 50):
        """Fits the bias based on the given examples and the list of banned words"""
        # The bias is fitted automatically with a perceptron-style update:
        # a false positive raises the threshold, a false negative lowers it.
        print("\n< Training start >")
        for epoch in range(epochs):
            for wordA, expected_output in dataset["pairs"]:
                output = int(self.get_probability(wordA) >= self.bias)
                self.bias += (output - expected_output) * self.lr
            if not epoch % 9:  # Log progress every ninth epoch.
                print(f"Current Epoch {epoch}, bias value {round(self.bias, 3)}")
        print("< Training end >\n")
    def test(self, dataset):
        """Tests the score of the algorithm. Note: the score can change by adding more
        training data, more banned words, or by changing the evaluation functions"""
        score = 0
        for wordA, expected_output in dataset["pairs"]:
            output = int(self.get_probability(wordA.lower()) >= self.bias)
            # Print whether the word was filtered or not.
            if output:
                print(f"{self.filter_word(wordA)} | {wordA} -> {expected_output}")
            else:
                print(f"{wordA} -> {expected_output}")
            score += int(output == expected_output)
        print(f"\nTotal score: {round(score / len(dataset['pairs']) * 100)}%\n")
    def filter_text(self, text):
        """Returns the text with banned words masked, dtype -> str"""
        word_list = text.split(" ")
        for index, word in enumerate(word_list):
            # Use the same decision threshold as train() and test().
            if self.get_probability(word) >= self.bias:
                word_list[index] = self.filter_word(word)
        return " ".join(word_list)
word_filter = HashmapFilter(3)
for word in blacklist:
word_filter.add_word(word.lower())
word_filter.test(dataset)   # Baseline score before fitting the bias.
word_filter.train(dataset, 200)
word_filter.test(dataset)   # Score after training.
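
# Interactive demo: every word scoring at or above the trained bias is
# masked with asterisks; type "exit" to quit.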
while True:
    text = input(">> ")
    if text.lower() == "exit":
        break
    print(word_filter.filter_text(text))