-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpparser.py
203 lines (174 loc) · 8.88 KB
/
pparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import os
import re
import pandas as pd
from tqdm import tqdm
class Zerofucks:
    '''Utility class that crawls directories for profanity ("bad words"),
    records where each hit occurs, and can scrub the words from files.

    Typical usage:

        # instantiate class
        myFinder = Zerofucks()

        # load in bad words list from file,
        # write to list my_badwords
        my_badwords = []
        with open('./bad_words.txt', 'r') as badwords_file:
            for line in badwords_file:
                my_badwords.append(line.rstrip())
    '''

    def __init__(self):
        # Results accumulator: one row per bad-word hit recorded by find_shit().
        self.df = pd.DataFrame(
            {'file': [], 'line_no': [], 'badword': [], 'content': []},
            columns=['file', 'line_no', 'badword', 'content']
        )

    def __makeRegex__(self, shitToFind=('fuck', 'shit')):
        '''Build one case-insensitive pattern matching any word in shitToFind.

        shitToFind: iterable of word literals (they may themselves be regex
            fragments); each is wrapped in \\b word boundaries.

        Returns a regex string of the form ``(?i)\\bw1\\b|\\bw2\\b|...``.
        The ``(?i)`` flag appears exactly once, at the start: repeating it per
        alternative (as the previous version did) raises ValueError on
        Python 3.11+ because global inline flags must lead the expression.
        An empty word list yields a pattern that matches nothing (the old
        code returned '', which matched everywhere).
        '''
        if not shitToFind:
            # (?!x)x can never match: a position both followed and not
            # followed by 'x' does not exist.
            return r'(?!x)x'
        return '(?i)' + '|'.join(r'\b' + word + r'\b' for word in shitToFind)

    def write_df(self, path='./', filename='out.csv'):
        '''
        path: path to where you want to put the file (must end in a
            separator, as the default './' does — kept for compatibility)
        filename: name of file

        Writes the csv from self.df to a csv file.
        '''
        target = path + filename
        try:
            self.df.to_csv(path_or_buf=target, index=False, chunksize=1000)
            print("File written to {}".format(target))
        except OSError:
            # Narrowed from a bare except: only I/O failures are expected.
            print("Unable to write csv, did you run .find_shit() yet?")

    def erase_in_file(self, input_file, replace_with='', postfix='', bad_words=('fuck', 'shit')):
        '''
        Erases any bad words from the user specified files.
        Takes an input file and writes to an outputfile.
        ALSO returns a dictionary for forensic - what did you change - purposes
        (None if the file could not be processed).

        input_file: full path to the file you want to process / remove bad words from
        replace_with: the string you'll be replacing the bad word with (defaults to nothing, '')
        postfix: optional string value you want to add to the end of the outputfile. If blank, it'll overwrite the input file.
        bad_words: words (or regex fragments) to remove
        '''
        print('Replacing bad words in {}...'.format(input_file))
        # Compile once instead of rebuilding the pattern for every line.
        pattern = re.compile(self.__makeRegex__(bad_words))
        shit_dict = {}
        try:
            # `with` guarantees both handles are closed (the old code leaked
            # the temp-file handle). The temp file is written with the same
            # encoding the input is read with, so bytes round-trip cleanly.
            with open(input_file, 'r', encoding='iso-8859-15') as infile, \
                    open(input_file + '.tmp', mode='w', encoding='iso-8859-15') as tempfile:
                for content in tqdm(infile):
                    hits = pattern.findall(content)
                    if hits:
                        # Original behaviour preserved: only the first hit on
                        # each line is counted in the forensic dict.
                        shit_dict[hits[0]] = shit_dict.get(hits[0], 0) + 1
                    # replace_with is what the bad words become; the default
                    # '' deletes them outright.
                    tempfile.write(pattern.sub(replace_with, content))
            # write the tempfile contents over the original (plus postfix)
            print('overwriting {} with {}'.format(input_file + '.tmp', input_file + postfix))
            os.rename(input_file + '.tmp', input_file + postfix)
            return shit_dict
        except (OSError, ValueError):
            # OSError covers the realistic failures (missing file, bad
            # permissions); ValueError kept from the original handler.
            print(f'Unable to replace bad words in {input_file}.')

    def erase_in_dirs(self, root_paths, postfix='', bad_words=('fuck', 'shit'), ignore_extensions=('ipynb',)):
        '''
        root_paths: LIST of directories to replace shit in. Must be a list. Cannot be a specific file.
        postfix: string to add to the end of the name of the output file. If input is 'myFile.csv' with a postfix of '_new', the scrubbed file would be 'myFile.csv_new'. No prefix overwrites the existing file.
        bad_words: optional list of words to replace in the files within root_paths
        ignore_extensions: file extensions to skip entirely

        Returns an audit log: {'ignored': [paths...], 'processed': [paths...]}.
        '''
        audit_log = {
            'ignored': [],
            'processed': []
        }
        for root_path in root_paths:
            for (root, dirs, files) in os.walk(root_path):
                for name in files:
                    full_path = self.__get_path(root, dirs, name)
                    # str.split never raises: a name with no '.' yields
                    # itself as the "extension" and falls through to
                    # processing, so the old bare try/except is unnecessary.
                    file_ext = name.split('.')[-1]
                    if file_ext in ignore_extensions:
                        print(f'ignoring {full_path}')
                        # add the filename to the ignored section of the audit_log
                        audit_log['ignored'].append(full_path)
                        continue
                    # otherwise, we'll erase shit in it
                    print(f'deleting shit in {full_path}')
                    self.erase_in_file(root + '/' + name, postfix=postfix, bad_words=bad_words)
                    audit_log['processed'].append(full_path)
        return audit_log

    def find_shit(self, root='./', bad_words=('fuck', 'shit'), include_content=True):
        '''
        root: the directory root where you want the crawl to start
        bad_words: list of words you want to search for.
            These can also be regular expressions. It will
            match partials, so 'fuck' will match 'fucking',
            'unfuckingbelievable', etc. Use \\b and similar
            to constrain to word boundaries.
        include_content: when True, also store the matching line's text.

        The meat and potatoes, this is what conducts the walk
        and writes the resultant dataframe to self.df.
        To export the df to a csv, use the write_df() method.
        '''
        # Compile the search pattern once for the whole walk (the old code
        # rebuilt it several times per line).
        pattern = re.compile(self.__makeRegex__(bad_words))
        for dirpath, dirs, files in os.walk(root):
            # keep records for each directory parsed
            file_df, line_no_df, badword_df, content_df = [], [], [], []
            for file in files:
                full_name = dirpath + '/' + file
                print('Searching {}'.format(full_name))
                try:
                    with open(full_name, "r") as openfile:
                        lines = openfile.readlines()
                except (OSError, UnicodeDecodeError):
                    # Unreadable or binary files are skipped, as before.
                    continue
                for line_no, content in enumerate(lines):
                    for badword in pattern.findall(content):
                        file_df.append(full_name)
                        line_no_df.append(line_no)
                        badword_df.append(badword)
                        # rstrip to remove newline character
                        if include_content:
                            content_df.append(content.rstrip())
            # write the records to the dataframe with each dir parsed
            columns = ['file', 'line_no', 'badword']
            data = {'file': file_df, 'line_no': line_no_df, 'badword': badword_df}
            if include_content:
                columns.append('content')
                data['content'] = content_df
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported replacement.
            self.df = pd.concat(
                [self.df, pd.DataFrame(data, columns=columns)],
                ignore_index=True
            )
        # change the silly auto-detected float line_no to an integer
        self.df['line_no'] = self.df['line_no'].astype(int)
        print("Dataframe successfully created, \n \
use <obj>.df to print the df, or \n \
.write_df() method to write to file.")

    def __get_path(self, root, dirs, name):
        '''
        returns the full path as a string

        dirs is accepted for backward compatibility but unused: os.walk's
        dirs are the subdirectories OF root, not components of this file's
        path. The previous implementation concatenated them without
        separators, producing garbled audit-log paths.
        '''
        return os.path.join(root, name)