# -*- coding: utf-8 -*-
__author__ = 'thirumal'
import nltk
import os
import pickle
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from ast import literal_eval

class Ngram:

    def __init__(self, path):
        self.path = path
        self.email_body_list = []  # raw bodies of all emails, filled in by loadCorpus()

    def loadCorpus(self):
        '''
        Loads the corpus: reads every file under the corpus path and concatenates
        them into a single large file, result.txt, in the working directory.
        :return: the raw text of the entire corpus as one string
        '''
        corpus_root = self.path
        rawcontent = PlaintextCorpusReader(corpus_root, ".*")
        read_files = rawcontent.fileids()
        print 'files to be read ' + str(read_files)
        with open("result.txt", "wb") as outfile:
            for f in read_files:
                if f != '.DS_Store':
                    with open(os.path.join(self.path, f), "rb") as infile:
                        outfile.write(infile.read())
        with open("result.txt") as resultfile:
            raw = resultfile.read()
        email_body_dict = {}
        i = 0
        # Emails in the concatenated corpus are separated by a line of '#'
        # characters; everything after the 'Body:' marker is the message body.
        for email in raw.split('##########################################################'):
            i += 1
            body = email.split('Body:')[-1]
            self.email_body_list.append(body)
            email_body_dict[i] = body
        self.save_obj(email_body_dict, 'emailBodyDict')
        return raw

    def createTokens(self, emailText):
        '''
        :param emailText: the corpus text as a single string
        :return: a list of tokens
        '''
        tokens = nltk.wordpunct_tokenize(emailText)
        return tokens
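
    # Illustrative example (not from the original source): wordpunct_tokenize
    # splits on whitespace and punctuation, so 'Hi, team!' would come back as
    # ['Hi', ',', 'team', '!'].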

    def preprocessData(self, tokens):
        '''
        Removes punctuation and English stop words (plus a custom stop list).
        :param tokens: list of tokens for the entire corpus to be preprocessed
        :return: preprocessed corpus, free of punctuation and stop words, of type nltk.Text
        '''
        custom_stop_words = ['docid', 'segmentnumber', 'body', 'x-from', 'x-to', 'x-cc', 'x-bcc', 'x-folder',
                             'x-origin', 'x-fileName', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                             'saturday', 'sunday', 'section', 'thanks', 'content-type', 'x-origin', 'c-fileName',
                             'mime-version', 'subject', 'from', 'content- transfer-encoding', 'message-id',
                             'enron', 'com', 'january', 'february', 'march', 'april', 'may', 'june', 'july',
                             'august', 'september', 'october', 'november', 'december', 'enron.com', 'inc.',
                             'moreover', 'u.s', 'henry', 'news', 'copyright', 'mon', 'tue', 'wed', 'thu', 'fri',
                             'sat', 'sun', 'pdt', 'pst', 'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep',
                             'oct', 'nov', 'dec']
        stop_words = set(stopwords.words("english"))
        stop_words.update(custom_stop_words)
        stoppedWords = []
        for token in tokens:
            if token.lower() not in stop_words:
                stoppedWords.append(self.ensure_unicode(token))
        words = [w for w in stoppedWords if w.isalpha()]   # keep purely alphabetic tokens
        words_new = [w for w in words if len(w) > 2]       # drop tokens of one or two characters
        text = nltk.Text(words_new)
        return text
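
    # Illustrative sketch of the filtering above: from tokens like
    # ['The', 'meeting', ',', 'on', 'Monday', 'is', 'rescheduled', 'ok'],
    # stop-word removal drops 'The', 'on', 'is' and the custom stop word
    # 'monday'; isalpha() drops ','; the length filter drops 'ok', leaving
    # ['meeting', 'rescheduled'].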

    def stemWords(self, text):
        '''
        Stems every word in the corpus with the Porter stemmer.
        :param text: a list of all words after preprocessing is done
        :return: list of stemmed words
        '''
        ps = PorterStemmer()
        stemmedWords = []
        for w in text:
            t = self.ensure_unicode(w)
            stemmedWords.append(ps.stem(t))
        return stemmedWords
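
    # Illustrative: the Porter stemmer maps inflected forms to a common stem,
    # e.g. ps.stem(u'trading') -> u'trade' and ps.stem(u'meetings') -> u'meet'.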

    def ensure_unicode(self, v):
        # Python 2: decode byte strings as latin-1, then coerce anything that is
        # not already a string to unicode as well.
        if isinstance(v, str):
            v = v.decode('latin-1')
        return unicode(v)

    def createBigrams(self, tokens):
        '''Returns all adjacent word pairs in the token list.'''
        custom_bigrams = list(nltk.bigrams(tokens))
        return custom_bigrams

    def createTrigrams(self, tokens):
        '''Returns all adjacent word triples in the token list.'''
        custom_trigrams = list(nltk.trigrams(tokens))
        return custom_trigrams
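
    # Illustrative: nltk.bigrams / nltk.trigrams yield adjacent tuples, so the
    # token list ['gas', 'price', 'forecast'] gives the bigrams
    # [('gas', 'price'), ('price', 'forecast')] and the single trigram
    # [('gas', 'price', 'forecast')].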

    def createPOSTaggingBySentence(self, sentences):
        '''
        POS-tags each string in a list separately. (Renamed so it no longer
        collides with createPOSTagging below, which tags a flat token list.)
        :param sentences: list of strings; each is tokenized and tagged on its own
        :return: list of POS-tagged token lists, one per input string
        '''
        posTaggedList = []
        try:
            for sentence in sentences:
                words = nltk.word_tokenize(sentence)
                posTagged = nltk.pos_tag(words)
                posTaggedList.append(posTagged)
        except Exception as e:
            print str(e)
        print 'POS tagging completed successfully'
        return posTaggedList

    def bigramfrequencyDistribution(self, tokens, custom_bigrams):
        '''
        Computes a frequency distribution over every bigram in the corpus.
        :param tokens: words of the corpus as a list of tokens
        :param custom_bigrams: bigrams as a list of tuples
        :return: dict mapping each bigram to its frequency
        '''
        uniqWords = sorted(set(tokens))  # unique words in the corpus
        print len(uniqWords)
        bigrams_freq = nltk.FreqDist(custom_bigrams)
        bigrams_freq_dict = {}
        for k, v in bigrams_freq.items():
            bigrams_freq_dict[k] = v
        return bigrams_freq_dict
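
    # Illustrative: FreqDist counts repeats, so the bigrams
    # [('gas', 'price'), ('gas', 'price'), ('price', 'forecast')] yield
    # {('gas', 'price'): 2, ('price', 'forecast'): 1}.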

    def writeToFile(self, text, filename):
        '''
        :param text: list of items to be written to the file, one per line; each
                     item is itself iterated, so tuples such as bigrams work well
        :param filename: name of the file to be created (without extension)
        :return: void
        '''
        filename = filename + '.txt'
        with open(filename, "wb") as fileobject:
            for line in text:
                fileobject.write(' '.join(str(s) for s in line) + '\n')

    def writeDictToFile(self, d, filename):
        # Writes the dict's literal representation; readDictFromFile parses it
        # back with ast.literal_eval.
        with open(filename + '.txt', 'w') as f:
            f.write(str(d))

    def readDictFromFile(self, filename):
        with open(filename + '.txt', 'r') as f:
            return literal_eval(f.read())

    def createPOSTagging(self, tokens):
        '''
        POS-tags a flat list of word tokens.
        :param tokens: list of word tokens
        :return: list of (word, tag) tuples
        '''
        posTagged = nltk.pos_tag(tokens)
        print 'POS tagging completed successfully'
        return posTagged

    def removeProperNouns(self, postaggedList):
        '''
        Removes proper nouns.
        :param postaggedList: a list of tuples, e.g. [('Michael', 'NNP'), ('Jackson', 'NNP')]
        :return: list of words whose tag is neither NNP nor NNPS
        '''
        listWithoutNouns = [word for word, pos in postaggedList if pos not in ['NNP', 'NNPS']]
        return listWithoutNouns
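
    # Illustrative: [('Michael', 'NNP'), ('sent', 'VBD'), ('mail', 'NN')]
    # becomes ['sent', 'mail'] once NNP/NNPS entries are filtered out.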

    def save_obj(self, obj, objname):
        '''
        Dumps a Python object to a .pkl file in the working directory.
        :param obj: Python object that needs to be written to a file
        :param objname: name that will be used to create the .pkl file
        :return: void
        '''
        with open(os.path.join(os.getcwd(), objname + '.pkl'), 'wb') as f:
            pickle.dump(obj, f)

    def load_obj(self, objname):
        '''
        Reads a .pkl file back into a Python object.
        :param objname: name of the .pkl file (without extension)
        :return: the unpickled Python object
        '''
        with open(os.path.join(os.getcwd(), objname + '.pkl'), 'rb') as f:
            return pickle.load(f)
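

# A minimal usage sketch, not part of the original module; the corpus path is
# hypothetical, and NLTK's tokenizer, stop-word, and POS-tagger data must
# already be downloaded for the tokenize/pos_tag calls to succeed.
if __name__ == '__main__':
    ngram = Ngram('/path/to/corpus')            # hypothetical corpus directory
    raw = ngram.loadCorpus()                    # concatenates files into result.txt
    tokens = ngram.createTokens(raw)
    text = ngram.preprocessData(tokens)         # strip stop words and punctuation
    tagged = ngram.createPOSTagging(list(text))
    words = ngram.removeProperNouns(tagged)
    stems = ngram.stemWords(words)
    bigrams = ngram.createBigrams(stems)
    freqs = ngram.bigramfrequencyDistribution(stems, bigrams)
    ngram.writeDictToFile(freqs, 'bigramFreq')  # re-readable via readDictFromFile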