-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtesting_file.py
199 lines (127 loc) · 6.79 KB
/
testing_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# Generate dat file with tweets
from builtins import str
from gensim import corpora # to create a dictionary
import os
import csv
import dtmmodel
class FileMaker:
    """Build the input files for the DTM binary from preprocessed tweet files.

    For every entry in ``timestamps`` a file with that name is read from
    ``path`` (one tweet per line, tokens separated by whitespace).  Three files
    are written under ``data/``:

    * ``metadata.dat`` -- tab-separated ``id``, ``date`` and ``content`` rows.
    * ``-seq.dat``     -- number of time slices, then one tweet count per slice.
    * ``-mult.dat``    -- one bag-of-words line per tweet (``N id:count ...``).

    The ``-seq.dat`` / ``-mult.dat`` names pair with the ``--corpus_prefix=data/``
    option that ``DTModeller.make_dtm`` passes to the DTM binary.
    """

    def __init__(self, path, timestamps):
        """Remember the inputs and open ``data/metadata.dat`` for writing.

        Args:
            path: Directory holding one preprocessed file per timestamp.
            timestamps: Ordered file names, one per time slice.
        """
        self.path = path
        self.timestamps = timestamps
        # Every output goes under data/; create it so the open() below cannot
        # fail merely because the directory is missing.
        os.makedirs('data', exist_ok=True)
        # Opened eagerly so the attribute stays available to existing callers;
        # writeToFiles() is responsible for closing it.
        self.dat_outfile = open(os.path.join('data', 'metadata.dat'), 'w', encoding='utf-8')
        self.tweets = list()
        # Tokenised tweets, in the order they are written to metadata.dat.
        self.cont = []

    def writeToFiles(self):
        """Write metadata.dat, -seq.dat and -mult.dat (see class docstring).

        Building the ``MyCorpus`` for -mult.dat also saves the dictionary and
        vocabulary files as a side effect of its constructor.

        Raises:
            ValueError: If called a second time (the metadata file is closed).
            OSError: If a timestamp file cannot be read or an output written.
        """
        total_tweets_list = []  # tweets per time slice, aligned with self.timestamps
        tweet_id = 0  # running id across all time slices

        # `with` guarantees the metadata file is closed even when reading fails.
        with self.dat_outfile as metadata:
            metadata.write('id\tdate\tcontent\n')  # write header
            for timestamp in self.timestamps:
                total_tweets = 0
                source = os.path.join(self.path, timestamp)
                with open(source, 'r', encoding='utf-8') as preprocessed:
                    for line in preprocessed:
                        # Normalise the line ending: a final line without a
                        # trailing newline must not run into the next row.
                        text = line.rstrip('\n')
                        metadata.write(str(tweet_id) + '\t' + timestamp + '\t' + text + '\n')
                        self.cont.append(text.split())
                        tweet_id += 1
                        total_tweets += 1
                total_tweets_list.append(total_tweets)

        # Write seq file: number of time slices, then the tweet count of each.
        with open(os.path.join('data', '-seq.dat'), 'w', encoding='utf-8') as seq_outfile:
            seq_outfile.write(str(len(total_tweets_list)) + '\n')
            for count in total_tweets_list:
                seq_outfile.write(str(count) + '\n')

        print('Done collecting tweets and writing seq')

        corpus_memory_friendly = MyCorpus(cont=self.cont)

        with open(os.path.join('data', '-mult.dat'), 'w', encoding='utf-8') as mult_file:
            for vector in corpus_memory_friendly:  # one bag-of-words vector at a time
                mult_file.write(str(len(vector)) + ' ')
                for word_id, weight in vector:
                    mult_file.write(str(word_id) + ':' + str(weight) + ' ')
                mult_file.write('\n')

        print('Mult file saved')
class MyCorpus(object):
    """Iterable bag-of-words corpus over already tokenised documents.

    Constructing an instance builds a gensim dictionary from ``cont`` and, as a
    side effect, writes ``data/dictionary.dict`` and ``data/vocabulary.dat``.
    Iterating yields one ``doc2bow`` vector per document.
    """

    def __init__(self, cont=None):
        """Store the documents and build the dictionary.

        Args:
            cont: List of documents, each a list of string tokens.  ``None``
                is treated as an empty corpus so the object stays iterable.
        """
        self.cont = [] if cont is None else cont
        self.prepare_dictionary()

    def __iter__(self):
        """Yield the bag-of-words vector of each document, in input order."""
        for tokens in self.cont:
            # Each document is already a list of tokens.
            yield self.dictionary.doc2bow(tokens)

    def prepare_dictionary(self):
        """Build, prune and persist the dictionary and the vocabulary file.

        Tokens in the stop list and tokens that occur in only one document are
        removed before saving.
        """
        # List of stop words which can also be loaded from a file.
        stop_list = set('for a of the and to in'.split())

        self.dictionary = corpora.Dictionary(self.cont)
        token2id = self.dictionary.token2id

        # Ids of the stop words that actually occur in the corpus.
        stop_ids = [token2id[stop_word] for stop_word in stop_list if stop_word in token2id]
        # Ids of the tokens with a document frequency of one.
        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items() if docfreq == 1]

        self.dictionary.filter_tokens(stop_ids + once_ids)
        # Close any id gaps left by the removal.  This is a no-op on gensim
        # versions whose filter_tokens() already compactifies.
        self.dictionary.compactify()

        # Store the dictionary, for future reference.
        self.dictionary.save(os.path.join('data', 'dictionary.dict'))

        # Save vocabulary in explicit id order: line N must hold the word with
        # id N, because -mult.dat refers to words by id.
        by_id = sorted(self.dictionary.token2id.items(), key=lambda item: item[1])
        with open(os.path.join('data', 'vocabulary.dat'), 'w', encoding='utf-8') as voc_file:
            for word, _ in by_id:
                voc_file.write(word + '\n')

        print('Dictionary and vocabulary saved')
class DTModeller:
    """Fit a dynamic topic model with the external DTM binary and export topics.

    ``make_dtm`` must run before ``writeTopics``: it is what sets ``self.dtm``.
    """

    # Number of topics requested from the DTM binary; writeTopics() writes one
    # CSV file per topic, so both must use the same value.
    NUM_TOPICS = 5
    # Number of top words requested per topic and time slice.
    TOP_WORDS = 5

    def make_dtm(self):
        """Run the DTM binary on ``data/`` and load its output into ``self.dtm``."""
        command = ' '.join([
            'dtm-win64.exe',
            './main',
            '--ntopics={}'.format(self.NUM_TOPICS),
            '--mode=fit',
            '--rng_seed=12000',
            '--initialize_lda=true',
            '--corpus_prefix=data/',
            '--outname=data/output',
            '--top_chain_var=0.9',
            '--alpha=0.01',
            '--lda_sequence_min_iter=1',
            '--lda_sequence_max_iter=3',
            '--lda_max_em_iter=4',
        ])
        # NOTE(review): the exit status is not checked, so a failed run only
        # surfaces when the output is loaded below -- confirm whether a stale
        # data/output from an earlier run could be picked up instead.
        os.system(command)
        # Import to tethne
        self.dtm = dtmmodel.from_gerrish('data/output/', 'data/metadata.dat', 'data/vocabulary.dat')

    def writeTopics(self, numtimes, months):
        """Write ``OutputDTM<k>.csv`` for every topic ``k``.

        Each file has a header row and one row per time slice holding the topic
        id, the top words joined by spaces, the slice label and the summed
        probability of those words.

        Args:
            numtimes: Number of time slices to export.
            months: Label for each time slice; needs at least ``numtimes`` items.
        """
        for topic_id in range(self.NUM_TOPICS):
            filename = 'OutputDTM{}.csv'.format(topic_id)
            # newline='' is what the csv module requires; without it every row
            # is followed by a blank line on Windows.
            with open(filename, 'w', encoding='utf-8', newline='') as csvfile:
                writer = csv.writer(csvfile, delimiter=',')
                # Write header
                writer.writerow(['TopicID', 'Word', 'Year', 'Probability'])
                for time_index in range(numtimes):
                    top_words = self.dtm.print_topic(topic_id, time_index, self.TOP_WORDS)
                    words = []
                    count_prob = 0
                    # Each entry is indexed as (word, probability).
                    for entry in top_words:
                        words.append(entry[0])
                        count_prob = count_prob + entry[1]
                    writer.writerow([topic_id, ' '.join(words), months[time_index], count_prob])
        print("done everything")
class Handler:
    """Run the whole pipeline for a contiguous range of monthly tweet files.

    The supported months are the keys listed in ``self.times`` (``YYYYMM``
    strings); ``self.months`` maps each key to its display label.
    """

    def __init__(self, storeFolder=None, start=None, end=None):
        """Resolve the requested month range.

        Args:
            storeFolder: Directory holding one preprocessed file per month key.
            start: First month key of the range (inclusive).  ``None`` selects
                the earliest supported month.
            end: Last month key of the range (inclusive).  ``None`` selects
                the latest supported month.

        Raises:
            ValueError: If ``start`` or ``end`` is not a supported month key.
        """
        self.path = storeFolder
        self.times = ["201810", "201811", "201812", "201901", "201902", "201903", "201904", "201905", "201906"]
        self.months = {
            "201810": "October 2018",
            "201811": "November 2018",
            "201812": "December 2018",
            "201901": "January 2019",
            "201902": "February 2019",
            "201903": "March 2019",
            "201904": "April 2019",
            "201905": "May 2019",
            "201906": "June 2019",
        }
        self.start = self._index_of(start, 0)
        self.end = self._index_of(end, len(self.times) - 1)

    def _index_of(self, key, default):
        """Return the position of ``key`` in ``self.times``, or ``default`` for None."""
        if key is None:
            return default
        try:
            return self.times.index(key)
        except ValueError:
            raise ValueError(
                'unsupported month {!r}; expected one of {}'.format(key, ', '.join(self.times))
            )

    def handle(self):
        """Write the DTM input files, fit the model and export the topic CSVs.

        Sets ``self.timestamps`` (month keys) and ``self.monthstamps`` (their
        display labels) for the selected range before running the pipeline.
        """
        # Both bounds are inclusive; a start after the end yields an empty range.
        self.timestamps = self.times[self.start:self.end + 1]
        self.monthstamps = [self.months[stamp] for stamp in self.timestamps]
        print(self.timestamps)

        FileMaker(path=self.path, timestamps=self.timestamps).writeToFiles()

        mymodel = DTModeller()
        mymodel.make_dtm()
        mymodel.writeTopics(numtimes=len(self.timestamps), months=self.monthstamps)