-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword_cloud.py
73 lines (54 loc) · 2.08 KB
/
word_cloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import numpy as np
import pandas as pd
import pickle as pkl
import os
import sys
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from utils import stop_words, tokenize
def draw_word_cloud(df, topic, n_articles=50, save=True,
                    outdir='topic_browser/static/'):
    """Draw a word cloud from the top-weighted articles of one topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'topic_sorted', 'weight' and 'content'.
    topic : int
        Value of 'topic_sorted' that selects the articles to use.
    n_articles : int
        Maximum number of articles (highest 'weight' first) whose content
        is pooled into the cloud.
    save : bool
        When true, the figure is written to
        ``outdir + "topic_<topic>temp.png"`` and then closed.
    outdir : str
        Path prefix for the saved image; it is concatenated directly with
        the file name, so it should end with a path separator.

    Returns
    -------
    None
        If no article matches ``topic`` (or the tokenized text is empty)
        nothing is drawn and no file is written.
    """
    ftopic = df[df['topic_sorted'] == topic]
    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    ftopic = ftopic.sort_values('weight', ascending=False).head(n_articles)
    text = " ".join(ftopic['content'].values)

    # Remove stop words and lemmatize (done by utils.tokenize), then re-join
    # because WordCloud.generate() expects one string. The original author
    # noted that generate() tokenizes that string a second time.
    text = " ".join(tokenize(text))
    if not text.strip():
        # Nothing to draw for this topic.
        return

    wordcloud = WordCloud().generate(text)
    fig = plt.figure(figsize=(6, 4))
    plt.imshow(wordcloud)
    plt.axis("off")
    if save:
        fileName = outdir + "topic_" + str(topic) + "temp.png"
        plt.savefig(fileName, bbox_inches='tight')
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def assign_topics(df, modle, vectors, min_weight=0.5):
    """Assign each article its dominant topic and rank topics by size.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per article, in the same order as the rows of ``vectors``.
        The columns 'topic' and 'weight' are added to this frame in place.
    modle : object
        Fitted topic model exposing ``components_`` (the parameter name is
        kept as originally spelled so existing callers keep working).
        NOTE(review): presumably a (n_topics, n_features) matrix as in
        scikit-learn decompositions - confirm.
    vectors : array-like
        Document-term matrix supporting ``.dot``.
    min_weight : float
        Articles whose best topic weight is not strictly greater than this
        value are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df`` with an extra 'topic_sorted' column:
        0 for the topic with the most articles, 1 for the next, and so on.
    """
    # Use the model that was passed in rather than a module-level global.
    scores = vectors.dot(modle.components_.T)
    df['topic'] = list(np.argmax(scores, axis=1))
    df['weight'] = list(np.max(scores, axis=1))

    # Copy so that adding a column below never writes into a view of df.
    kept = df[df['weight'] > min_weight].copy()

    # Rename topics by descending article count. groupby sorts by topic id,
    # so `topics` and `counts` are aligned and ordered by id.
    sizes = kept.groupby('topic').size()
    topics = sizes.index.tolist()
    counts = sizes.values
    order = np.argsort(counts)[::-1]

    # Key the lookup by the actual topic id (not by its position in
    # `topics`), so topics that lost all articles to the weight filter do
    # not shift or break the mapping.
    rank_of = {topics[pos]: rank for rank, pos in enumerate(order)}
    kept['topic_sorted'] = kept['topic'].map(rank_of)
    return kept
if __name__ == "__main__":
    # Script entry point: build one trimmed word-cloud image per topic.
    # Usage: <script> <n_topics>

    # Local import: only the script path shells out to ImageMagick.
    import subprocess

    if len(sys.argv) < 2:
        sys.exit("usage: " + sys.argv[0] + " <n_topics>")
    s_topics = sys.argv[1]  # kept as a string for the output file names
    n_topics = int(s_topics)
    outdir = 'topic_browser/static/'

    # NOTE(review): pickle can execute arbitrary code while loading; only
    # run this against data files produced by this project.
    # Pickles are binary, so open them in 'rb' mode and close them promptly.
    with open('data/data_all.pkl', 'rb') as f:
        df = pkl.load(f)
    with open('data/model_' + str(n_topics) + '.pkl', 'rb') as f:
        model = pkl.load(f)
    with open('data/vectorizer.pkl', 'rb') as f:
        vectorizer, vectors = pkl.load(f)

    df = assign_topics(df, model, vectors)
    for i in range(n_topics):
        draw_word_cloud(df, i)

    # Trim the output word-cloud images with ImageMagick's `convert`, then
    # delete the untrimmed temporary files.
    for i in range(n_topics):
        fileName = outdir + 'topic_' + str(i) + 'temp.png'
        if not os.path.exists(fileName):
            # No image was produced for this topic; nothing to trim.
            continue
        trimmed = outdir + 'topic_' + s_topics + '_' + str(i) + '.png'
        # Argument list instead of a shell string: no quoting problems.
        # The exit status is ignored, as it was with os.system().
        subprocess.call(['convert', fileName, '-trim', trimmed])
        os.remove(fileName)