-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcluster.py
40 lines (28 loc) · 985 Bytes
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
""" WORD CLUSTERS (K-MEANS) """
import json
import numpy as np
from myutils import debug
# gensim modules
from gensim.models import Doc2Vec
# KMeans clustering
from sklearn.cluster import KMeans
# Model persistence
from sklearn.externals import joblib
# Script flow: load the configured Doc2Vec model, cluster its word vectors
# with KMeans, persist the fitted estimator, and write a word -> cluster-id
# mapping to 'cluster_cache.json'.

# Read the project configuration; the `with` block closes the handle
# deterministically instead of leaving it to the garbage collector.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# word -> cluster id, filled in after KMeans has been fitted.
cluster_cache = {}

debug('====== IMPORTING DOC2VEC MODEL ======')
modelPath = config['DOC2VEC']['full']['path']
modelName = config['DOC2VEC']['full']['name']
doc2vec = Doc2Vec.load(modelPath + modelName)

debug('====== CONSTRUCTING DATA POINTS ======')
# Materialise the vocabulary once so the row order of X and the label lookup
# further down are guaranteed to use the same ordering.
vocab = list(doc2vec.vocab.keys())
# Convert the vectors to float64 while building the array. Assigning to
# `X.dtype` does NOT convert values: it reinterprets the underlying bytes, so
# a float32 matrix would end up with half as many columns and meaningless
# numbers (NOTE(review): the stored vectors are presumably float32 - confirm).
X = np.array([doc2vec[w] for w in vocab], dtype=np.float64)

debug('====== RUNNING KMEANS ======')
kmeans = KMeans(n_clusters=1000).fit(X)
joblib.dump(kmeans, 'models/cluster/kmeans.pkl')

debug('====== SAVING RESULTS ======')
# kmeans.labels_[i] is the cluster assigned to row i of X, i.e. to vocab[i];
# int() turns the NumPy integer into a plain int that json can serialise.
for i, w in enumerate(vocab):
    cluster_cache[w] = int(kmeans.labels_[i])
# Write the mapping once, and close (hence flush) the file explicitly.
with open('cluster_cache.json', 'w') as cache_file:
    json.dump(cluster_cache, cache_file)

debug('====== FINISHED ======')