-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans.py
62 lines (55 loc) · 2.36 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import codecs
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from io import StringIO
from sklearn.cluster import KMeans as km
from sklearn.cluster import AffinityPropagation as ap
from sklearn.cluster import Birch as b
from sklearn.cluster import FeatureAgglomeration as fa
def arrayToDict(array):
    """Pair up a flat sequence into a dictionary.

    Elements at even positions become keys, and the element immediately
    after each one becomes its value. When a key occurs more than once the
    value from its last occurrence wins. An odd-length sequence raises
    IndexError because the final key has no partner.
    """
    return {array[pos]: array[pos + 1] for pos in range(0, len(array), 2)}
def getFile(fileName):
    """Read a text file and return its whitespace-separated tokens as a list.

    The file is decoded as UTF-8; byte sequences that cannot be decoded are
    silently dropped (errors='ignore'). Splitting uses str.split() with no
    arguments, so any run of spaces, tabs or newlines separates tokens.

    fileName -- path of the file to read
    """
    # The context manager closes the handle even when reading raises, so the
    # file descriptor is not left open until garbage collection.
    with codecs.open(fileName, 'r', encoding='utf8', errors='ignore') as handle:
        return handle.read().split()
def addEmojiCode(dimensions, cluster, emojiDict):
    """Group lookup entries by the cluster label predicted for each sample.

    dimensions -- samples handed unchanged to cluster.predict
    cluster    -- object exposing predict(dimensions), which yields one label
                  per sample
    emojiDict  -- indexable sequence read only at positions 0, 2, 4, ...; the
                  k-th predicted label is paired with emojiDict[2 * k]

    Returns a dict mapping each label to the list of entries that received
    it, in prediction order.
    """
    grouped = {}
    for row, label in enumerate(cluster.predict(dimensions)):
        # Stepping by two skips the odd positions (the icons, per the
        # original author's note), leaving only the emoji codes.
        grouped.setdefault(label, []).append(emojiDict[2 * row])
    return grouped
def organizeEnumeratedDictionary(dictionary):
    """Return a new dict holding the same entries in ascending key order.

    Only keys that are actually present are visited, so the keys do not have
    to form the contiguous run 0..len-1; a gap in the labels (for example a
    cluster label that no sample received) no longer raises KeyError. For a
    dict whose keys are exactly 0..len-1 the result has the same entries in
    the same order as before.

    dictionary -- mapping with mutually sortable keys

    The values are reused as-is, not copied.
    """
    return {key: dictionary[key] for key in sorted(dictionary)}
def emojiCodeToEmoji(clusterDict, emojiDict):
    """Translate every code stored in clusterDict through emojiDict, in place.

    clusterDict -- mapping whose values are lists of codes; each list is
                   modified element by element
    emojiDict   -- mapping from code to its replacement; a code that is
                   missing raises KeyError

    Returns the same clusterDict object that was passed in.
    """
    for codes in clusterDict.values():
        # Overwrite slot by slot so the caller's list objects are kept
        for slot, code in enumerate(codes):
            codes[slot] = emojiDict[code]
    return clusterDict
# Read the lookup file once: the same token list feeds both the code -> emoji
# dictionary and the positional code lookup used while grouping by cluster.
lookupTokens = getFile('emoji_lookup.tsv')
tsvToEmojiDict = arrayToDict(lookupTokens)  # dictionary to translate emoji codes over

# Load the whitespace-separated table into pandas. The handle is closed as
# soon as the text has been read; undecodable bytes are dropped.
with codecs.open('emojis.txt', 'r', encoding='utf8', errors='ignore') as emojiFile:
    emojiDataFrame = pd.read_csv(StringIO(emojiFile.read()), sep=r'\s+')

# Every column except the first becomes the feature matrix for clustering.
# DataFrame.as_matrix was removed in pandas 1.0; to_numpy() is its replacement.
dimensions = emojiDataFrame.iloc[:, 1:].to_numpy()

# No random_state is passed, so the initialisation differs between runs.
cluster = km(n_clusters=100, max_iter=10000000)  # k-means clustering
# Alternative estimators that were tried:
# cluster = ap(max_iter=1000000)  # affinity propagation clustering
# cluster = b(n_clusters=200)  # Birch clustering
cluster.fit(dimensions)

# label -> emoji codes, then ordered by label, then codes swapped for emojis
codedCluster = addEmojiCode(dimensions, cluster, lookupTokens)
organizedClusters = organizeEnumeratedDictionary(codedCluster)
emojiClusters = emojiCodeToEmoji(organizedClusters, tsvToEmojiDict)

for z in emojiClusters:
    print("Clusters" + str(z))
    print(emojiClusters[z])
    print("\n")