-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathanalysis.py
78 lines (60 loc) · 2.33 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import database as db
from datamodel import *
import time
import datetime as dt
import pylab as pl
import pandas as pd
import numpy as np
# Fetch (author, comment_id, created_utc) for every row the query returns.
s = db.Session()
data = s.query(Comment.author, Comment.comment_id, Comment.created_utc).all()

# Split the result rows into one list per column.
author = [name for name, _, _ in data]
comment_id = [ident for _, ident, _ in data]

# created_utc is fed to time.gmtime, i.e. treated as seconds since the epoch;
# the resulting UTC struct_time is turned into a naive datetime.
timestamps = [time.gmtime(epoch) for _, _, epoch in data]
# The first six struct_time fields are year, month, day, hour, minute, second.
created_dates = [dt.datetime(*t[:6]) for t in timestamps]

# One row per comment, indexed by its creation time.
df = pd.DataFrame(
    {'author': author, 'comment_id': comment_id},
    index=created_dates,
)
# Total comments per (author, hour of day). These are raw counts rather than
# a percent of each author's total; if cosine similarity is used for
# clustering, the normalization shouldn't make a difference.
grouped = df.groupby(['author', lambda stamp: stamp.hour]).aggregate(len)
# unstack() moves the innermost index level (the hour) into the columns,
# leaving one row per author.
grid = grouped.unstack()
# Plot the transposed grid so the hour runs along the x axis.
pl.plot(grid.transpose())
pl.show()
# Add more features by breaking the counts out by day of week as well as
# hour of day.
# TODO: switching the grouping lambdas around would probably simplify the
# unstacking, and .agg(len) can probably become .count().
grouped2 = df.groupby(
    ['author', lambda x: x.dayofweek, lambda x: x.hour]
).agg(len)
# Move the day-of-week level (level 1) and then the remaining innermost level
# (hour) into the columns, leaving one row per author.
grid2 = grouped2.unstack(1).unstack()
# Combinations with no comments come out as NaN after unstacking; zero them.
# Mainly needed for PCA, but can't hurt elsewhere.
grid2 = grid2.fillna(0)
pl.plot(grid2.T)
# The original had a bare `pl.show` (no call), so this figure was never
# displayed and later plots were drawn on top of it.
pl.show()
# PCA: project the grid2 feature table onto its first two principal
# components and scatter-plot the projection.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
# fit() returns the fitted estimator, so the transform can be chained.
grid2_pca = pca.fit(grid2).transform(grid2)
pl.scatter(grid2_pca[:, 0], grid2_pca[:, 1])
pl.show()
# Filter on the best data: keep only the rows of grid2 whose values sum to
# more than 900, then repeat the 2-D PCA projection on that subset.
filtered = grid2.loc[grid2.sum(axis=1) > 900]
# NOTE: this re-fits the existing `pca` object, replacing its earlier fit.
f_pca = pca.fit(filtered).transform(filtered)
pl.scatter(f_pca[:, 0], f_pca[:, 1])
pl.show()
# An attempt at spectral clustering. It doesn't really show anything on the
# plot, anyway.
# SpectralClustering lives in sklearn.cluster (there is no sklearn.clustering).
from sklearn.cluster import SpectralClustering
spec = SpectralClustering(n_clusters=2)
spec.fit(grid2)
# Palette of single-letter matplotlib colour codes, repeated so that any
# cluster label can be used directly as an index into it.
colors = np.array(list('bgrcmyk' * 4))
colors = np.hstack([colors] * 20)
# Map each row's cluster label to a colour. The builtin int replaces the
# np.int alias, which has been removed from NumPy.
col = colors[spec.labels_.astype(int)].tolist()
# Plot in the PCA projection of grid2 (the same data `spec` was fitted on);
# the original referenced an undefined name, `grid_pca`.
x, y = grid2_pca.T[0], grid2_pca.T[1]
pl.scatter(x, y, color=col)
# Display the figure, as every other plotting section of this script does.
pl.show()
## Don't really know anything about Ward hierarchical clustering,
## but this works as a proof of concept.
## For visualization purposes, we could probably get away
## with some PCA here as well, maybe make this 3D.
#from sklearn.cluster import Ward
#ward = Ward(n_clusters=10).fit(grid)
#label = ward.labels_
#
#for l in np.unique(label):
# pl.plot(grid[label==l].T, color=plt.cm.jet(np.float(l) / np.max(label + 1)))