# svd_recommend.py
from __future__ import division
##
import numpy as np
import scipy.linalg
from prettyplotlib import plt
#import matplotlib.pyplot as plt
from scipy.io import mmread
from sklearn.metrics import mean_squared_error
##
from incremental_svd2 import incremental_SVD, test_perf
def test_perf(X, test, uk, sk, vk, r):
prediction = np.zeros(test.shape[0])
for n in xrange(test.shape[0]):
i, j = test[n, :2] - 1
prediction[n] = r[i] + np.dot(np.dot(uk, np.sqrt(sk).T)[i, :], np.dot(np.sqrt(sk), vk)[:, j])
return prediction
def get_error(U, S, V, train, test, r):
    """Return the root-mean-squared error of the predictions on ``test``.

    Predictions come from ``test_perf(train, test, U, S, V, r)``; entries
    whose prediction is NaN are left out of the score.  The reference ratings
    are read from the third column of ``test``.
    """
    predicted = test_perf(train, test, U, S, V, r)
    usable = np.logical_not(np.isnan(predicted))
    # Arguments stay in the order (predictions, reference ratings).
    mse = mean_squared_error(predicted[usable], test[usable, 2])
    return np.sqrt(mse)
def normalize_ratings(train):
    """Fill unrated cells with the row average, then centre every column.

    ``train`` is modified in place and must be a float ``ndarray`` or
    ``np.matrix`` in which 0 marks "no rating".  Both sets of averages are
    taken over the non-zero entries of the *original* matrix, before any
    cell is filled.

    Masks are used instead of ``np.argwhere``: on a 1xN / Mx1 ``np.matrix``
    slice ``argwhere`` returns (row, col) pairs, and using those pairs as a
    fancy index silently mixes element 0 into every average and every fill.

    Returns
    -------
    (prod_avg, cust_avg) : float32 arrays holding the per-row and per-column
        averages.  A row or column without any rating gets NaN, and that NaN
        is written into the matrix as well.
    """
    ratings = np.asarray(train)  # plain-ndarray view; writes reach ``train``
    rated = ratings != 0

    # Zeros add nothing to the sums, so sum / count is the mean of the rated
    # cells.  0 / 0 (nothing rated) is meant to yield NaN without a warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        prod_avg = (ratings.sum(axis=1) / rated.sum(axis=1)).astype(np.float32)
        cust_avg = (ratings.sum(axis=0) / rated.sum(axis=0)).astype(np.float32)

    # Replace empties by the average product (row) rating.
    np.copyto(ratings, prod_avg[:, np.newaxis], where=~rated)
    # Normalize all ratings by the average customer (column) rating.
    ratings -= cust_avg
    return prod_avg, cust_avg


def select_test_triples(test, shape, limit=None):
    """Return the rows of ``test`` whose ids fall inside a matrix of ``shape``.

    ``test`` holds one triple per row; its first two columns are 1-based
    (row, column) ids.  Only the first ``limit`` triples are considered
    (all of them when ``limit`` is None).
    """
    n_rows, n_cols = shape
    head = test[:limit]
    rows, cols = head[:, 0], head[:, 1]
    inside = (rows >= 1) & (rows <= n_rows) & (cols >= 1) & (cols <= n_cols)
    return head[inside]


if __name__ == '__main__':
    # ``np.float`` was an alias of the builtin ``float`` and has been removed
    # from current NumPy; the builtin gives the same float64 dtype.
    raw_train = np.matrix(mmread('subset_train.mtx').todense(), dtype=float)
    # ndmin=2 keeps a one-line test file two-dimensional.
    test = np.loadtxt('subset_test.txt', dtype=np.int32, ndmin=2)
    maxX, maxY = 2000, 100
    train = raw_train[0:maxX, 0:maxY]
    # ``test`` is a list of triples, not a matrix, so it cannot be cropped
    # with the same slice as ``train``.  Drop triples that point outside the
    # cropped matrix instead.
    # NOTE(review): the cap on the first ``maxX`` triples is kept from the
    # previous ``test[0:maxX, ...]`` slice - confirm it is wanted.
    test = select_test_triples(test, train.shape, limit=maxX)
    print('Using matrix of size {}'.format(train.shape))

    # NOTE(review): the matrix is centred by column averages, while test_perf
    # adds the row average (``prod_avg``) back - confirm this is intended.
    prod_avg, cust_avg = normalize_ratings(train)

    print('Using matrix of size {}'.format(train.shape))
    print(train[0:10, 0:10])

    # Exact-SVD baseline, currently disabled.
    # NOTE(review): the get_error call below lacks the ``r`` argument that
    # get_error requires; fix it before re-enabling.
    #
    # print('Testing SVD')
    # svdX = []
    # svdY = []
    # u, s, vT = scipy.linalg.svd(train)
    # for k in range(1, 100):
    #     low_s = [s[i] for i in range(k)]
    #     print('Exact SVD with low-rank approximation {}'.format(k))
    #     svdX.append(k)
    #     svdY.append(get_error(u, np.diag(low_s), vT, train, test))
    # plt.plot(svdX, svdY, label="SVD", color='black', linewidth='2', linestyle='--')

    print('')
    print('Testing incremental SVD')
    for num in range(400, 1001, 300):
        print('... with block size of {}'.format(num))
        X, Y = [], []
        for k in range(1, 91, 10):
            print(k)
            u, s, vT = incremental_SVD(train, k, num)
            X.append(k)
            Y.append(get_error(u, s, vT, train, test, prod_avg))
        plt.plot(X, Y, label='iSVD u={}'.format(num))

    plt.title('Recommendation system RMSE on {}x{} matrix'.format(*train.shape))
    plt.xlabel('Low rank approximation (k)')
    plt.ylabel('Root Mean Squared Error')
    #plt.ylim(0, max(svdY))
    plt.legend(loc='best')
    plt.savefig('recommend_rmse_{}x{}.pdf'.format(*train.shape))
    plt.show(block=True)