Commit

adding batch eigenvalues timing example
pgupta29 committed Oct 9, 2022
1 parent fd602aa commit d9e3dda
Showing 2 changed files with 72 additions and 9 deletions.
67 changes: 67 additions & 0 deletions demos/batch_eigenvalues.py
@@ -0,0 +1,67 @@
#%%
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from proSVD import proSVD

#%%
# make a large, high-dimensional dataset
n = 100000 # number of samples
p = 1000 # dimension of sample
X = np.random.uniform(-1, 1, size=(n,p))
X -= X.mean(axis=0)[None, :]
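# note: sklearn's PCA centers internally, so centering X here is what makes
# the raw SVD below line up with the PCA results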

# do PCA and SVD
pca = PCA()
startpca = time.time()
pca.fit(X)
print(f'sklearn PCA took {time.time()-startpca:.2f} s')

startsvd = time.time()
u, s, v = np.linalg.svd(X, full_matrices=False)
print(f'numpy svd took {time.time()-startsvd:.2f} s')

# eigenvalues of covariance = singular values squared / (n_samples-1)
print(np.allclose((s**2)/(n-1), pca.explained_variance_))
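# why this holds: with X centered, cov(X) = X.T @ X / (n-1); substituting the
# SVD X = U @ diag(s) @ V.T gives cov(X) = V @ diag(s**2) @ V.T / (n-1),
# so the covariance eigenvalues are s**2/(n-1) -- sklearn's explained_variance_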

#%%
# proSVD takes data in shape (num_dimensions, num_samples)
X = X.T  # transpose, not reshape: reshape((p, n)) would scramble samples across columns

# proSVD parameters
k = p # dimension to reduce to (keeping all p dims as an example)
n_inits = 10000 # number of columns (samples) used to get the initial basis
n_samples_update = n-n_inits # number of columns (samples) used per update iteration
decay_alpha = 1 # how much discounting of old data (sets effective window size, alpha=1 is all seen data)

# get number of iterations for entire dataset
num_iters = 1 # single update pass here; in general: np.floor((X.shape[1]-n_inits-n_samples_update)/n_samples_update).astype('int')
update_times = np.arange(n_inits, num_iters*n_samples_update, n_samples_update) # indices of when updates happen (not including init)

# make proSVD object, run
pro = proSVD(k, n_samples_update, decay_alpha=decay_alpha, trueSVD=True)
pro.initialize(X[:, :n_inits])
startpro = time.time()
for i, t in enumerate(update_times):
    start, end = t, t+n_samples_update
    dat = X[:, start:end]

    pro.preupdate()
    pro.updateSVD(dat)
    pro.postupdate()
print(f'proSVD took {time.time()-startpro:.2f} s')
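# rough sanity check (assumption: pro.S holds the current singular values,
# the same quantity plotted against s below); after one pass over all the
# data with alpha=1, the two should agree closely
print('max gap vs batch singular values:', np.abs(np.sort(pro.S)[::-1] - s).max())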

#%%
# visualize
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(s, label='true SVD')
ax[0].plot(pro.S, label='proSVD')

ax[1].plot(pca.explained_variance_, label='PCA eigenvalues')
ax[1].plot((pro.S**2)/(n-1), label='proSVD eigenvalues')

for currax in ax:
    currax.legend()

#%%
14 changes: 5 additions & 9 deletions proSVD.py
@@ -67,12 +67,8 @@ def initialize(self, A_init, Q_init=None, B_init=None):
     # optional num_iters, defaults to self.w_shift going through once
     # updates proSVD
     # outputs: projections, variance explained, derivatives
-    def run(self, A, num_init, num_iters=None, ref_basis=None):
-        # getting index of update times
-        if num_iters is None: # do enough iters to go through once
-            num_iters = np.floor((A.shape[1] - num_init - self.w_len)/self.w_shift).astype('int')
-        update_times = np.arange(num_init, num_iters*self.w_shift, self.w_len) # index of when updates happen (not including init)
-
+    def run(self, A, update_times, ref_basis=None):
+
         # run proSVD online
         for i, t in enumerate(update_times):
             start, end = t, t+self.w_len
@@ -81,13 +77,13 @@ def run(self, A, num_init, num_iters=None, ref_basis=None):
             # TODO: run should take user input pre/postupdate functions
             # they should be executed here
             self.preupdate()
-            self._updateSVD(dat, ref_basis)
-            self.postupdate(dat, start, end, i, t, num_init, A)
+            self.updateSVD(dat, ref_basis)
+            self.postupdate()
 
         return
 
     # internal func to do a single iter of basis update given some data A
-    def _updateSVD(self, A, ref_basis=None):
+    def updateSVD(self, A, ref_basis=None):
         ## Update our basis vectors based on a chunk of new data
         ## Currently assume we get chunks as specified in self.l
         ## QR decomposition of new data
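
For reference, a minimal sketch of calling the refactored run() with
precomputed update_times, mirroring the demo above. The positional
constructor arguments (k, then chunk length) are assumed from the demo's
proSVD(k, n_samples_update, ...) call; treat this as illustration, not
the library's documented API.

import numpy as np
from proSVD import proSVD

X = np.random.uniform(-1, 1, size=(50, 5000))  # (num_dimensions, num_samples)
n_inits, chunk = 500, 500
# one update per chunk, stopping while a full chunk of columns remains
update_times = np.arange(n_inits, X.shape[1] - chunk + 1, chunk)

pro = proSVD(50, chunk, decay_alpha=1, trueSVD=True)  # args assumed per the demo
pro.initialize(X[:, :n_inits])
pro.run(X, update_times)  # run() now takes the update indices explicitly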
