diff --git a/demos/batch_eigenvalues.py b/demos/batch_eigenvalues.py
new file mode 100644
index 0000000..32f712b
--- /dev/null
+++ b/demos/batch_eigenvalues.py
@@ -0,0 +1,67 @@
+#%%
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from proSVD import proSVD
+
+#%%
+# make large high-dimensional data
+n = 100000 # number of samples
+p = 1000 # dimension of sample
+X = np.random.uniform(-1, 1, size=(n,p))
+X -= X.mean(axis=0)[None, :]
+
+# do PCA and SVD
+pca = PCA()
+startpca = time.time()
+pca.fit(X)
+print(f'sklearn PCA took {time.time()-startpca:.2f} s')
+
+startsvd = time.time()
+u, s, v = np.linalg.svd(X, full_matrices=False)
+print(f'numpy svd took {time.time()-startsvd:.2f} s')
+
+# eigenvalues of covariance = singular values squared / (n_samples-1)
+print(np.allclose((s**2)/(n-1), pca.explained_variance_))
+
+#%%
+# proSVD takes data in shape (num_dimensions, num_samples)
+X = X.T  # transpose -- NOT reshape((p, n)), which would scramble sample/feature layout
+
+# proSVD parameters
+k = p # dimension to reduce to (keeping all p dims as example)
+n_inits = 10000 # number of columns (samples) used to get initial basis
+n_samples_update = n-n_inits # number of columns (samples) used per update iteration
+decay_alpha = 1 # how much discounting of old data (sets effective window size, alpha=1 is all seen data)
+
+# get number of iterations for entire dataset
+num_iters = 1 # np.floor((X.shape[1]-n_inits-n_samples_update)/n_samples_update).astype('int')
+update_times = np.arange(n_inits, n_inits+num_iters*n_samples_update, n_samples_update) # index of when updates happen (not including init)
+
+# make proSVD object, run
+pro = proSVD(k, n_samples_update, decay_alpha=decay_alpha, trueSVD=True)
+pro.initialize(X[:, :n_inits])
+startpro = time.time()
+for i, t in enumerate(update_times):
+    start, end = t, t+n_samples_update
+    dat = X[:, start:end]
+
+    pro.preupdate()
+    pro.updateSVD(dat)
+    pro.postupdate()
+print(f'proSVD took {time.time()-startpro:.2f} s')
+
+#%%
+# visualize
+fig, ax = plt.subplots(1, 2, figsize=(12, 4))
+ax[0].plot(s, label='true SVD')
+ax[0].plot(pro.S, label='proSVD')
+
+ax[1].plot(pca.explained_variance_, label='PCA eigenvalues')
+ax[1].plot((pro.S**2)/(n-1), label='proSVD eigenvalues')
+
+for currax in ax:
+    currax.legend()
+
+#%%
\ No newline at end of file
diff --git a/proSVD.py b/proSVD.py
index bc0fc35..bbe839d 100644
--- a/proSVD.py
+++ b/proSVD.py
@@ -67,12 +67,8 @@ def initialize(self, A_init, Q_init=None, B_init=None):
     # optional num_iters, defaults to self.w_shift going through once
     # updates proSVD
     # outputs: projections, variance explained, derivatives
-    def run(self, A, num_init, num_iters=None, ref_basis=None):
-        # getting index of update times
-        if num_iters is None: # do enough iters to go through once
-            num_iters = np.floor((A.shape[1] - num_init - self.w_len)/self.w_shift).astype('int')
-        update_times = np.arange(num_init, num_iters*self.w_shift, self.w_len) # index of when updates happen (not including init)
-
+    def run(self, A, update_times, ref_basis=None):
+        # run proSVD online
         for i, t in enumerate(update_times):
             start, end = t, t+self.w_len
 
@@ -81,13 +77,13 @@ def run(self, A, num_init, num_iters=None, ref_basis=None):
 
             # TODO: run should take user input pre/postupdate functions
             # they should be executed here
             self.preupdate()
-            self._updateSVD(dat, ref_basis)
-            self.postupdate(dat, start, end, i, t, num_init, A)
+            self.updateSVD(dat, ref_basis)
+            self.postupdate()
         return
 
     # internal func to do a single iter of basis update given some data A
-    def _updateSVD(self, A, ref_basis=None):
+    def updateSVD(self, A, ref_basis=None):
         ## Update our basis vectors based on a chunk of new data
         ## Currently assume we get chunks as specificed in self.l
         ## QR decomposition of new data