Commit

adding batch eigenvalues timing example
pgupta29 committed Oct 9, 2022
1 parent fd602aa commit d9e3dda
Showing 2 changed files with 72 additions and 9 deletions.
67 changes: 67 additions & 0 deletions demos/batch_eigenvalues.py
@@ -0,0 +1,67 @@
#%%
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from proSVD import proSVD

#%%
# make a large, high-dimensional dataset
n = 100000 # number of samples
p = 1000 # dimension of sample
X = np.random.uniform(-1, 1, size=(n,p))
X -= X.mean(axis=0)[None, :]
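# note: sklearn's PCA centers internally, so centering X here is what makes
# the raw SVD below line up with the PCA results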

# do PCA and SVD
pca = PCA()
startpca = time.time()
pca.fit(X)
print(f'sklearn PCA took {time.time()-startpca:.2f} s')

startsvd = time.time()
u, s, v = np.linalg.svd(X, full_matrices=False)
print(f'numpy svd took {time.time()-startsvd:.2f} s')

# eigenvalues of covariance = singular values squared / (n_samples-1)
print(np.allclose((s**2)/(n-1), pca.explained_variance_))
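# why this holds: with X centered, cov(X) = X.T @ X / (n-1); substituting the
# SVD X = U @ diag(s) @ V.T gives cov(X) = V @ diag(s**2) @ V.T / (n-1),
# so the covariance eigenvalues are s**2/(n-1) -- sklearn's explained_variance_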

#%%
# proSVD takes data in shape (num_dimensions, num_samples)
X = X.T  # transpose, not reshape: reshape((p, n)) would scramble samples across columns

# proSVD parameters
k = p # dimension to reduce to (keeping all p dims as an example)
n_inits = 10000 # number of columns (samples) used to get the initial basis
n_samples_update = n-n_inits # number of columns (samples) used per update iteration
decay_alpha = 1 # how much discounting of old data (sets effective window size, alpha=1 is all seen data)

# get number of iterations for entire dataset
num_iters = 1 # single update pass here; in general: np.floor((X.shape[1]-n_inits-n_samples_update)/n_samples_update).astype('int')
update_times = np.arange(n_inits, num_iters*n_samples_update, n_samples_update) # indices of when updates happen (not including init)

# make proSVD object, run
pro = proSVD(k, n_samples_update, decay_alpha=decay_alpha, trueSVD=True)
pro.initialize(X[:, :n_inits])
startpro = time.time()
for i, t in enumerate(update_times):
    start, end = t, t+n_samples_update
    dat = X[:, start:end]

    pro.preupdate()
    pro.updateSVD(dat)
    pro.postupdate()
print(f'proSVD took {time.time()-startpro:.2f} s')
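# rough sanity check (assumption: pro.S holds the current singular values,
# the same quantity plotted against s below); after one pass over all the
# data with alpha=1, the two should agree closely
print('max gap vs batch singular values:', np.abs(np.sort(pro.S)[::-1] - s).max())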

#%%
# visualize
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(s, label='true SVD')
ax[0].plot(pro.S, label='proSVD')

ax[1].plot(pca.explained_variance_, label='PCA eigenvalues')
ax[1].plot((pro.S**2)/(n-1), label='proSVD eigenvalues')

for currax in ax:
    currax.legend()

#%%
14 changes: 5 additions & 9 deletions proSVD.py
@@ -67,12 +67,8 @@ def initialize(self, A_init, Q_init=None, B_init=None):
     # optional num_iters, defaults to self.w_shift going through once
     # updates proSVD
     # outputs: projections, variance explained, derivatives
-    def run(self, A, num_init, num_iters=None, ref_basis=None):
-        # getting index of update times
-        if num_iters is None: # do enough iters to go through once
-            num_iters = np.floor((A.shape[1] - num_init - self.w_len)/self.w_shift).astype('int')
-        update_times = np.arange(num_init, num_iters*self.w_shift, self.w_len) # index of when updates happen (not including init)
-
+    def run(self, A, update_times, ref_basis=None):
+
         # run proSVD online
         for i, t in enumerate(update_times):
             start, end = t, t+self.w_len
@@ -81,13 +77,13 @@ def run(self, A, num_init, num_iters=None, ref_basis=None):
             # TODO: run should take user input pre/postupdate functions
             # they should be executed here
             self.preupdate()
-            self._updateSVD(dat, ref_basis)
-            self.postupdate(dat, start, end, i, t, num_init, A)
+            self.updateSVD(dat, ref_basis)
+            self.postupdate()
 
         return
 
     # internal func to do a single iter of basis update given some data A
-    def _updateSVD(self, A, ref_basis=None):
+    def updateSVD(self, A, ref_basis=None):
         ## Update our basis vectors based on a chunk of new data
         ## Currently assume we get chunks as specified in self.l
         ## QR decomposition of new data
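
For reference, a minimal sketch of calling the refactored run() with
precomputed update_times, mirroring the demo above. The positional
constructor arguments (k, then chunk length) are assumed from the demo's
proSVD(k, n_samples_update, ...) call; treat this as illustration, not
the library's documented API.

import numpy as np
from proSVD import proSVD

X = np.random.uniform(-1, 1, size=(50, 5000))  # (num_dimensions, num_samples)
n_inits, chunk = 500, 500
# one update per chunk, stopping while a full chunk of columns remains
update_times = np.arange(n_inits, X.shape[1] - chunk + 1, chunk)

pro = proSVD(50, chunk, decay_alpha=1, trueSVD=True)  # args assumed per the demo
pro.initialize(X[:, :n_inits])
pro.run(X, update_times)  # run() now takes the update indices explicitly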
