Add linear_time_svd.py

yunjhongwu · Mar 10, 2017 · c971dae · c971dae
1 parent 7970689
commit c971dae
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
 # Matrix routines
 
  - `cur.py`: Mahoney, M. W., & Drineas, P. (2009). CUR matrix decompositions for improved data analysis. *Proceedings of the National Academy of Sciences*, 106(3), 697-702.
+ - `linear_time_svd`: Drineas, P., Kannan, R., & Mahoney, M. W. (2006). Fast Monte Carlo algorithms for matrices II: Computing a low-rank approximation to a matrix. *SIAM Journal on computing*, 36(1), 158-183.
  - `matrix_completion.py`: Lin, Z., Chen, M., & Ma, Y. (2010). The augmented lagrange multiplier method for exact recovery of corrupted low-rank matrices. *arXiv preprint* arXiv:1009.5055.
  - `neighborhood_smoothing.py`: Zhang, Y., Levina, E. and Zhu, J. (2016) Estimating neighborhood edge probabilities by neighborhood smoothing. *arXiv preprint* arXiv: 1509.08588.
  - `parafac2.py`: Kiers, H. A., Ten Berge, J. M., & Bro, R. (1999). PARAFAC2-Part I. A direct fitting algorithm for the PARAFAC2 model. *Journal of Chemometrics*, 13(3-4), 275-294.

diff --git a/linear_time_svd.py b/linear_time_svd.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Mar  9 21:37:24 2017
+Author: Yun-Jhong Wu
+E-mail: [email protected]
+"""
+
+import numpy as np
+import scipy as sp
+
+def select(p):
+    D = 0
+    idx = -1
+    q = 0
+    for i, p_i in enumerate(p):
+        D += p_i
+        if np.random.uniform(0, D) < p_i:
+           idx = i
+           q = p_i
+
+    return idx, q
+
+def LinearTimeSVD(A, r, n_oversampling=10):
+    """
+    Drineas, P., Kannan, R., & Mahoney, M. W. (2006). Fast Monte Carlo 
+    algorithms for matrices II: Computing a low-rank approximation to 
+    a matrix. SIAM Journal on computing, 36(1), 158-183.
+    
+    A: data matrix
+
+    return (the r leading left singular values,
+            the r leading left singular vectors)
+    """
+
+    rowsums = np.sum(A * A, 0)
+    p = rowsums / np.sum(rowsums)
+    c = r + n_oversampling
+    idx, q = map(np.array, zip(*[select(p) for _ in range(c)]))
+    C = A[:, idx] * (1 / np.sqrt(c * q))
+    w, H = sp.linalg.eigh(C.T @ C, eigvals=[n_oversampling, c - 1])
+    d = np.sqrt(w)[::-1]
+
+    return d, H[:, ::-1] * (1 / d)
+