From 9c8051594a88b53ce83b39b127a098b31bd89aad Mon Sep 17 00:00:00 2001 From: Li Pu Date: Wed, 4 Jun 2014 01:25:58 -0700 Subject: [PATCH] use non-sparse implementation when k = n --- .../mllib/linalg/distributed/RowMatrix.scala | 15 +++++++++++++-- .../mllib/linalg/distributed/RowMatrixSuite.scala | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index fd653a367a81d..fd599b53bef0a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -246,6 +246,9 @@ class RowMatrix( * Then we compute U via easy matrix multiplication as U = A * (V * S-1). * Note that this approach requires `O(nnz(A))` time. * + * When the number of requested singular values k equals n, a non-sparse implementation is used, + * which requires `n^2` doubles to fit in memory and `O(n^3)` time on the master node. + * * At most k largest non-zero singular values and associated vectors are returned. * If there are k such values, then the dimensions of the return will be: * @@ -269,8 +272,16 @@ class RowMatrix( val n = numCols().toInt require(k > 0 && k <= n, s"Request up to n singular values k=$k n=$n.") - val (sigmaSquares: BDV[Double], u: BDM[Double]) = + val (sigmaSquares: BDV[Double], u: BDM[Double]) = if (k < n) { EigenValueDecomposition.symmetricEigs(multiplyGramianMatrix, n, k, tol) + } else { + logWarning(s"Request full SVD (k = n = $k), while ARPACK requires k strictly less than n. " + + s"Using non-sparse implementation.") + val G = computeGramianMatrix() + val (uFull: BDM[Double], sigmaSquaresFull: BDV[Double], vFull: BDM[Double]) = + brzSvd(G.toBreeze.asInstanceOf[BDM[Double]]) + (sigmaSquaresFull, uFull) + } val sigmas: BDV[Double] = brzSqrt(sigmaSquares) // Determine effective rank. 
@@ -508,4 +519,4 @@ object RowMatrix { Matrices.dense(n, n, G.data) } -} +} \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index 9c346de3d2fbe..8014428b02d59 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -99,7 +99,7 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext { val localMat = mat.toBreeze() val (localU, localSigma, localVt) = brzSvd(localMat) val localV: BDM[Double] = localVt.t.toDenseMatrix - for (k <- 1 to (n - 1)) { + for (k <- 1 to n) { val svd = mat.computeSVD(k, computeU = true) val U = svd.U val s = svd.s