Merge remote-tracking branch 'origin/master' into nan

apache · Jul 18, 2015 · d907b5b · d907b5b
2 parents 630ebc5 + 34a889d
commit d907b5b
Show file tree

Hide file tree

Showing 771 changed files with 27,712 additions and 9,198 deletions.
diff --git a/.rat-excludes b/.rat-excludes
@@ -91,3 +91,5 @@ help/*
 html/*
 INDEX
 .lintr
+gen-java.*
+.*avpr
diff --git a/R/README.md b/R/README.md
@@ -6,7 +6,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R
 
 #### Build Spark
 
-Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-PsparkR` profile to build the R package. For example to use the default Hadoop versions you can run
+Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
 ```
   build/mvn -DskipTests -Psparkr package
 ```

diff --git a/R/install-dev.bat b/R/install-dev.bat
@@ -25,3 +25,8 @@ set SPARK_HOME=%~dp0..
 MKDIR %SPARK_HOME%\R\lib
 
 R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib"  %SPARK_HOME%\R\pkg\
+
+rem Zip the SparkR package so that it can be distributed to worker nodes on YARN
+pushd %SPARK_HOME%\R\lib
+%JAVA_HOME%\bin\jar.exe cfM "%SPARK_HOME%\R\lib\sparkr.zip" SparkR
+popd
diff --git a/R/install-dev.sh b/R/install-dev.sh
@@ -34,12 +34,16 @@ LIB_DIR="$FWDIR/lib"
 
 mkdir -p $LIB_DIR
 
-pushd $FWDIR
+pushd $FWDIR > /dev/null
 
 # Generate Rd files if devtools is installed
 Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
 
 # Install SparkR to $LIB_DIR
 R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
 
-popd
+# Zip the SparkR package so that it can be distributed to worker nodes on YARN
+cd $LIB_DIR
+jar cfM "$LIB_DIR/sparkr.zip" SparkR
+
+popd > /dev/null
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
@@ -32,4 +32,3 @@ Collate:
     'serialize.R'
     'sparkR.R'
     'utils.R'
-    'zzz.R'
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
@@ -77,6 +77,7 @@ exportMethods("abs",
               "atan",
               "atan2",
               "avg",
+              "between",
               "cast",
               "cbrt",
               "ceiling",

diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
@@ -1314,7 +1314,7 @@ setMethod("except",
 #' write.df(df, "myfile", "parquet", "overwrite")
 #' }
 setMethod("write.df",
-          signature(df = "DataFrame", path = 'character'),
+          signature(df = "DataFrame", path = "character"),
           function(df, path, source = NULL, mode = "append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
@@ -1328,7 +1328,7 @@ setMethod("write.df",
             jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
             options <- varargsToEnv(...)
             if (!is.null(path)) {
-                options[['path']] = path
+                options[["path"]] <- path
             }
             callJMethod(df@sdf, "save", source, jmode, options)
           })
@@ -1337,7 +1337,7 @@ setMethod("write.df",
 #' @aliases saveDF
 #' @export
 setMethod("saveDF",
-          signature(df = "DataFrame", path = 'character'),
+          signature(df = "DataFrame", path = "character"),
           function(df, path, source = NULL, mode = "append", ...){
             write.df(df, path, source, mode, ...)
           })
@@ -1375,8 +1375,8 @@ setMethod("saveDF",
 #' saveAsTable(df, "myfile")
 #' }
 setMethod("saveAsTable",
-          signature(df = "DataFrame", tableName = 'character', source = 'character',
-                    mode = 'character'),
+          signature(df = "DataFrame", tableName = "character", source = "character",
+                    mode = "character"),
           function(df, tableName, source = NULL, mode="append", ...){
             if (is.null(source)) {
               sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)

diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
@@ -165,7 +165,6 @@ setMethod("getJRDD", signature(rdd = "PipelinedRDD"),
                                    serializedFuncArr,
                                    rdd@env$prev_serializedMode,
                                    packageNamesArr,
-                                   as.character(.sparkREnv[["libname"]]),
                                    broadcastArr,
                                    callJMethod(prev_jrdd, "classTag"))
             } else {
@@ -175,7 +174,6 @@ setMethod("getJRDD", signature(rdd = "PipelinedRDD"),
                                    rdd@env$prev_serializedMode,
                                    serializedMode,
                                    packageNamesArr,
-                                   as.character(.sparkREnv[["libname"]]),
                                    broadcastArr,
                                    callJMethod(prev_jrdd, "classTag"))
             }

diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R
@@ -86,7 +86,9 @@ infer_type <- function(x) {
 createDataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0) {
   if (is.data.frame(data)) {
       # get the names of columns, they will be put into RDD
-      schema <- names(data)
+      if (is.null(schema)) {
+        schema <- names(data)
+      }
       n <- nrow(data)
       m <- ncol(data)
       # get rid of factor type
@@ -455,7 +457,7 @@ dropTempTable <- function(sqlContext, tableName) {
 read.df <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
-    options[['path']] <- path
+    options[["path"]] <- path
   }
   if (is.null(source)) {
     sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
@@ -504,7 +506,7 @@ loadDF <- function(sqlContext, path = NULL, source = NULL, schema = NULL, ...) {
 createExternalTable <- function(sqlContext, tableName, path = NULL, source = NULL, ...) {
   options <- varargsToEnv(...)
   if (!is.null(path)) {
-    options[['path']] <- path
+    options[["path"]] <- path
   }
   sdf <- callJMethod(sqlContext, "createExternalTable", tableName, source, options)
   dataFrame(sdf)

diff --git a/R/pkg/R/client.R b/R/pkg/R/client.R
@@ -36,9 +36,9 @@ connectBackend <- function(hostname, port, timeout = 6000) {
 
 determineSparkSubmitBin <- function() {
   if (.Platform$OS.type == "unix") {
-    sparkSubmitBinName = "spark-submit"
+    sparkSubmitBinName <- "spark-submit"
   } else {
-    sparkSubmitBinName = "spark-submit.cmd"
+    sparkSubmitBinName <- "spark-submit.cmd"
   }
   sparkSubmitBinName
 }

diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R
@@ -187,6 +187,23 @@ setMethod("substr", signature(x = "Column"),
             column(jc)
           })
 
+#' between
+#'
+#' Test if the column is between the lower bound and upper bound, inclusive.
+#'
+#' @rdname column
+#'
+#' @param bounds lower and upper bounds
+setMethod("between", signature(x = "Column"),
+          function(x, bounds) {
+            if (is.vector(bounds) && length(bounds) == 2) {
+              jc <- callJMethod(x@jc, "between", bounds[1], bounds[2])
+              column(jc)
+            } else {
+              stop("bounds should be a vector of lower and upper bounds")
+            }
+          })
+
 #' Casts the column to a different data type.
 #'
 #' @rdname column

diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
@@ -23,6 +23,7 @@
 # Int -> integer
 # String -> character
 # Boolean -> logical
+# Float -> double
 # Double -> double
 # Long -> double
 # Array[Byte] -> raw

diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
@@ -20,7 +20,8 @@
 # @rdname aggregateRDD
 # @seealso reduce
 # @export
-setGeneric("aggregateRDD", function(x, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") })
+setGeneric("aggregateRDD",
+           function(x, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") })
 
 # @rdname cache-methods
 # @export
@@ -566,6 +567,10 @@ setGeneric("asc", function(x) { standardGeneric("asc") })
 #' @export
 setGeneric("avg", function(x, ...) { standardGeneric("avg") })
 
+#' @rdname column
+#' @export
+setGeneric("between", function(x, bounds) { standardGeneric("between") })
+
 #' @rdname column
 #' @export
 setGeneric("cast", function(x, dataType) { standardGeneric("cast") })

diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R
@@ -87,7 +87,7 @@ setMethod("count",
 setMethod("agg",
           signature(x = "GroupedData"),
           function(x, ...) {
-            cols = list(...)
+            cols <- list(...)
             stopifnot(length(cols) > 0)
             if (is.character(cols[[1]])) {
               cols <- varargsToEnv(...)
@@ -97,7 +97,7 @@ setMethod("agg",
               if (!is.null(ns)) {
                 for (n in ns) {
                   if (n != "") {
-                    cols[[n]] = alias(cols[[n]], n)
+                    cols[[n]] <- alias(cols[[n]], n)
                   }
                 }
               }

diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
@@ -215,7 +215,6 @@ setMethod("partitionBy",
                                        serializedHashFuncBytes,
                                        getSerializedMode(x),
                                        packageNamesArr,
-                                       as.character(.sparkREnv$libname),
                                        broadcastArr,
                                        callJMethod(jrdd, "classTag"))
 
@@ -560,8 +559,8 @@ setMethod("join",
 # Left outer join two RDDs
 #
 # @description
-# \code{leftouterjoin} This function left-outer-joins two RDDs where every element is of the form list(K, V).
-# The key types of the two RDDs should be the same.
+# \code{leftouterjoin} This function left-outer-joins two RDDs where every element is of
+# the form list(K, V). The key types of the two RDDs should be the same.
 #
 # @param x An RDD to be joined. Should be an RDD where each element is
 #             list(K, V).
@@ -597,8 +596,8 @@ setMethod("leftOuterJoin",
 # Right outer join two RDDs
 #
 # @description
-# \code{rightouterjoin} This function right-outer-joins two RDDs where every element is of the form list(K, V).
-# The key types of the two RDDs should be the same.
+# \code{rightouterjoin} This function right-outer-joins two RDDs where every element is of
+# the form list(K, V). The key types of the two RDDs should be the same.
 #
 # @param x An RDD to be joined. Should be an RDD where each element is
 #             list(K, V).
@@ -634,8 +633,8 @@ setMethod("rightOuterJoin",
 # Full outer join two RDDs
 #
 # @description
-# \code{fullouterjoin} This function full-outer-joins two RDDs where every element is of the form list(K, V).
-# The key types of the two RDDs should be the same.
+# \code{fullouterjoin} This function full-outer-joins two RDDs where every element is of
+# the form list(K, V). The key types of the two RDDs should be the same.
 #
 # @param x An RDD to be joined. Should be an RDD where each element is
 #             list(K, V).

diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R
@@ -123,6 +123,7 @@ structField.character <- function(x, type, nullable = TRUE) {
   }
   options <- c("byte",
                "integer",
+               "float",
                "double",
                "numeric",
                "character",

diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
@@ -140,8 +140,8 @@ writeType <- function(con, class) {
                  jobj = "j",
                  environment = "e",
                  Date = "D",
-                 POSIXlt = 't',
-                 POSIXct = 't',
+                 POSIXlt = "t",
+                 POSIXct = "t",
                  stop(paste("Unsupported type for serialization", class)))
   writeBin(charToRaw(type), con)
 }

diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
@@ -17,10 +17,6 @@
 
 .sparkREnv <- new.env()
 
-sparkR.onLoad <- function(libname, pkgname) {
-  .sparkREnv$libname <- libname
-}
-
 # Utility function that returns TRUE if we have an active connection to the
 # backend and FALSE otherwise
 connExists <- function(env) {
@@ -80,7 +76,6 @@ sparkR.stop <- function() {
 #' @param sparkEnvir Named list of environment variables to set on worker nodes.
 #' @param sparkExecutorEnv Named list of environment variables to be used when launching executors.
 #' @param sparkJars Character string vector of jar files to pass to the worker nodes.
-#' @param sparkRLibDir The path where R is installed on the worker nodes.
 #' @param sparkPackages Character string vector of packages from spark-packages.org
 #' @export
 #' @examples
@@ -101,15 +96,15 @@ sparkR.init <- function(
   sparkEnvir = list(),
   sparkExecutorEnv = list(),
   sparkJars = "",
-  sparkRLibDir = "",
   sparkPackages = "") {
 
   if (exists(".sparkRjsc", envir = .sparkREnv)) {
-    cat("Re-using existing Spark Context. Please stop SparkR with sparkR.stop() or restart R to create a new Spark Context\n")
+    cat(paste("Re-using existing Spark Context.",
+              "Please stop SparkR with sparkR.stop() or restart R to create a new Spark Context\n"))
     return(get(".sparkRjsc", envir = .sparkREnv))
   }
 
-  sparkMem <- Sys.getenv("SPARK_MEM", "512m")
+  sparkMem <- Sys.getenv("SPARK_MEM", "1024m")
   jars <- suppressWarnings(normalizePath(as.character(sparkJars)))
 
   # Classpath separator is ";" on Windows
@@ -145,7 +140,7 @@ sparkR.init <- function(
     if (!file.exists(path)) {
       stop("JVM is not ready after 10 seconds")
     }
-    f <- file(path, open='rb')
+    f <- file(path, open="rb")
     backendPort <- readInt(f)
     monitorPort <- readInt(f)
     close(f)
@@ -169,25 +164,23 @@ sparkR.init <- function(
     sparkHome <- normalizePath(sparkHome)
   }
 
-  if (nchar(sparkRLibDir) != 0) {
-    .sparkREnv$libname <- sparkRLibDir
-  }
-
   sparkEnvirMap <- new.env()
   for (varname in names(sparkEnvir)) {
     sparkEnvirMap[[varname]] <- sparkEnvir[[varname]]
   }
 
   sparkExecutorEnvMap <- new.env()
   if (!any(names(sparkExecutorEnv) == "LD_LIBRARY_PATH")) {
-    sparkExecutorEnvMap[["LD_LIBRARY_PATH"]] <- paste0("$LD_LIBRARY_PATH:",Sys.getenv("LD_LIBRARY_PATH"))
+    sparkExecutorEnvMap[["LD_LIBRARY_PATH"]] <-
+      paste0("$LD_LIBRARY_PATH:",Sys.getenv("LD_LIBRARY_PATH"))
   }
   for (varname in names(sparkExecutorEnv)) {
     sparkExecutorEnvMap[[varname]] <- sparkExecutorEnv[[varname]]
   }
 
   nonEmptyJars <- Filter(function(x) { x != "" }, jars)
-  localJarPaths <- sapply(nonEmptyJars, function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) })
+  localJarPaths <- sapply(nonEmptyJars,
+                          function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) })
 
   # Set the start time to identify jobjs
   # Seconds resolution is good enough for this purpose, so use ints
-Original file line number
+Diff line change
@@ Expand Up / @@ -91,3 +91,5 @@ help/* @@
     html/*
     INDEX
     .lintr
+    gen-java.*
+    .*avpr