diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 044401af98fda..635a04e48d3a5 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -18,10 +18,10 @@ Collate: 'schema.R' 'generics.R' 'jobj.R' - 'RDD.R' - 'pairRDD.R' 'column.R' 'group.R' + 'RDD.R' + 'pairRDD.R' 'DataFrame.R' 'SQLContext.R' 'backend.R' @@ -36,3 +36,4 @@ Collate: 'stats.R' 'types.R' 'utils.R' +RoxygenNote: 5.0.1 diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 5b47f0b17b5c7..ddfde3d3a677f 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -303,8 +303,28 @@ setMethod("colnames", #' @rdname columns #' @name colnames<- setMethod("colnames<-", - signature(x = "DataFrame", value = "character"), + signature(x = "DataFrame"), function(x, value) { + + # Check parameter integrity + if (class(value) != "character") { + stop("Invalid column names.") + } + + if (length(value) != ncol(x)) { + stop( + "Column names must have the same length as the number of columns in the dataset.") + } + + if (any(is.na(value))) { + stop("Column names cannot be NA.") + } + + # Check if the column names have . in it + if (any(regexec(".", value, fixed = TRUE)[[1]][1] != -1)) { + stop("Column names cannot contain the '.' symbol.") + } + sdf <- callJMethod(x@sdf, "toDF", as.list(value)) dataFrame(sdf) }) @@ -331,7 +351,7 @@ setMethod("coltypes", types <- sapply(dtypes(x), function(x) {x[[2]]}) # Map Spark data types into R's data types using DATA_TYPES environment - rTypes <- sapply(types, USE.NAMES=F, FUN=function(x) { + rTypes <- sapply(types, USE.NAMES = F, FUN = function(x) { # Check for primitive types type <- PRIMITIVE_TYPES[[x]] @@ -1734,7 +1754,7 @@ setMethod("merge", signature(x = "DataFrame", y = "DataFrame"), function(x, y, by = intersect(names(x), names(y)), by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all, - sort = TRUE, suffixes = c("_x","_y"), ... ) { + sort = TRUE, suffixes = c("_x", "_y"), ... ) { if (length(suffixes) != 2) { stop("suffixes must have length 2") @@ -2258,7 +2278,7 @@ setMethod("as.data.frame", function(x, ...) { # Check if additional parameters have been passed if (length(list(...)) > 0) { - stop(paste("Unused argument(s): ", paste(list(...), collapse=", "))) + stop(paste("Unused argument(s): ", paste(list(...), collapse = ", "))) } collect(x) }) @@ -2354,13 +2374,13 @@ setMethod("str", # Get the first elements for each column firstElements <- if (types[i] == "character") { - paste(paste0("\"", localDF[,i], "\""), collapse = " ") + paste(paste0("\"", localDF[, i], "\""), collapse = " ") } else { - paste(localDF[,i], collapse = " ") + paste(localDF[, i], collapse = " ") } # Add the corresponding number of spaces for alignment - spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse="") + spaces <- paste(rep(" ", max(nchar(names) - nchar(names[i]))), collapse = "") # Get the short type. For 'character', it would be 'chr'; # 'for numeric', it's 'num', etc. 
@@ -2372,7 +2392,7 @@ setMethod("str", # Concatenate the colnames, coltypes, and first # elements of each column line <- paste0(" $ ", names[i], spaces, ": ", - dataType, " ",firstElements) + dataType, " ", firstElements) # Chop off extra characters if this is too long cat(substr(line, 1, MAX_CHAR_PER_ROW)) @@ -2383,4 +2403,4 @@ setMethod("str", cat(paste0("\nDisplaying first ", ncol(localDF), " columns only.")) } } - }) \ No newline at end of file + }) diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index a78fbb714f2be..35c4e6f1afaf4 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -67,7 +67,7 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode, setMethod("show", "RDD", function(object) { - cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep="")) + cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep = "")) }) setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) { diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 471bec1eacf03..b0e67c8ad26ab 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -103,7 +103,10 @@ parallelize <- function(sc, coll, numSlices = 1) { # TODO: bound/safeguard numSlices # TODO: unit tests for if the split works for all primitives # TODO: support matrix, data frame, etc + # nolint start + # suppress lintr warning: Place a space before left parenthesis, except in a function call. if ((!is.list(coll) && !is.vector(coll)) || is.data.frame(coll)) { + # nolint end if (is.data.frame(coll)) { message(paste("context.R: A data frame is parallelized by columns.")) } else { diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index d8a0393275390..eefdf178733fd 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -186,7 +186,7 @@ readMultipleObjects <- function(inputCon) { # of the objects, so the number of objects varies, we try to read # all objects in a loop until the end of the stream. data <- list() - while(TRUE) { + while (TRUE) { # If reaching the end of the stream, type returned should be "". type <- readType(inputCon) if (type == "") { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 62a854ef93afa..88cff3559ca8a 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -581,7 +581,7 @@ setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") #' @rdname showDF #' @export -setGeneric("showDF", function(x,...) { standardGeneric("showDF") }) +setGeneric("showDF", function(x, ...) { standardGeneric("showDF") }) # @rdname subset # @export @@ -589,7 +589,7 @@ setGeneric("subset", function(x, ...) { standardGeneric("subset") }) #' @rdname agg #' @export -setGeneric("summarize", function(x,...) { standardGeneric("summarize") }) +setGeneric("summarize", function(x, ...) 
{ standardGeneric("summarize") }) #' @rdname summary #' @export diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 8d3b4388ae575..a9ffc61d9595c 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -54,7 +54,7 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFram function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0, standardize = TRUE, solver = "auto") { family <- match.arg(family) - formula <- paste(deparse(formula), collapse="") + formula <- paste(deparse(formula), collapse = "") model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers", "fitRModelFormula", formula, data@sdf, family, lambda, alpha, standardize, solver) diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 095ddb9aed2e7..5f774052c97d8 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -100,7 +100,7 @@ writeJobj <- function(con, value) { writeString <- function(con, value) { utfVal <- enc2utf8(value) writeInt(con, as.integer(nchar(utfVal, type = "bytes") + 1)) - writeBin(utfVal, con, endian = "big", useBytes=TRUE) + writeBin(utfVal, con, endian = "big", useBytes = TRUE) } writeInt <- function(con, value) { diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index d2bfad553104f..996cac6fe617a 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -153,7 +153,7 @@ sparkR.init <- function( if (!file.exists(path)) { stop("JVM is not ready after 10 seconds") } - f <- file(path, open="rb") + f <- file(path, open = "rb") backendPort <- readInt(f) monitorPort <- readInt(f) rLibPath <- readString(f) @@ -185,9 +185,9 @@ sparkR.init <- function( } sparkExecutorEnvMap <- convertNamedListToEnv(sparkExecutorEnv) - if(is.null(sparkExecutorEnvMap$LD_LIBRARY_PATH)) { + if (is.null(sparkExecutorEnvMap$LD_LIBRARY_PATH)) { sparkExecutorEnvMap[["LD_LIBRARY_PATH"]] <- - paste0("$LD_LIBRARY_PATH:",Sys.getenv("LD_LIBRARY_PATH")) + paste0("$LD_LIBRARY_PATH:", Sys.getenv("LD_LIBRARY_PATH")) } # Classpath separator is ";" on Windows diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 43105aaa38424..36bbfdab5de80 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -158,7 +158,7 @@ wrapInt <- function(value) { # Multiply `val` by 31 and add `addVal` to the result. Ensures that # integer-overflows are handled at every step. mult31AndAdd <- function(val, addVal) { - vec <- c(bitwShiftL(val, c(4,3,2,1,0)), addVal) + vec <- c(bitwShiftL(val, c(4, 3, 2, 1, 0)), addVal) Reduce(function(a, b) { wrapInt(as.numeric(a) + as.numeric(b)) }, @@ -202,7 +202,7 @@ serializeToString <- function(rdd) { # This function amortizes the allocation cost by doubling # the size of the list every time it fills up. 
addItemToAccumulator <- function(acc, item) { - if(acc$counter == acc$size) { + if (acc$counter == acc$size) { acc$size <- acc$size * 2 length(acc$data) <- acc$size } diff --git a/R/pkg/inst/profile/general.R b/R/pkg/inst/profile/general.R index c55fe9ba7af7a..8c75c19ca7ac3 100644 --- a/R/pkg/inst/profile/general.R +++ b/R/pkg/inst/profile/general.R @@ -19,5 +19,5 @@ packageDir <- Sys.getenv("SPARKR_PACKAGE_DIR") dirs <- strsplit(packageDir, ",")[[1]] .libPaths(c(dirs, .libPaths())) - Sys.setenv(NOAWT=1) + Sys.setenv(NOAWT = 1) } diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R index 207a37a0cb47f..c26b28b78dee8 100644 --- a/R/pkg/inst/tests/testthat/packageInAJarTest.R +++ b/R/pkg/inst/tests/testthat/packageInAJarTest.R @@ -25,6 +25,6 @@ run2 <- myfunc(-4L) sparkR.stop() -if(run1 != 6) quit(save = "no", status = 1) +if (run1 != 6) quit(save = "no", status = 1) -if(run2 != -3) quit(save = "no", status = 1) +if (run2 != -3) quit(save = "no", status = 1) diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/testthat/test_binaryFile.R index f2452ed97d2ea..976a7558a816d 100644 --- a/R/pkg/inst/tests/testthat/test_binaryFile.R +++ b/R/pkg/inst/tests/testthat/test_binaryFile.R @@ -23,8 +23,8 @@ sc <- sparkR.init() mockFile <- c("Spark is pretty.", "Spark is awesome.") test_that("saveAsObjectFile()/objectFile() following textFile() works", { - fileName1 <- tempfile(pattern="spark-test", fileext=".tmp") - fileName2 <- tempfile(pattern="spark-test", fileext=".tmp") + fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp") + fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName1) rdd <- textFile(sc, fileName1, 1) @@ -37,7 +37,7 @@ test_that("saveAsObjectFile()/objectFile() following textFile() works", { }) test_that("saveAsObjectFile()/objectFile() works on a parallelized list", { - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") l <- list(1, 2, 3) rdd <- parallelize(sc, l, 1) @@ -49,8 +49,8 @@ test_that("saveAsObjectFile()/objectFile() works on a parallelized list", { }) test_that("saveAsObjectFile()/objectFile() following RDD transformations works", { - fileName1 <- tempfile(pattern="spark-test", fileext=".tmp") - fileName2 <- tempfile(pattern="spark-test", fileext=".tmp") + fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp") + fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName1) rdd <- textFile(sc, fileName1) @@ -73,8 +73,8 @@ test_that("saveAsObjectFile()/objectFile() following RDD transformations works", }) test_that("saveAsObjectFile()/objectFile() works with multiple paths", { - fileName1 <- tempfile(pattern="spark-test", fileext=".tmp") - fileName2 <- tempfile(pattern="spark-test", fileext=".tmp") + fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp") + fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp") rdd1 <- parallelize(sc, "Spark is pretty.") saveAsObjectFile(rdd1, fileName1) diff --git a/R/pkg/inst/tests/testthat/test_binary_function.R b/R/pkg/inst/tests/testthat/test_binary_function.R index f054ac9a87d61..7bad4d2a7e106 100644 --- a/R/pkg/inst/tests/testthat/test_binary_function.R +++ b/R/pkg/inst/tests/testthat/test_binary_function.R @@ -31,7 +31,7 @@ test_that("union on two RDDs", { actual <- collect(unionRDD(rdd, rdd)) expect_equal(actual, as.list(rep(nums, 2))) - fileName <- tempfile(pattern="spark-test", 
fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) text.rdd <- textFile(sc, fileName) @@ -74,10 +74,10 @@ test_that("zipPartitions() on RDDs", { actual <- collect(zipPartitions(rdd1, rdd2, rdd3, func = function(x, y, z) { list(list(x, y, z))} )) expect_equal(actual, - list(list(1, c(1,2), c(1,2,3)), list(2, c(3,4), c(4,5,6)))) + list(list(1, c(1, 2), c(1, 2, 3)), list(2, c(3, 4), c(4, 5, 6)))) mockFile <- c("Spark is pretty.", "Spark is awesome.") - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) rdd <- textFile(sc, fileName, 1) diff --git a/R/pkg/inst/tests/testthat/test_broadcast.R b/R/pkg/inst/tests/testthat/test_broadcast.R index bb86a5c922bde..8be6efc3dbed3 100644 --- a/R/pkg/inst/tests/testthat/test_broadcast.R +++ b/R/pkg/inst/tests/testthat/test_broadcast.R @@ -25,7 +25,7 @@ nums <- 1:2 rrdd <- parallelize(sc, nums, 2L) test_that("using broadcast variable", { - randomMat <- matrix(nrow=10, ncol=10, data=rnorm(100)) + randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100)) randomMatBr <- broadcast(sc, randomMat) useBroadcast <- function(x) { @@ -37,7 +37,7 @@ test_that("using broadcast variable", { }) test_that("without using broadcast variable", { - randomMat <- matrix(nrow=10, ncol=10, data=rnorm(100)) + randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100)) useBroadcast <- function(x) { sum(randomMat * x) diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R index 08099dd96a87b..63817233e99f3 100644 --- a/R/pkg/inst/tests/testthat/test_mllib.R +++ b/R/pkg/inst/tests/testthat/test_mllib.R @@ -96,9 +96,9 @@ test_that("summary coefficients match with native glm of family 'binomial'", { training <- filter(df, df$Species != "setosa") stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training, family = "binomial")) - coefs <- as.vector(stats$coefficients[,1]) + coefs <- as.vector(stats$coefficients[, 1]) - rTraining <- iris[iris$Species %in% c("versicolor","virginica"),] + rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ] rCoefs <- as.vector(coef(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining, family = binomial(link = "logit")))) diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/testthat/test_rdd.R index 1b3a22486e95f..3b0c16be5a754 100644 --- a/R/pkg/inst/tests/testthat/test_rdd.R +++ b/R/pkg/inst/tests/testthat/test_rdd.R @@ -75,7 +75,7 @@ test_that("mapPartitions on RDD", { test_that("flatMap() on RDDs", { flat <- flatMap(intRdd, function(x) { list(x, x) }) actual <- collect(flat) - expect_equal(actual, rep(intPairs, each=2)) + expect_equal(actual, rep(intPairs, each = 2)) }) test_that("filterRDD on RDD", { @@ -245,9 +245,9 @@ test_that("mapValues() on pairwise RDDs", { }) test_that("flatMapValues() on pairwise RDDs", { - l <- parallelize(sc, list(list(1, c(1,2)), list(2, c(3,4)))) + l <- parallelize(sc, list(list(1, c(1, 2)), list(2, c(3, 4)))) actual <- collect(flatMapValues(l, function(x) { x })) - expect_equal(actual, list(list(1,1), list(1,2), list(2,3), list(2,4))) + expect_equal(actual, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4))) # Generate x to x+1 for every value actual <- collect(flatMapValues(intRdd, function(x) { x: (x + 1) })) @@ -448,12 +448,12 @@ test_that("zipRDD() on RDDs", { list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004))) mockFile <- c("Spark is 
pretty.", "Spark is awesome.") - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) rdd <- textFile(sc, fileName, 1) actual <- collect(zipRDD(rdd, rdd)) - expected <- lapply(mockFile, function(x) { list(x ,x) }) + expected <- lapply(mockFile, function(x) { list(x, x) }) expect_equal(actual, expected) rdd1 <- parallelize(sc, 0:1, 1) @@ -484,7 +484,7 @@ test_that("cartesian() on RDDs", { expect_equal(actual, list()) mockFile <- c("Spark is pretty.", "Spark is awesome.") - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) @@ -523,19 +523,19 @@ test_that("subtract() on RDDs", { # subtract by an empty RDD rdd2 <- parallelize(sc, list()) actual <- collect(subtract(rdd1, rdd2)) - expect_equal(as.list(sort(as.vector(actual, mode="integer"))), + expect_equal(as.list(sort(as.vector(actual, mode = "integer"))), l) rdd2 <- parallelize(sc, list(2, 4)) actual <- collect(subtract(rdd1, rdd2)) - expect_equal(as.list(sort(as.vector(actual, mode="integer"))), + expect_equal(as.list(sort(as.vector(actual, mode = "integer"))), list(1, 1, 3)) l <- list("a", "a", "b", "b", "c", "d") rdd1 <- parallelize(sc, l) rdd2 <- parallelize(sc, list("b", "d")) actual <- collect(subtract(rdd1, rdd2)) - expect_equal(as.list(sort(as.vector(actual, mode="character"))), + expect_equal(as.list(sort(as.vector(actual, mode = "character"))), list("a", "a", "c")) }) @@ -585,53 +585,53 @@ test_that("intersection() on RDDs", { }) test_that("join() on pairwise RDDs", { - rdd1 <- parallelize(sc, list(list(1,1), list(2,4))) - rdd2 <- parallelize(sc, list(list(1,2), list(1,3))) + rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4))) + rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3))) actual <- collect(join(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list(1, list(1, 2)), list(1, list(1, 3))))) - rdd1 <- parallelize(sc, list(list("a",1), list("b",4))) - rdd2 <- parallelize(sc, list(list("a",2), list("a",3))) + rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4))) + rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3))) actual <- collect(join(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list("a", list(1, 2)), list("a", list(1, 3))))) - rdd1 <- parallelize(sc, list(list(1,1), list(2,2))) - rdd2 <- parallelize(sc, list(list(3,3), list(4,4))) + rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) + rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) actual <- collect(join(rdd1, rdd2, 2L)) expect_equal(actual, list()) - rdd1 <- parallelize(sc, list(list("a",1), list("b",2))) - rdd2 <- parallelize(sc, list(list("c",3), list("d",4))) + rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) + rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) actual <- collect(join(rdd1, rdd2, 2L)) expect_equal(actual, list()) }) test_that("leftOuterJoin() on pairwise RDDs", { - rdd1 <- parallelize(sc, list(list(1,1), list(2,4))) - rdd2 <- parallelize(sc, list(list(1,2), list(1,3))) + rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4))) + rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3))) actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) - rdd1 <- parallelize(sc, list(list("a",1), 
list("b",4))) - rdd2 <- parallelize(sc, list(list("a",2), list("a",3))) + rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4))) + rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3))) actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(4, NULL)), list("a", list(1, 2)), list("a", list(1, 3))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) - rdd1 <- parallelize(sc, list(list(1,1), list(2,2))) - rdd2 <- parallelize(sc, list(list(3,3), list(4,4))) + rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) + rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(1, NULL)), list(2, list(2, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) - rdd1 <- parallelize(sc, list(list("a",1), list("b",2))) - rdd2 <- parallelize(sc, list(list("c",3), list("d",4))) + rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) + rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) actual <- collect(leftOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(2, NULL)), list("a", list(1, NULL))) expect_equal(sortKeyValueList(actual), @@ -639,57 +639,57 @@ test_that("leftOuterJoin() on pairwise RDDs", { }) test_that("rightOuterJoin() on pairwise RDDs", { - rdd1 <- parallelize(sc, list(list(1,2), list(1,3))) - rdd2 <- parallelize(sc, list(list(1,1), list(2,4))) + rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3))) + rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4))) actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) - rdd1 <- parallelize(sc, list(list("a",2), list("a",3))) - rdd2 <- parallelize(sc, list(list("a",1), list("b",4))) + rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3))) + rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4))) actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) - rdd1 <- parallelize(sc, list(list(1,1), list(2,2))) - rdd2 <- parallelize(sc, list(list(3,3), list(4,4))) + rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) + rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list(3, list(NULL, 3)), list(4, list(NULL, 4))))) - rdd1 <- parallelize(sc, list(list("a",1), list("b",2))) - rdd2 <- parallelize(sc, list(list("c",3), list("d",4))) + rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) + rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) actual <- collect(rightOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list("d", list(NULL, 4)), list("c", list(NULL, 3))))) }) test_that("fullOuterJoin() on pairwise RDDs", { - rdd1 <- parallelize(sc, list(list(1,2), list(1,3), list(3,3))) - rdd2 <- parallelize(sc, list(list(1,1), list(2,4))) + rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3))) + rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4))) actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)), list(3, list(3, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) - rdd1 <- parallelize(sc, list(list("a",2), list("a",3), list("c", 1))) - rdd2 
<- parallelize(sc, list(list("a",1), list("b",4))) + rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3), list("c", 1))) + rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4))) actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1)), list("c", list(1, NULL))) expect_equal(sortKeyValueList(actual), sortKeyValueList(expected)) - rdd1 <- parallelize(sc, list(list(1,1), list(2,2))) - rdd2 <- parallelize(sc, list(list(3,3), list(4,4))) + rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2))) + rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4))) actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)), list(3, list(NULL, 3)), list(4, list(NULL, 4))))) - rdd1 <- parallelize(sc, list(list("a",1), list("b",2))) - rdd2 <- parallelize(sc, list(list("c",3), list("d",4))) + rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2))) + rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4))) actual <- collect(fullOuterJoin(rdd1, rdd2, 2L)) expect_equal(sortKeyValueList(actual), sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)), diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index adef4403bcc33..61acaefa3bdff 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -41,8 +41,8 @@ sqlContext <- sparkRSQL.init(sc) mockLines <- c("{\"name\":\"Michael\"}", "{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"Justin\", \"age\":19}") -jsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") -parquetPath <- tempfile(pattern="sparkr-test", fileext=".parquet") +jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") +parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") writeLines(mockLines, jsonPath) # For test nafunctions, like dropna(), fillna(),... 
@@ -51,7 +51,7 @@ mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}", "{\"name\":\"David\",\"age\":60,\"height\":null}", "{\"name\":\"Amy\",\"age\":null,\"height\":null}", "{\"name\":null,\"age\":null,\"height\":null}") -jsonPathNa <- tempfile(pattern="sparkr-test", fileext=".tmp") +jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLinesNa, jsonPathNa) # For test complex types in DataFrame @@ -59,7 +59,7 @@ mockLinesComplexType <- c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}", "{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}", "{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}") -complexTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") +complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLinesComplexType, complexTypeJsonPath) test_that("infer types and check types", { @@ -147,9 +147,9 @@ test_that("create DataFrame from RDD", { expect_equal(as.list(collect(where(df2AsDF, df2AsDF$name == "Bob"))), list(name = "Bob", age = 16, height = 176.5)) - localDF <- data.frame(name=c("John", "Smith", "Sarah"), - age=c(19L, 23L, 18L), - height=c(176.5, 181.4, 173.7)) + localDF <- data.frame(name = c("John", "Smith", "Sarah"), + age = c(19L, 23L, 18L), + height = c(176.5, 181.4, 173.7)) df <- createDataFrame(sqlContext, localDF, schema) expect_is(df, "DataFrame") expect_equal(count(df), 3) @@ -259,7 +259,7 @@ test_that("create DataFrame from list or data.frame", { irisdf <- suppressWarnings(createDataFrame(sqlContext, iris)) iris_collected <- collect(irisdf) - expect_equivalent(iris_collected[,-5], iris[,-5]) + expect_equivalent(iris_collected[, -5], iris[, -5]) expect_equal(iris_collected$Species, as.character(iris$Species)) mtcarsdf <- createDataFrame(sqlContext, mtcars) @@ -325,7 +325,7 @@ test_that("create DataFrame from a data.frame with complex types", { mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") -mapTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") +mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLinesMapType, mapTypeJsonPath) test_that("Collect DataFrame with complex types", { @@ -396,11 +396,11 @@ test_that("read/write json files", { expect_equal(count(df), 3) # Test write.df - jsonPath2 <- tempfile(pattern="jsonPath2", fileext=".json") - write.df(df, jsonPath2, "json", mode="overwrite") + jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".json") + write.df(df, jsonPath2, "json", mode = "overwrite") # Test write.json - jsonPath3 <- tempfile(pattern="jsonPath3", fileext=".json") + jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json") write.json(df, jsonPath3) # Test read.json()/jsonFile() works with multiple input paths @@ -463,7 +463,7 @@ test_that("insertInto() on a registered table", { lines <- c("{\"name\":\"Bob\", \"age\":24}", "{\"name\":\"James\", \"age\":35}") - jsonPath2 <- tempfile(pattern="jsonPath2", fileext=".tmp") + jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".tmp") parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet") writeLines(lines, jsonPath2) df2 <- read.df(sqlContext, jsonPath2, "json") @@ -527,7 +527,7 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", { textLines <- c("Michael", "Andy, 30", "Justin, 19") - 
textPath <- tempfile(pattern="sparkr-textLines", fileext=".tmp") + textPath <- tempfile(pattern = "sparkr-textLines", fileext = ".tmp") writeLines(textLines, textPath) textRDD <- textFile(sc, textPath) @@ -548,7 +548,7 @@ test_that("union on mixed serialization types correctly returns a byte RRDD", { }) test_that("objectFile() works with row serialization", { - objectPath <- tempfile(pattern="spark-test", fileext=".tmp") + objectPath <- tempfile(pattern = "spark-test", fileext = ".tmp") df <- read.json(sqlContext, jsonPath) dfRDD <- toRDD(df) saveAsObjectFile(coalesce(dfRDD, 1L), objectPath) @@ -612,7 +612,7 @@ test_that("collect() support Unicode characters", { "{\"name\":\"こんにちは\", \"age\":19}", "{\"name\":\"Xin chào\"}") - jsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(lines, jsonPath) df <- read.df(sqlContext, jsonPath, "json") @@ -692,10 +692,21 @@ test_that("names() colnames() set the column names", { colnames(df) <- c("col3", "col4") expect_equal(names(df)[1], "col3") + expect_error(colnames(df) <- c("sepal.length", "sepal_width"), + "Colum names cannot contain the '.' symbol.") + expect_error(colnames(df) <- c(1, 2), "Invalid column names.") + expect_error(colnames(df) <- c("a"), + "Column names must have the same length as the number of columns in the dataset.") + expect_error(colnames(df) <- c("1", NA), "Column names cannot be NA.") + + # Note: if this test is broken, remove check for "." character on colnames<- method + irisDF <- suppressWarnings(createDataFrame(sqlContext, iris)) + expect_equal(names(irisDF)[1], "Sepal_Length") + # Test base::colnames base::names m2 <- cbind(1, 1:4) expect_equal(colnames(m2, do.NULL = FALSE), c("col1", "col2")) - colnames(m2) <- c("x","Y") + colnames(m2) <- c("x", "Y") expect_equal(colnames(m2), c("x", "Y")) z <- list(a = 1, b = "c", c = 1:3) @@ -735,7 +746,7 @@ test_that("distinct() and unique on DataFrames", { "{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"Justin\", \"age\":19}", "{\"name\":\"Justin\", \"age\":19}") - jsonPathWithDup <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(lines, jsonPathWithDup) df <- read.json(sqlContext, jsonPathWithDup) @@ -776,10 +787,10 @@ test_that("select operators", { expect_is(df[[2]], "Column") expect_is(df[["age"]], "Column") - expect_is(df[,1], "DataFrame") - expect_equal(columns(df[,1]), c("name")) - expect_equal(columns(df[,"age"]), c("age")) - df2 <- df[,c("age", "name")] + expect_is(df[, 1], "DataFrame") + expect_equal(columns(df[, 1]), c("name")) + expect_equal(columns(df[, "age"]), c("age")) + df2 <- df[, c("age", "name")] expect_is(df2, "DataFrame") expect_equal(columns(df2), c("age", "name")) @@ -822,7 +833,7 @@ test_that("select with column", { test_that("subsetting", { # read.json returns columns in random order df <- select(read.json(sqlContext, jsonPath), "name", "age") - filtered <- df[df$age > 20,] + filtered <- df[df$age > 20, ] expect_equal(count(filtered), 1) expect_equal(columns(filtered), c("name", "age")) expect_equal(collect(filtered)$name, "Andy") @@ -841,11 +852,11 @@ test_that("subsetting", { expect_equal(count(df4), 2) expect_equal(columns(df4), c("name", "age")) - df5 <- df[df$age %in% c(19), c(1,2)] + df5 <- df[df$age %in% c(19), c(1, 2)] expect_equal(count(df5), 1) expect_equal(columns(df5), c("name", "age")) - df6 <- subset(df, df$age %in% c(30), c(1,2)) + df6 <- subset(df, df$age %in% c(30), c(1, 2)) 
expect_equal(count(df6), 1) expect_equal(columns(df6), c("name", "age")) @@ -897,22 +908,22 @@ test_that("test HiveContext", { expect_is(df2, "DataFrame") expect_equal(count(df2), 3) - jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp") invisible(saveAsTable(df, "json2", "json", "append", path = jsonPath2)) df3 <- sql(hiveCtx, "select * from json2") expect_is(df3, "DataFrame") expect_equal(count(df3), 3) unlink(jsonPath2) - hivetestDataPath <- tempfile(pattern="sparkr-test", fileext=".tmp") + hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") invisible(saveAsTable(df, "hivetestbl", path = hivetestDataPath)) df4 <- sql(hiveCtx, "select * from hivetestbl") expect_is(df4, "DataFrame") expect_equal(count(df4), 3) unlink(hivetestDataPath) - parquetDataPath <- tempfile(pattern="sparkr-test", fileext=".tmp") - invisible(saveAsTable(df, "parquetest", "parquet", mode="overwrite", path=parquetDataPath)) + parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") + invisible(saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath)) df5 <- sql(hiveCtx, "select * from parquetest") expect_is(df5, "DataFrame") expect_equal(count(df5), 3) @@ -1019,7 +1030,7 @@ test_that("column binary mathfunctions", { "{\"a\":2, \"b\":6}", "{\"a\":3, \"b\":7}", "{\"a\":4, \"b\":8}") - jsonPathWithDup <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(lines, jsonPathWithDup) df <- read.json(sqlContext, jsonPathWithDup) expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5)) @@ -1169,7 +1180,7 @@ test_that("group by, agg functions", { df3 <- agg(gd, age = "stddev") expect_is(df3, "DataFrame") df3_local <- collect(df3) - expect_true(is.nan(df3_local[df3_local$name == "Andy",][1, 2])) + expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2])) df4 <- agg(gd, sumAge = sum(df$age)) expect_is(df4, "DataFrame") @@ -1189,34 +1200,34 @@ test_that("group by, agg functions", { "{\"name\":\"ID1\", \"value\": \"10\"}", "{\"name\":\"ID1\", \"value\": \"22\"}", "{\"name\":\"ID2\", \"value\": \"-3\"}") - jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLines2, jsonPath2) gd2 <- groupBy(read.json(sqlContext, jsonPath2), "name") df6 <- agg(gd2, value = "sum") df6_local <- collect(df6) - expect_equal(42, df6_local[df6_local$name == "ID1",][1, 2]) - expect_equal(-3, df6_local[df6_local$name == "ID2",][1, 2]) + expect_equal(42, df6_local[df6_local$name == "ID1", ][1, 2]) + expect_equal(-3, df6_local[df6_local$name == "ID2", ][1, 2]) df7 <- agg(gd2, value = "stddev") df7_local <- collect(df7) - expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6) - expect_true(is.nan(df7_local[df7_local$name == "ID2",][1, 2])) + expect_true(abs(df7_local[df7_local$name == "ID1", ][1, 2] - 6.928203) < 1e-6) + expect_true(is.nan(df7_local[df7_local$name == "ID2", ][1, 2])) mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"Justin\", \"age\":19}", "{\"name\":\"Justin\", \"age\":1}") - jsonPath3 <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLines3, jsonPath3) df8 <- read.json(sqlContext, jsonPath3) gd3 <- groupBy(df8, "name") gd3_local <- collect(sum(gd3)) - 
expect_equal(60, gd3_local[gd3_local$name == "Andy",][1, 2]) - expect_equal(20, gd3_local[gd3_local$name == "Justin",][1, 2]) + expect_equal(60, gd3_local[gd3_local$name == "Andy", ][1, 2]) + expect_equal(20, gd3_local[gd3_local$name == "Justin", ][1, 2]) expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6) gd3_local <- collect(agg(gd3, var(df8$age))) - expect_equal(162, gd3_local[gd3_local$name == "Justin",][1, 2]) + expect_equal(162, gd3_local[gd3_local$name == "Justin", ][1, 2]) # Test stats::sd, stats::var are working expect_true(abs(sd(1:2) - 0.7071068) < 1e-6) @@ -1229,10 +1240,10 @@ test_that("group by, agg functions", { test_that("arrange() and orderBy() on a DataFrame", { df <- read.json(sqlContext, jsonPath) sorted <- arrange(df, df$age) - expect_equal(collect(sorted)[1,2], "Michael") + expect_equal(collect(sorted)[1, 2], "Michael") sorted2 <- arrange(df, "name", decreasing = FALSE) - expect_equal(collect(sorted2)[2,"age"], 19) + expect_equal(collect(sorted2)[2, "age"], 19) sorted3 <- orderBy(df, asc(df$age)) expect_true(is.na(first(sorted3)$age)) @@ -1240,16 +1251,16 @@ test_that("arrange() and orderBy() on a DataFrame", { sorted4 <- orderBy(df, desc(df$name)) expect_equal(first(sorted4)$name, "Michael") - expect_equal(collect(sorted4)[3,"name"], "Andy") + expect_equal(collect(sorted4)[3, "name"], "Andy") sorted5 <- arrange(df, "age", "name", decreasing = TRUE) - expect_equal(collect(sorted5)[1,2], "Andy") + expect_equal(collect(sorted5)[1, 2], "Andy") - sorted6 <- arrange(df, "age","name", decreasing = c(T, F)) - expect_equal(collect(sorted6)[1,2], "Andy") + sorted6 <- arrange(df, "age", "name", decreasing = c(T, F)) + expect_equal(collect(sorted6)[1, 2], "Andy") sorted7 <- arrange(df, "name", decreasing = FALSE) - expect_equal(collect(sorted7)[2,"age"], 19) + expect_equal(collect(sorted7)[2, "age"], 19) }) test_that("filter() on a DataFrame", { @@ -1282,7 +1293,7 @@ test_that("join() and merge() on a DataFrame", { "{\"name\":\"Andy\", \"test\": \"no\"}", "{\"name\":\"Justin\", \"test\": \"yes\"}", "{\"name\":\"Bob\", \"test\": \"yes\"}") - jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLines2, jsonPath2) df2 <- read.json(sqlContext, jsonPath2) @@ -1334,12 +1345,12 @@ test_that("join() and merge() on a DataFrame", { expect_equal(names(merged), c("age", "name_x", "name_y", "test")) expect_equal(collect(orderBy(merged, merged$name_x))$age[3], 19) - merged <- merge(df, df2, suffixes = c("-X","-Y")) + merged <- merge(df, df2, suffixes = c("-X", "-Y")) expect_equal(count(merged), 3) expect_equal(names(merged), c("age", "name-X", "name-Y", "test")) expect_equal(collect(orderBy(merged, merged$"name-X"))$age[1], 30) - merged <- merge(df, df2, by = "name", suffixes = c("-X","-Y"), sort = FALSE) + merged <- merge(df, df2, by = "name", suffixes = c("-X", "-Y"), sort = FALSE) expect_equal(count(merged), 3) expect_equal(names(merged), c("age", "name-X", "name-Y", "test")) expect_equal(collect(orderBy(merged, merged$"name-Y"))$"name-X"[3], "Michael") @@ -1357,7 +1368,7 @@ test_that("join() and merge() on a DataFrame", { "{\"name\":\"Andy\", \"name_y\":\"Andy\", \"test\": \"no\"}", "{\"name\":\"Justin\", \"name_y\":\"Justin\", \"test\": \"yes\"}", "{\"name\":\"Bob\", \"name_y\":\"Bob\", \"test\": \"yes\"}") - jsonPath3 <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLines3, jsonPath3) df3 <- 
read.json(sqlContext, jsonPath3) expect_error(merge(df, df3), @@ -1385,8 +1396,8 @@ test_that("showDF()", { "|null|Michael|\n", "| 30| Andy|\n", "| 19| Justin|\n", - "+----+-------+\n", sep="") - expect_output(s , expected) + "+----+-------+\n", sep = "") + expect_output(s, expected) }) test_that("isLocal()", { @@ -1400,7 +1411,7 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", { lines <- c("{\"name\":\"Bob\", \"age\":24}", "{\"name\":\"Andy\", \"age\":30}", "{\"name\":\"James\", \"age\":35}") - jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp") + jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(lines, jsonPath2) df2 <- read.df(sqlContext, jsonPath2, "json") @@ -1478,7 +1489,7 @@ test_that("mutate(), transform(), rename() and names()", { test_that("read/write Parquet files", { df <- read.df(sqlContext, jsonPath, "json") # Test write.df and read.df - write.df(df, parquetPath, "parquet", mode="overwrite") + write.df(df, parquetPath, "parquet", mode = "overwrite") df2 <- read.df(sqlContext, parquetPath, "parquet") expect_is(df2, "DataFrame") expect_equal(count(df2), 3) @@ -1513,7 +1524,7 @@ test_that("read/write text files", { expect_equal(colnames(df), c("value")) expect_equal(count(df), 3) textPath <- tempfile(pattern = "textPath", fileext = ".txt") - write.df(df, textPath, "text", mode="overwrite") + write.df(df, textPath, "text", mode = "overwrite") # Test write.text and read.text textPath2 <- tempfile(pattern = "textPath2", fileext = ".txt") @@ -1551,13 +1562,13 @@ test_that("dropna() and na.omit() on a DataFrame", { # drop with columns - expected <- rows[!is.na(rows$name),] + expected <- rows[!is.na(rows$name), ] actual <- collect(dropna(df, cols = "name")) expect_identical(expected, actual) actual <- collect(na.omit(df, cols = "name")) expect_identical(expected, actual) - expected <- rows[!is.na(rows$age),] + expected <- rows[!is.na(rows$age), ] actual <- collect(dropna(df, cols = "age")) row.names(expected) <- row.names(actual) # identical on two dataframes does not work here. Don't know why. 
@@ -1567,13 +1578,13 @@ test_that("dropna() and na.omit() on a DataFrame", { expect_identical(expected$name, actual$name) actual <- collect(na.omit(df, cols = "age")) - expected <- rows[!is.na(rows$age) & !is.na(rows$height),] + expected <- rows[!is.na(rows$age) & !is.na(rows$height), ] actual <- collect(dropna(df, cols = c("age", "height"))) expect_identical(expected, actual) actual <- collect(na.omit(df, cols = c("age", "height"))) expect_identical(expected, actual) - expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] + expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name), ] actual <- collect(dropna(df)) expect_identical(expected, actual) actual <- collect(na.omit(df)) @@ -1581,31 +1592,31 @@ test_that("dropna() and na.omit() on a DataFrame", { # drop with how - expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] + expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name), ] actual <- collect(dropna(df)) expect_identical(expected, actual) actual <- collect(na.omit(df)) expect_identical(expected, actual) - expected <- rows[!is.na(rows$age) | !is.na(rows$height) | !is.na(rows$name),] + expected <- rows[!is.na(rows$age) | !is.na(rows$height) | !is.na(rows$name), ] actual <- collect(dropna(df, "all")) expect_identical(expected, actual) actual <- collect(na.omit(df, "all")) expect_identical(expected, actual) - expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name),] + expected <- rows[!is.na(rows$age) & !is.na(rows$height) & !is.na(rows$name), ] actual <- collect(dropna(df, "any")) expect_identical(expected, actual) actual <- collect(na.omit(df, "any")) expect_identical(expected, actual) - expected <- rows[!is.na(rows$age) & !is.na(rows$height),] + expected <- rows[!is.na(rows$age) & !is.na(rows$height), ] actual <- collect(dropna(df, "any", cols = c("age", "height"))) expect_identical(expected, actual) actual <- collect(na.omit(df, "any", cols = c("age", "height"))) expect_identical(expected, actual) - expected <- rows[!is.na(rows$age) | !is.na(rows$height),] + expected <- rows[!is.na(rows$age) | !is.na(rows$height), ] actual <- collect(dropna(df, "all", cols = c("age", "height"))) expect_identical(expected, actual) actual <- collect(na.omit(df, "all", cols = c("age", "height"))) @@ -1613,7 +1624,7 @@ test_that("dropna() and na.omit() on a DataFrame", { # drop with threshold - expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) >= 2,] + expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) >= 2, ] actual <- collect(dropna(df, minNonNulls = 2, cols = c("age", "height"))) expect_identical(expected, actual) actual <- collect(na.omit(df, minNonNulls = 2, cols = c("age", "height"))) @@ -1621,7 +1632,7 @@ test_that("dropna() and na.omit() on a DataFrame", { expected <- rows[as.integer(!is.na(rows$age)) + as.integer(!is.na(rows$height)) + - as.integer(!is.na(rows$name)) >= 3,] + as.integer(!is.na(rows$name)) >= 3, ] actual <- collect(dropna(df, minNonNulls = 3, cols = c("name", "age", "height"))) expect_identical(expected, actual) actual <- collect(na.omit(df, minNonNulls = 3, cols = c("name", "age", "height"))) @@ -1674,7 +1685,7 @@ test_that("crosstab() on a DataFrame", { }) df <- toDF(rdd, list("a", "b")) ct <- crosstab(df, "a", "b") - ordered <- ct[order(ct$a_b),] + ordered <- ct[order(ct$a_b), ] row.names(ordered) <- NULL expected <- data.frame("a_b" = c("a0", "a1", "a2"), "b0" = c(1, 0, 1), "b1" = c(1, 1, 0), stringsAsFactors 
= FALSE, row.names = NULL) @@ -1702,10 +1713,10 @@ test_that("freqItems() on a DataFrame", { negDoubles = input * -1.0, stringsAsFactors = F) rdf[ input %% 3 == 0, ] <- c(1, "1", -1) df <- createDataFrame(sqlContext, rdf) - multiColResults <- freqItems(df, c("numbers", "letters"), support=0.1) + multiColResults <- freqItems(df, c("numbers", "letters"), support = 0.1) expect_true(1 %in% multiColResults$numbers[[1]]) expect_true("1" %in% multiColResults$letters[[1]]) - singleColResult <- freqItems(df, "negDoubles", support=0.1) + singleColResult <- freqItems(df, "negDoubles", support = 0.1) expect_true(-1 %in% head(singleColResult$negDoubles)[[1]]) l <- lapply(c(0:99), function(i) { @@ -1772,9 +1783,9 @@ test_that("with() on a DataFrame", { test_that("Method coltypes() to get and set R's data types of a DataFrame", { expect_equal(coltypes(irisDF), c(rep("numeric", 4), "character")) - data <- data.frame(c1=c(1,2,3), - c2=c(T,F,T), - c3=c("2015/01/01 10:00:00", "2015/01/02 10:00:00", "2015/01/03 10:00:00")) + data <- data.frame(c1 = c(1, 2, 3), + c2 = c(T, F, T), + c3 = c("2015/01/01 10:00:00", "2015/01/02 10:00:00", "2015/01/03 10:00:00")) schema <- structType(structField("c1", "byte"), structField("c3", "boolean"), @@ -1786,7 +1797,7 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { # Test complex types x <- createDataFrame(sqlContext, list(list(as.environment( - list("a"="b", "c"="d", "e"="f"))))) + list("a" = "b", "c" = "d", "e" = "f"))))) expect_equal(coltypes(x), "map") df <- selectExpr(read.json(sqlContext, jsonPath), "name", "(age * 1.21) as age") @@ -1830,7 +1841,7 @@ test_that("Method str()", { # the number of columns. Therefore, it will suffice to check for the # number of returned rows x <- runif(200, 1, 10) - df <- data.frame(t(as.matrix(data.frame(x,x,x,x,x,x,x,x,x)))) + df <- data.frame(t(as.matrix(data.frame(x, x, x, x, x, x, x, x, x)))) DF <- createDataFrame(sqlContext, df) out <- capture.output(str(DF)) expect_equal(length(out), 103) diff --git a/R/pkg/inst/tests/testthat/test_textFile.R b/R/pkg/inst/tests/testthat/test_textFile.R index a9cf83dbdbdb1..e64ef1bb31a3a 100644 --- a/R/pkg/inst/tests/testthat/test_textFile.R +++ b/R/pkg/inst/tests/testthat/test_textFile.R @@ -23,7 +23,7 @@ sc <- sparkR.init() mockFile <- c("Spark is pretty.", "Spark is awesome.") test_that("textFile() on a local file returns an RDD", { - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) @@ -35,7 +35,7 @@ test_that("textFile() on a local file returns an RDD", { }) test_that("textFile() followed by a collect() returns the same content", { - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) @@ -45,7 +45,7 @@ test_that("textFile() followed by a collect() returns the same content", { }) test_that("textFile() word count works as expected", { - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) @@ -63,7 +63,7 @@ test_that("textFile() word count works as expected", { }) test_that("several transformations on RDD created by textFile()", { - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") 
writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) # RDD @@ -77,8 +77,8 @@ test_that("several transformations on RDD created by textFile()", { }) test_that("textFile() followed by a saveAsTextFile() returns the same content", { - fileName1 <- tempfile(pattern="spark-test", fileext=".tmp") - fileName2 <- tempfile(pattern="spark-test", fileext=".tmp") + fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp") + fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName1) rdd <- textFile(sc, fileName1, 1L) @@ -91,7 +91,7 @@ test_that("textFile() followed by a saveAsTextFile() returns the same content", }) test_that("saveAsTextFile() on a parallelized list works as expected", { - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") l <- list(1, 2, 3) rdd <- parallelize(sc, l, 1L) saveAsTextFile(rdd, fileName) @@ -102,8 +102,8 @@ test_that("saveAsTextFile() on a parallelized list works as expected", { }) test_that("textFile() and saveAsTextFile() word count works as expected", { - fileName1 <- tempfile(pattern="spark-test", fileext=".tmp") - fileName2 <- tempfile(pattern="spark-test", fileext=".tmp") + fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp") + fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName1) rdd <- textFile(sc, fileName1) @@ -127,8 +127,8 @@ test_that("textFile() and saveAsTextFile() word count works as expected", { }) test_that("textFile() on multiple paths", { - fileName1 <- tempfile(pattern="spark-test", fileext=".tmp") - fileName2 <- tempfile(pattern="spark-test", fileext=".tmp") + fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp") + fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines("Spark is pretty.", fileName1) writeLines("Spark is awesome.", fileName2) @@ -140,7 +140,7 @@ test_that("textFile() on multiple paths", { }) test_that("Pipelined operations on RDDs created using textFile", { - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) rdd <- textFile(sc, fileName) diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R index 56f14a3bce61e..4218138f641d1 100644 --- a/R/pkg/inst/tests/testthat/test_utils.R +++ b/R/pkg/inst/tests/testthat/test_utils.R @@ -41,7 +41,7 @@ test_that("convertJListToRList() gives back (deserializes) the original JLists test_that("serializeToBytes on RDD", { # File content mockFile <- c("Spark is pretty.", "Spark is awesome.") - fileName <- tempfile(pattern="spark-test", fileext=".tmp") + fileName <- tempfile(pattern = "spark-test", fileext = ".tmp") writeLines(mockFile, fileName) text.rdd <- textFile(sc, fileName) @@ -86,8 +86,8 @@ test_that("cleanClosure on R functions", { f <- function(x) { defUse <- base::as.integer(x) + 1 # Test for access operators `::`. lapply(x, g) + 1 # Test for capturing function call "g"'s closure as a argument of lapply. - l$field[1,1] <- 3 # Test for access operators `$`. - res <- defUse + l$field[1,] # Test for def-use chain of "defUse", and "" symbol. + l$field[1, 1] <- 3 # Test for access operators `$`. + res <- defUse + l$field[1, ] # Test for def-use chain of "defUse", and "" symbol. f(res) # Test for recursive calls. 
} newF <- cleanClosure(f) @@ -132,7 +132,7 @@ test_that("cleanClosure on R functions", { expect_equal(actual, expected) # Test for broadcast variables. - a <- matrix(nrow=10, ncol=10, data=rnorm(100)) + a <- matrix(nrow = 10, ncol = 10, data = rnorm(100)) aBroadcast <- broadcast(sc, a) normMultiply <- function(x) { norm(aBroadcast$value) * x } newnormMultiply <- SparkR:::cleanClosure(normMultiply) diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 3ae072beca11b..b6784dbae3203 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -55,7 +55,7 @@ serializer <- SparkR:::readString(inputCon) # Include packages as required packageNames <- unserialize(SparkR:::readRaw(inputCon)) for (pkg in packageNames) { - suppressPackageStartupMessages(library(as.character(pkg), character.only=TRUE)) + suppressPackageStartupMessages(library(as.character(pkg), character.only = TRUE)) } # read function dependencies diff --git a/build/mvn b/build/mvn index 7603ea03deb73..c2b142700aa49 100755 --- a/build/mvn +++ b/build/mvn @@ -72,7 +72,7 @@ install_mvn() { local MVN_VERSION="3.3.3" install_app \ - "http://archive.apache.org/dist/maven/maven-3/${MVN_VERSION}/binaries" \ + "https://www.apache.org/dyn/closer.lua?action=download&filename=/maven/maven-3/${MVN_VERSION}/binaries" \ "apache-maven-${MVN_VERSION}-bin.tar.gz" \ "apache-maven-${MVN_VERSION}/bin/mvn" diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index 842f4550ca023..22348c02eff53 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -215,8 +215,6 @@ private void writeSortedFile(boolean isLastFile) throws IOException { } } - inMemSorter.reset(); - if (!isLastFile) { // i.e. this is a spill file // The current semantics of `shuffleRecordsWritten` seem to be that it's updated when records // are written to disk, not when they enter the shuffle sorting code. DiskBlockObjectWriter @@ -255,6 +253,10 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { writeSortedFile(false); final long spillSize = freeMemory(); + inMemSorter.reset(); + // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the + // records. Otherwise, if the task is over allocated memory, then without freeing the memory pages, + // we might not be able to get memory for the pointer array. 
taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); return spillSize; } @@ -320,7 +322,18 @@ private void growPointerArrayIfNecessary() throws IOException { assert(inMemSorter != null); if (!inMemSorter.hasSpaceForAnotherRecord()) { long used = inMemSorter.getMemoryUsage(); - LongArray array = allocateArray(used / 8 * 2); + LongArray array; + try { + // could trigger spilling + array = allocateArray(used / 8 * 2); + } catch (OutOfMemoryError e) { + // should have triggered spilling + if (!inMemSorter.hasSpaceForAnotherRecord()) { + logger.error("Unable to grow the pointer array"); + throw e; + } + return; + } // check if spilling is triggered or not if (inMemSorter.hasSpaceForAnotherRecord()) { freeArray(array); diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java index d74602cd205ad..1afa719f2d6f8 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleInMemorySorter.java @@ -49,9 +49,12 @@ public int compare(PackedRecordPointer left, PackedRecordPointer right) { */ private int pos = 0; + private int initialSize; + public ShuffleInMemorySorter(MemoryConsumer consumer, int initialSize) { this.consumer = consumer; assert (initialSize > 0); + this.initialSize = initialSize; this.array = consumer.allocateArray(initialSize); this.sorter = new Sorter<>(ShuffleSortDataFormat.INSTANCE); } @@ -68,6 +71,10 @@ public int numRecords() { } public void reset() { + if (consumer != null) { + consumer.freeArray(array); + this.array = consumer.allocateArray(initialSize); + } pos = 0; } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 47d311c3d5317..de38c2d261246 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -192,14 +192,17 @@ public long spill(long size, MemoryConsumer trigger) throws IOException { spillWriter.write(baseObject, baseOffset, recordLength, sortedRecords.getKeyPrefix()); } spillWriter.close(); - - inMemSorter.reset(); } final long spillSize = freeMemory(); // Note that this is more-or-less going to be a multiple of the page size, so wasted space in // pages will currently be counted as memory spilled even though that space isn't actually // written to disk. This also counts the space needed to store the sorter's pointer array. + inMemSorter.reset(); + // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the + // records. Otherwise, if the task is over allocated memory, then without freeing the memory pages, + // we might not be able to get memory for the pointer array. 
+ taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); return spillSize; @@ -292,7 +295,18 @@ private void growPointerArrayIfNecessary() throws IOException { assert(inMemSorter != null); if (!inMemSorter.hasSpaceForAnotherRecord()) { long used = inMemSorter.getMemoryUsage(); - LongArray array = allocateArray(used / 8 * 2); + LongArray array; + try { + // could trigger spilling + array = allocateArray(used / 8 * 2); + } catch (OutOfMemoryError e) { + // should have trigger spilling + if (!inMemSorter.hasSpaceForAnotherRecord()) { + logger.error("Unable to grow the pointer array"); + throw e; + } + return; + } // check if spilling is triggered or not if (inMemSorter.hasSpaceForAnotherRecord()) { freeArray(array); diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 308db22631644..05390c8f411ce 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -80,6 +80,8 @@ public int compare(RecordPointerAndKeyPrefix r1, RecordPointerAndKeyPrefix r2) { */ private int pos = 0; + private long initialSize; + public UnsafeInMemorySorter( final MemoryConsumer consumer, final TaskMemoryManager memoryManager, @@ -98,6 +100,7 @@ public UnsafeInMemorySorter( LongArray array) { this.consumer = consumer; this.memoryManager = memoryManager; + this.initialSize = array.size(); this.sorter = new Sorter<>(UnsafeSortDataFormat.INSTANCE); this.sortComparator = new SortComparator(recordComparator, prefixComparator, memoryManager); this.array = array; @@ -114,6 +117,10 @@ public void free() { } public void reset() { + if (consumer != null) { + consumer.freeArray(array); + this.array = consumer.allocateArray(initialSize); + } pos = 0; } diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css index c628a0c706553..6581f55972502 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -106,21 +106,22 @@ pre { line-height: 18px; padding: 6px; margin: 0; + word-break: break-word; border-radius: 3px; } .stage-details { - max-height: 100px; overflow-y: auto; margin: 0; + display: block; transition: max-height 0.25s ease-out, padding 0.25s ease-out; } .stage-details.collapsed { - max-height: 0; padding-top: 0; padding-bottom: 0; border: none; + display: none; } .description-input { @@ -143,14 +144,15 @@ pre { max-height: 300px; overflow-y: auto; margin: 0; + display: block; transition: max-height 0.25s ease-out, padding 0.25s ease-out; } .stacktrace-details.collapsed { - max-height: 0; padding-top: 0; padding-bottom: 0; border: none; + display: none; } span.expand-additional-metrics, span.expand-dag-viz { diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 72355cdfa68b3..998b4d5c10d34 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -384,8 +384,6 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf) * @param numReducers total number of reducers in the shuffle * @param fractionThreshold fraction of total map output size that a location must have * for it to be considered large. 
- * - * This method is not thread-safe. */ def getLocationsWithLargestOutputs( shuffleId: Int, @@ -394,28 +392,36 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf) fractionThreshold: Double) : Option[Array[BlockManagerId]] = { - if (mapStatuses.contains(shuffleId)) { - val statuses = mapStatuses(shuffleId) - if (statuses.nonEmpty) { - // HashMap to add up sizes of all blocks at the same location - val locs = new HashMap[BlockManagerId, Long] - var totalOutputSize = 0L - var mapIdx = 0 - while (mapIdx < statuses.length) { - val status = statuses(mapIdx) - val blockSize = status.getSizeForBlock(reducerId) - if (blockSize > 0) { - locs(status.location) = locs.getOrElse(status.location, 0L) + blockSize - totalOutputSize += blockSize + val statuses = mapStatuses.get(shuffleId).orNull + if (statuses != null) { + statuses.synchronized { + if (statuses.nonEmpty) { + // HashMap to add up sizes of all blocks at the same location + val locs = new HashMap[BlockManagerId, Long] + var totalOutputSize = 0L + var mapIdx = 0 + while (mapIdx < statuses.length) { + val status = statuses(mapIdx) + // status may be null here if we are called between registerShuffle, which creates an + // array with null entries for each output, and registerMapOutputs, which populates it + // with valid status entries. This is possible if one thread schedules a job which + // depends on an RDD which is currently being computed by another thread. + if (status != null) { + val blockSize = status.getSizeForBlock(reducerId) + if (blockSize > 0) { + locs(status.location) = locs.getOrElse(status.location, 0L) + blockSize + totalOutputSize += blockSize + } + } + mapIdx = mapIdx + 1 + } + val topLocs = locs.filter { case (loc, size) => + size.toDouble / totalOutputSize >= fractionThreshold + } + // Return if we have any locations which satisfy the required threshold + if (topLocs.nonEmpty) { + return Some(topLocs.keys.toArray) } - mapIdx = mapIdx + 1 - } - val topLocs = locs.filter { case (loc, size) => - size.toDouble / totalOutputSize >= fractionThreshold - } - // Return if we have any locations which satisfy the required threshold - if (topLocs.nonEmpty) { - return Some(topLocs.map(_._1).toArray) } } } diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala index 1e2f469214b84..64af5b91fe887 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala @@ -126,16 +126,14 @@ private[spark] class AppClient( registerMasterFutures.set(tryRegisterAllMasters()) registrationRetryTimer.set(registrationRetryThread.scheduleAtFixedRate(new Runnable { override def run(): Unit = { - Utils.tryOrExit { - if (registered.get) { - registerMasterFutures.get.foreach(_.cancel(true)) - registerMasterThreadPool.shutdownNow() - } else if (nthRetry >= REGISTRATION_RETRIES) { - markDead("All masters are unresponsive! Giving up.") - } else { - registerMasterFutures.get.foreach(_.cancel(true)) - registerWithMaster(nthRetry + 1) - } + if (registered.get) { + registerMasterFutures.get.foreach(_.cancel(true)) + registerMasterThreadPool.shutdownNow() + } else if (nthRetry >= REGISTRATION_RETRIES) { + markDead("All masters are unresponsive! 
Giving up.") + } else { + registerMasterFutures.get.foreach(_.cancel(true)) + registerWithMaster(nthRetry + 1) } } }, REGISTRATION_TIMEOUT_SECONDS, REGISTRATION_TIMEOUT_SECONDS, TimeUnit.SECONDS)) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 5d97c63918856..37b4dd5b01029 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -703,15 +703,28 @@ private[deploy] class Master( * every time a new app joins or resource availability changes. */ private def schedule(): Unit = { - if (state != RecoveryState.ALIVE) { return } + if (state != RecoveryState.ALIVE) { + return + } // Drivers take strict precedence over executors - val shuffledWorkers = Random.shuffle(workers) // Randomization helps balance drivers - for (worker <- shuffledWorkers if worker.state == WorkerState.ALIVE) { - for (driver <- waitingDrivers) { + val shuffledAliveWorkers = Random.shuffle(workers.toSeq.filter(_.state == WorkerState.ALIVE)) + val numWorkersAlive = shuffledAliveWorkers.size + var curPos = 0 + for (driver <- waitingDrivers.toList) { // iterate over a copy of waitingDrivers + // We assign workers to each waiting driver in a round-robin fashion. For each driver, we + // start from the last worker that was assigned a driver, and continue onwards until we have + // explored all alive workers. + var launched = false + var numWorkersVisited = 0 + while (numWorkersVisited < numWorkersAlive && !launched) { + val worker = shuffledAliveWorkers(curPos) + numWorkersVisited += 1 if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) { launchDriver(worker, driver) waitingDrivers -= driver + launched = true } + curPos = (curPos + 1) % numWorkersAlive } } startExecutorsOnWorkers() diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 69a34ddedb98f..b248e122da3cf 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -299,7 +299,7 @@ private[spark] class Executor( logInfo(s"Executor killed $taskName (TID $taskId)") execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled)) - case cDE: CommitDeniedException => + case CausedBy(cDE: CommitDeniedException) => val reason = cDE.toTaskEndReason execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason)) diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala index f7298e8d5c62c..aa8397ce63b0b 100644 --- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala +++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala @@ -81,11 +81,8 @@ object SparkHadoopMapRedUtil extends Logging { * the driver in order to determine whether this attempt can commit (please see SPARK-4879 for * details). * - * Output commit coordinator is only contacted when the following two configurations are both set - * to `true`: - * - * - `spark.speculation` - * - `spark.hadoop.outputCommitCoordination.enabled` + * Output commit coordinator is only used when `spark.hadoop.outputCommitCoordination.enabled` + * is set to true (which is the default). 
*/ def commitTask( committer: MapReduceOutputCommitter, @@ -112,11 +109,10 @@ object SparkHadoopMapRedUtil extends Logging { if (committer.needsTaskCommit(mrTaskContext)) { val shouldCoordinateWithDriver: Boolean = { val sparkConf = SparkEnv.get.conf - // We only need to coordinate with the driver if there are multiple concurrent task - // attempts, which should only occur if speculation is enabled - val speculationEnabled = sparkConf.getBoolean("spark.speculation", defaultValue = false) - // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs - sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", speculationEnabled) + // We only need to coordinate with the driver if there are concurrent task attempts. + // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029). + // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs. + sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true) } if (shouldCoordinateWithDriver) { diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 853dbc276dc21..250601999675b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -1116,9 +1116,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) maybeUpdateOutputMetrics(bytesWrittenCallback, outputMetrics, recordsWritten) recordsWritten += 1 } - } { - writer.close(hadoopContext) - } + }(finallyBlock = writer.close(hadoopContext)) committer.commitTask(hadoopContext) bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) } outputMetrics.setRecordsWritten(recordsWritten) @@ -1202,9 +1200,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) maybeUpdateOutputMetrics(bytesWrittenCallback, outputMetrics, recordsWritten) recordsWritten += 1 } - } { - writer.close() - } + }(finallyBlock = writer.close()) writer.commit() bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) } outputMetrics.setRecordsWritten(recordsWritten) diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 17304ea19204b..c7b11999fa9ff 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -87,10 +87,16 @@ private[spark] abstract class Task[T]( } try { (runTask(context), context.collectAccumulators()) - } catch { case e: Throwable => - // Catch all errors; run task failure callbacks, and rethrow the exception. - context.markTaskFailed(e) - throw e + } catch { + case e: Throwable => + // Catch all errors; run task failure callbacks, and rethrow the exception. + try { + context.markTaskFailed(e) + } catch { + case t: Throwable => + e.addSuppressed(t) + } + throw e } finally { // Call the task completion callbacks. 
context.markTaskCompleted() diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index c0f43515197b8..3ca701d9e021f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -555,9 +555,9 @@ private[spark] class TaskSetManager( // Jump to the next locality level, and reset lastLaunchTime so that the next locality // wait timer doesn't immediately expire lastLaunchTime += localityWaits(currentLocalityIndex) - currentLocalityIndex += 1 - logDebug(s"Moving to ${myLocalityLevels(currentLocalityIndex)} after waiting for " + + logDebug(s"Moving to ${myLocalityLevels(currentLocalityIndex + 1)} after waiting for " + s"${localityWaits(currentLocalityIndex)}ms") + currentLocalityIndex += 1 } else { return myLocalityLevels(currentLocalityIndex) } @@ -720,7 +720,16 @@ private[spark] class TaskSetManager( failedExecutors.getOrElseUpdate(index, new HashMap[String, Long]()). put(info.executorId, clock.getTimeMillis()) sched.dagScheduler.taskEnded(tasks(index), reason, null, null, info, taskMetrics) - addPendingTask(index) + + if (successful(index)) { + logInfo( + s"Task ${info.id} in stage ${taskSet.id} (TID $tid) failed, " + + "but another instance of the task has already succeeded, " + + "so not re-queuing the task to be re-executed.") + } else { + addPendingTask(index) + } + if (!isZombie && state != TaskState.KILLED && reason.isInstanceOf[TaskFailedReason] && reason.asInstanceOf[TaskFailedReason].countTowardsTaskFailures) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index 16815d51d4c67..cb896836cb40a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -422,6 +422,12 @@ private[spark] class MesosClusterScheduler( "--driver-cores", desc.cores.toString, "--driver-memory", s"${desc.mem}M") + val replicatedOptionsBlacklist = Set( + "spark.jars", // Avoids duplicate classes in classpath + "spark.submit.deployMode", // this would be set to `cluster`, but we need client + "spark.master" // this contains the address of the dispatcher, not master + ) + // Assume empty main class means we're running python if (!desc.command.mainClass.equals("")) { options ++= Seq("--class", desc.command.mainClass) @@ -439,9 +445,29 @@ private[spark] class MesosClusterScheduler( .mkString(",") options ++= Seq("--py-files", formattedFiles) } + desc.schedulerProperties + .filter { case (key, _) => !replicatedOptionsBlacklist.contains(key) } + .foreach { case (key, value) => options ++= Seq("--conf", s"$key=${shellEscape(value)}") } options } + /** + * Escape args for Unix-like shells, unless already quoted by the user.
+ * Based on: http://www.gnu.org/software/bash/manual/html_node/Double-Quotes.html + * and http://www.grymoire.com/Unix/Quote.html + * @param value argument + * @return escaped argument + */ + private[scheduler] def shellEscape(value: String): String = { + val WrappedInQuotes = """^(".+"|'.+')$""".r + val ShellSpecialChars = (""".*([ '<>&|\?\*;!#\\(\)"$`]).*""").r + value match { + case WrappedInQuotes(c) => value // The user quoted his args, don't touch it! + case ShellSpecialChars(c) => "\"" + value.replaceAll("""(["`\$\\])""", """\\$1""") + "\"" + case _: String => value // Don't touch harmless strings + } + } + private class ResourceOffer(val offer: Offer, var cpu: Double, var mem: Double) { override def toString(): String = { s"Offer id: ${offer.getId.getValue}, cpu: $cpu, mem: $mem" diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index a46cc55e29cb4..288f756bca39b 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -1091,7 +1091,6 @@ private[spark] class BlockManager( /** * Remove all blocks belonging to the given RDD. - * * @return The number of blocks removed. */ def removeRdd(rddId: Int): Int = { @@ -1134,10 +1133,15 @@ private[spark] class BlockManager( "the disk, memory, or external block store") } blockInfo.remove(blockId) + val status = getCurrentBlockStatus(blockId, info) if (tellMaster && info.tellMaster) { - val status = getCurrentBlockStatus(blockId, info) reportBlockStatus(blockId, info, status) } + Option(TaskContext.get()).foreach { tc => + val metrics = tc.taskMetrics() + val lastUpdatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]()) + metrics.updatedBlocks = Some(lastUpdatedBlocks ++ Seq((blockId, status))) + } } } finally { pendingToRemove.remove(blockId) @@ -1168,15 +1172,9 @@ private[spark] class BlockManager( try { info.synchronized { val level = info.level - if (level.useMemory) { - memoryStore.remove(id) - } - if (level.useDisk) { - diskStore.remove(id) - } - if (level.useOffHeap) { - externalBlockStore.remove(id) - } + if (level.useMemory) { memoryStore.remove(id) } + if (level.useDisk) { diskStore.remove(id) } + if (level.useOffHeap) { externalBlockStore.remove(id) } iterator.remove() logInfo(s"Dropped block $id") } diff --git a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala index ec711480ebf30..cbcc3dc4e5ad1 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala @@ -59,17 +59,6 @@ class StorageStatusListener extends SparkListener { } } - override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { - val info = taskEnd.taskInfo - val metrics = taskEnd.taskMetrics - if (info != null && metrics != null) { - val updatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]()) - if (updatedBlocks.length > 0) { - updateStorageStatus(info.executorId, updatedBlocks) - } - } - } - override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized { updateStorageStatus(unpersistRDD.rddId) } @@ -91,4 +80,15 @@ class StorageStatusListener extends SparkListener { } } + override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { + val executorId = 
blockUpdated.blockUpdatedInfo.blockManagerId.executorId + val blockId = blockUpdated.blockUpdatedInfo.blockId + val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel + val memSize = blockUpdated.blockUpdatedInfo.memSize + val diskSize = blockUpdated.blockUpdatedInfo.diskSize + val externalSize = blockUpdated.blockUpdatedInfo.externalBlockStoreSize + val blockStatus = BlockStatus(storageLevel, memSize, diskSize, externalSize) + updateStorageStatus(executorId, Seq((blockId, blockStatus))) + } + } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index e65ab2c02c969..47d6c3646c331 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -143,7 +143,7 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { | 'Removed at ${UIUtils.formatDate(new Date(event.finishTime.get))}' + | '${ if (event.finishReason.isDefined) { - s"""
Reason: ${event.finishReason.get}""" + s"""
Reason: ${event.finishReason.get.replace("\n", " ")}""" } else { "" } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index 2cad0a796913e..6a35f0e0a87a0 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -123,7 +123,7 @@ private[ui] class JobPage(parent: JobsTab) extends WebUIPage("job") { | 'Removed at ${UIUtils.formatDate(new Date(event.finishTime.get))}' + | '${ if (event.finishReason.isDefined) { - s"""
Reason: ${event.finishReason.get}""" + s"""
Reason: ${event.finishReason.get.replace("\n", " ")}""" } else { "" } diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala index 54ba03e61868c..4feaeba316e4e 100644 --- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala +++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala @@ -17,6 +17,8 @@ package org.apache.spark.ui.scope +import java.util.Objects + import scala.collection.mutable import scala.collection.mutable.{StringBuilder, ListBuffer} @@ -72,6 +74,22 @@ private[ui] class RDDOperationCluster(val id: String, private var _name: String) def getCachedNodes: Seq[RDDOperationNode] = { _childNodes.filter(_.cached) ++ _childClusters.flatMap(_.getCachedNodes) } + + def canEqual(other: Any): Boolean = other.isInstanceOf[RDDOperationCluster] + + override def equals(other: Any): Boolean = other match { + case that: RDDOperationCluster => + (that canEqual this) && + _childClusters == that._childClusters && + id == that.id && + _name == that._name + case _ => false + } + + override def hashCode(): Int = { + val state = Seq(_childClusters, id, _name) + state.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b) + } } private[ui] object RDDOperationGraph extends Logging { diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala index 22e2993b3b5bd..4bcfaebef792e 100644 --- a/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/StorageTab.scala @@ -57,17 +57,6 @@ class StorageListener(storageStatusListener: StorageStatusListener) extends Bloc StorageUtils.updateRddInfo(rddInfosToUpdate, storageStatusList) } - /** - * Assumes the storage status list is fully up-to-date. This implies the corresponding - * StorageStatusSparkListener must process the SparkListenerTaskEnd event before this listener. 
- */ - override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { - val metrics = taskEnd.taskMetrics - if (metrics != null && metrics.updatedBlocks.isDefined) { - updateRDDInfo(metrics.updatedBlocks.get) - } - } - override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized { val rddInfos = stageSubmitted.stageInfo.rddInfos rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info) } @@ -84,4 +73,15 @@ class StorageListener(storageStatusListener: StorageStatusListener) extends Bloc override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized { _rddInfoMap.remove(unpersistRDD.rddId) } + + override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { + super.onBlockUpdated(blockUpdated) + val blockId = blockUpdated.blockUpdatedInfo.blockId + val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel + val memSize = blockUpdated.blockUpdatedInfo.memSize + val diskSize = blockUpdated.blockUpdatedInfo.diskSize + val externalSize = blockUpdated.blockUpdatedInfo.externalBlockStoreSize + val blockStatus = BlockStatus(storageLevel, memSize, diskSize, externalSize) + updateRDDInfo(Seq((blockId, blockStatus))) + } } diff --git a/core/src/main/scala/org/apache/spark/util/CausedBy.scala b/core/src/main/scala/org/apache/spark/util/CausedBy.scala new file mode 100644 index 0000000000000..73df446d981cb --- /dev/null +++ b/core/src/main/scala/org/apache/spark/util/CausedBy.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +/** + * Extractor Object for pulling out the root cause of an error. + * If the error contains no cause, it will return the error itself. + * + * Usage: + * try { + * ... + * } catch { + * case CausedBy(ex: CommitDeniedException) => ... + * } + */ +private[spark] object CausedBy { + + def unapply(e: Throwable): Option[Throwable] = { + Option(e.getCause).flatMap(cause => unapply(cause)).orElse(Some(e)) + } +} diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 7cedbbb67b2c8..0bcbf266dde42 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1193,21 +1193,6 @@ private[spark] object Utils extends Logging { } } - /** - * Execute a block of code that evaluates to Unit, re-throwing any non-fatal uncaught - * exceptions as IOException. This is used when implementing Externalizable and Serializable's - * read and write methods, since Java's serializer will not report non-IOExceptions properly; - * see SPARK-4080 for more context. 
- */ - def tryOrIOException(block: => Unit) { - try { - block - } catch { - case e: IOException => throw e - case NonFatal(t) => throw new IOException(t) - } - } - /** * Execute a block of code that returns a value, re-throwing any non-fatal uncaught * exceptions as IOException. This is used when implementing Externalizable and Serializable's @@ -1218,8 +1203,12 @@ private[spark] object Utils extends Logging { try { block } catch { - case e: IOException => throw e - case NonFatal(t) => throw new IOException(t) + case e: IOException => + logError("Exception encountered", e) + throw e + case NonFatal(e) => + logError("Exception encountered", e) + throw new IOException(e) } } @@ -1270,26 +1259,35 @@ private[spark] object Utils extends Logging { } /** - * Execute a block of code, call the failure callbacks before finally block if there is any - * exceptions happen. But if exceptions happen in the finally block, do not suppress the original - * exception. + * Execute a block of code and call the failure callbacks in the catch block. If exceptions occur + * in either the catch or the finally block, they are appended to the list of suppressed + * exceptions in original exception which is then rethrown. * - * This is primarily an issue with `finally { out.close() }` blocks, where - * close needs to be called to clean up `out`, but if an exception happened - * in `out.write`, it's likely `out` may be corrupted and `out.close` will + * This is primarily an issue with `catch { abort() }` or `finally { out.close() }` blocks, + * where the abort/close needs to be called to clean up `out`, but if an exception happened + * in `out.write`, it's likely `out` may be corrupted and `abort` or `out.close` will * fail as well. This would then suppress the original/likely more meaningful * exception from the original `out.write` call. */ - def tryWithSafeFinallyAndFailureCallbacks[T](block: => T)(finallyBlock: => Unit): T = { + def tryWithSafeFinallyAndFailureCallbacks[T](block: => T) + (catchBlock: => Unit = (), finallyBlock: => Unit = ()): T = { var originalThrowable: Throwable = null try { block } catch { - case t: Throwable => + case cause: Throwable => // Purposefully not using NonFatal, because even fatal exceptions // we don't want to have our finallyBlock suppress - originalThrowable = t - TaskContext.get().asInstanceOf[TaskContextImpl].markTaskFailed(t) + originalThrowable = cause + try { + logError("Aborting task", originalThrowable) + TaskContext.get().asInstanceOf[TaskContextImpl].markTaskFailed(originalThrowable) + catchBlock + } catch { + case t: Throwable => + originalThrowable.addSuppressed(t) + logWarning(s"Suppressing exception in catch: " + t.getMessage, t) + } throw originalThrowable } finally { try { @@ -2028,8 +2026,10 @@ private[spark] object Utils extends Logging { } catch { case e: Exception if isBindCollision(e) => if (offset >= maxRetries) { - val exceptionMessage = - s"${e.getMessage}: Service$serviceString failed after $maxRetries retries!" + val exceptionMessage = s"${e.getMessage}: Service$serviceString failed after " + + s"$maxRetries retries! Consider explicitly setting the appropriate port for the " + + s"service$serviceString (for example spark.ui.port for SparkUI) to an available " + + "port or increasing spark.port.maxRetries." 
val exception = new BindException(exceptionMessage) // restore original stack trace exception.setStackTrace(e.getStackTrace) diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala index efc2482c74ddf..3bc6a32a8c918 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala @@ -28,6 +28,9 @@ import org.apache.spark.annotation.DeveloperApi * space overhead. * * Under the hood, it uses our OpenHashSet implementation. + * + * NOTE: when using numeric type as the value type, the user of this class should be careful to + * distinguish between the 0/0.0/0L and non-exist value */ @DeveloperApi private[spark] diff --git a/core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json index cb622e147249e..83ed44734ced6 100644 --- a/core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/executor_list_json_expectation.json @@ -1,8 +1,8 @@ [ { "id" : "", "hostPort" : "localhost:57971", - "rddBlocks" : 8, - "memoryUsed" : 28000128, + "rddBlocks" : 0, + "memoryUsed" : 0, "diskUsed" : 0, "activeTasks" : 0, "failedTasks" : 1, diff --git a/core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json b/core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json index f79a31022d214..8878e547a7984 100644 --- a/core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/rdd_list_storage_json_expectation.json @@ -1,9 +1 @@ -[ { - "id" : 0, - "name" : "0", - "numPartitions" : 8, - "numCachedPartitions" : 8, - "storageLevel" : "Memory Deserialized 1x Replicated", - "memoryUsed" : 28000128, - "diskUsed" : 0 -} ] \ No newline at end of file +[ ] \ No newline at end of file diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 4b7fd4f13b692..a13c4566bf85e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -126,8 +126,9 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers "stage task list from multi-attempt app json(2)" -> "applications/local-1426533911241/2/stages/0/0/taskList", - "rdd list storage json" -> "applications/local-1422981780767/storage/rdd", - "one rdd storage json" -> "applications/local-1422981780767/storage/rdd/0" + "rdd list storage json" -> "applications/local-1422981780767/storage/rdd" + // Todo: enable this test when logging the even of onBlockUpdated. 
See: SPARK-13845 + // "one rdd storage json" -> "applications/local-1422981780767/storage/rdd/0" ) // run a bunch of characterization tests -- just verify the behavior is the same as what is saved diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala index 1ae5b030f0832..e0eeeb80897ee 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala @@ -38,7 +38,7 @@ class OutputCommitCoordinatorIntegrationSuite super.beforeAll() val conf = new SparkConf() .set("master", "local[2,4]") - .set("spark.speculation", "true") + .set("spark.hadoop.outputCommitCoordination.enabled", "true") .set("spark.hadoop.mapred.output.committer.class", classOf[ThrowExceptionOnFirstAttemptOutputCommitter].getCanonicalName) sc = new SparkContext("local[2, 4]", "test", conf) diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index 7345508bfe995..bbf8874dc50df 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -78,7 +78,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { val conf = new SparkConf() .setMaster("local[4]") .setAppName(classOf[OutputCommitCoordinatorSuite].getSimpleName) - .set("spark.speculation", "true") + .set("spark.hadoop.outputCommitCoordination.enabled", "true") sc = new SparkContext(conf) { override private[spark] def createSparkEnv( conf: SparkConf, diff --git a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala index f5cef1caaf1ac..60ffb0f943ad5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/mesos/MesosClusterSchedulerSuite.scala @@ -72,4 +72,40 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi val state = scheduler.getSchedulerState() assert(state.queuedDrivers.isEmpty) } + + test("escapes commandline args for the shell") { + val conf = new SparkConf() + conf.setMaster("mesos://localhost:5050") + conf.setAppName("spark mesos") + val scheduler = new MesosClusterScheduler( + new BlackHoleMesosClusterPersistenceEngineFactory, conf) { + override def start(): Unit = { ready = true } + } + val escape = scheduler.shellEscape _ + def wrapped(str: String): String = "\"" + str + "\"" + + // Wrapped in quotes + assert(escape("'should be left untouched'") === "'should be left untouched'") + assert(escape("\"should be left untouched\"") === "\"should be left untouched\"") + + // Harmless + assert(escape("") === "") + assert(escape("harmless") === "harmless") + assert(escape("har-m.l3ss") === "har-m.l3ss") + + // Special Chars escape + assert(escape("should escape this \" quote") === wrapped("should escape this \\\" quote")) + assert(escape("shouldescape\"quote") === wrapped("shouldescape\\\"quote")) + assert(escape("should escape this $ dollar") === wrapped("should escape this \\$ dollar")) + assert(escape("should escape this ` backtick") === wrapped("should escape this \\` 
backtick")) + assert(escape("""should escape this \ backslash""") + === wrapped("""should escape this \\ backslash""")) + assert(escape("""\"?""") === wrapped("""\\\"?""")) + + + // Special Chars no escape only wrap + List(" ", "'", "<", ">", "&", "|", "?", "*", ";", "!", "#", "(", ")").foreach(char => { + assert(escape(s"onlywrap${char}this") === wrapped(s"onlywrap${char}this")) + }) + } } diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 333dcdb83b21d..4e66714ecbbb4 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -933,6 +933,17 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val list = List.fill(2)(new Array[Byte](2000)) val bigList = List.fill(8)(new Array[Byte](2000)) + def getUpdatedBlocks(task: => Unit): Seq[(BlockId, BlockStatus)] = { + val context = TaskContext.empty() + try { + TaskContext.setTaskContext(context) + task + } finally { + TaskContext.unset() + } + context.taskMetrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]()) + } + // 1 updated block (i.e. list1) val updatedBlocks1 = store.putIterator("list1", list.iterator, StorageLevel.MEMORY_ONLY, tellMaster = true) @@ -992,6 +1003,16 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE assert(!store.diskStore.contains("list3"), "list3 was in disk store") assert(!store.diskStore.contains("list4"), "list4 was in disk store") assert(!store.diskStore.contains("list5"), "list5 was in disk store") + + // remove block - list2 should be removed from disk + val updatedBlocks6 = getUpdatedBlocks { + store.removeBlock( + "list2", tellMaster = true) + } + assert(updatedBlocks6.size === 1) + assert(updatedBlocks6.head._1 === TestBlockId("list2")) + assert(updatedBlocks6.head._2.storageLevel == StorageLevel.NONE) + assert(!store.diskStore.contains("list2"), "list2 was in disk store") } test("query block statuses") { diff --git a/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala index 1a199beb3558f..8feac01cf839b 100644 --- a/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/StorageStatusListenerSuite.scala @@ -76,48 +76,51 @@ class StorageStatusListenerSuite extends SparkFunSuite { assert(listener.executorIdToStorageStatus("fat").numBlocks === 0) } - test("task end with updated blocks") { + test("updated blocks") { val listener = new StorageStatusListener listener.onBlockManagerAdded(SparkListenerBlockManagerAdded(1L, bm1, 1000L)) listener.onBlockManagerAdded(SparkListenerBlockManagerAdded(1L, bm2, 2000L)) - val taskMetrics1 = new TaskMetrics - val taskMetrics2 = new TaskMetrics - val block1 = (RDDBlockId(1, 1), BlockStatus(StorageLevel.DISK_ONLY, 0L, 100L, 0L)) - val block2 = (RDDBlockId(1, 2), BlockStatus(StorageLevel.DISK_ONLY, 0L, 200L, 0L)) - val block3 = (RDDBlockId(4, 0), BlockStatus(StorageLevel.DISK_ONLY, 0L, 300L, 0L)) - taskMetrics1.updatedBlocks = Some(Seq(block1, block2)) - taskMetrics2.updatedBlocks = Some(Seq(block3)) - - // Task end with new blocks + + val blockUpdateInfos1 = Seq( + BlockUpdatedInfo(bm1, RDDBlockId(1, 1), StorageLevel.DISK_ONLY, 0L, 100L, 0L), + BlockUpdatedInfo(bm1, RDDBlockId(1, 2), StorageLevel.DISK_ONLY, 0L, 200L, 0L) + ) + val 
blockUpdateInfos2 = + Seq(BlockUpdatedInfo(bm2, RDDBlockId(4, 0), StorageLevel.DISK_ONLY, 0L, 300L, 0L)) + + // Add some new blocks assert(listener.executorIdToStorageStatus("big").numBlocks === 0) assert(listener.executorIdToStorageStatus("fat").numBlocks === 0) - listener.onTaskEnd(SparkListenerTaskEnd(1, 0, "obliteration", Success, taskInfo1, taskMetrics1)) + postUpdateBlock(listener, blockUpdateInfos1) assert(listener.executorIdToStorageStatus("big").numBlocks === 2) assert(listener.executorIdToStorageStatus("fat").numBlocks === 0) assert(listener.executorIdToStorageStatus("big").containsBlock(RDDBlockId(1, 1))) assert(listener.executorIdToStorageStatus("big").containsBlock(RDDBlockId(1, 2))) assert(listener.executorIdToStorageStatus("fat").numBlocks === 0) - listener.onTaskEnd(SparkListenerTaskEnd(1, 0, "obliteration", Success, taskInfo2, taskMetrics2)) + postUpdateBlock(listener, blockUpdateInfos2) assert(listener.executorIdToStorageStatus("big").numBlocks === 2) assert(listener.executorIdToStorageStatus("fat").numBlocks === 1) assert(listener.executorIdToStorageStatus("big").containsBlock(RDDBlockId(1, 1))) assert(listener.executorIdToStorageStatus("big").containsBlock(RDDBlockId(1, 2))) assert(listener.executorIdToStorageStatus("fat").containsBlock(RDDBlockId(4, 0))) - // Task end with dropped blocks - val droppedBlock1 = (RDDBlockId(1, 1), BlockStatus(StorageLevel.NONE, 0L, 0L, 0L)) - val droppedBlock2 = (RDDBlockId(1, 2), BlockStatus(StorageLevel.NONE, 0L, 0L, 0L)) - val droppedBlock3 = (RDDBlockId(4, 0), BlockStatus(StorageLevel.NONE, 0L, 0L, 0L)) - taskMetrics1.updatedBlocks = Some(Seq(droppedBlock1, droppedBlock3)) - taskMetrics2.updatedBlocks = Some(Seq(droppedBlock2, droppedBlock3)) + // Dropped the blocks + val droppedBlockInfo1 = Seq( + BlockUpdatedInfo(bm1, RDDBlockId(1, 1), StorageLevel.NONE, 0L, 0L, 0L), + BlockUpdatedInfo(bm1, RDDBlockId(4, 0), StorageLevel.NONE, 0L, 0L, 0L) + ) + val droppedBlockInfo2 = Seq( + BlockUpdatedInfo(bm2, RDDBlockId(1, 2), StorageLevel.NONE, 0L, 0L, 0L), + BlockUpdatedInfo(bm2, RDDBlockId(4, 0), StorageLevel.NONE, 0L, 0L, 0L) + ) - listener.onTaskEnd(SparkListenerTaskEnd(1, 0, "obliteration", Success, taskInfo1, taskMetrics1)) + postUpdateBlock(listener, droppedBlockInfo1) assert(listener.executorIdToStorageStatus("big").numBlocks === 1) assert(listener.executorIdToStorageStatus("fat").numBlocks === 1) assert(!listener.executorIdToStorageStatus("big").containsBlock(RDDBlockId(1, 1))) assert(listener.executorIdToStorageStatus("big").containsBlock(RDDBlockId(1, 2))) assert(listener.executorIdToStorageStatus("fat").containsBlock(RDDBlockId(4, 0))) - listener.onTaskEnd(SparkListenerTaskEnd(1, 0, "obliteration", Success, taskInfo2, taskMetrics2)) + postUpdateBlock(listener, droppedBlockInfo2) assert(listener.executorIdToStorageStatus("big").numBlocks === 1) assert(listener.executorIdToStorageStatus("fat").numBlocks === 0) assert(!listener.executorIdToStorageStatus("big").containsBlock(RDDBlockId(1, 1))) @@ -128,15 +131,14 @@ class StorageStatusListenerSuite extends SparkFunSuite { test("unpersist RDD") { val listener = new StorageStatusListener listener.onBlockManagerAdded(SparkListenerBlockManagerAdded(1L, bm1, 1000L)) - val taskMetrics1 = new TaskMetrics - val taskMetrics2 = new TaskMetrics - val block1 = (RDDBlockId(1, 1), BlockStatus(StorageLevel.DISK_ONLY, 0L, 100L, 0L)) - val block2 = (RDDBlockId(1, 2), BlockStatus(StorageLevel.DISK_ONLY, 0L, 200L, 0L)) - val block3 = (RDDBlockId(4, 0), BlockStatus(StorageLevel.DISK_ONLY, 0L, 300L, 0L)) - 
taskMetrics1.updatedBlocks = Some(Seq(block1, block2)) - taskMetrics2.updatedBlocks = Some(Seq(block3)) - listener.onTaskEnd(SparkListenerTaskEnd(1, 0, "obliteration", Success, taskInfo1, taskMetrics1)) - listener.onTaskEnd(SparkListenerTaskEnd(1, 0, "obliteration", Success, taskInfo1, taskMetrics2)) + val blockUpdateInfos1 = Seq( + BlockUpdatedInfo(bm1, RDDBlockId(1, 1), StorageLevel.DISK_ONLY, 0L, 100L, 0L), + BlockUpdatedInfo(bm1, RDDBlockId(1, 2), StorageLevel.DISK_ONLY, 0L, 200L, 0L) + ) + val blockUpdateInfos2 = + Seq(BlockUpdatedInfo(bm1, RDDBlockId(4, 0), StorageLevel.DISK_ONLY, 0L, 300L, 0L)) + postUpdateBlock(listener, blockUpdateInfos1) + postUpdateBlock(listener, blockUpdateInfos2) assert(listener.executorIdToStorageStatus("big").numBlocks === 3) // Unpersist RDD @@ -149,4 +151,11 @@ class StorageStatusListenerSuite extends SparkFunSuite { listener.onUnpersistRDD(SparkListenerUnpersistRDD(1)) assert(listener.executorIdToStorageStatus("big").numBlocks === 0) } + + private def postUpdateBlock( + listener: StorageStatusListener, updateBlockInfos: Seq[BlockUpdatedInfo]): Unit = { + updateBlockInfos.foreach { updateBlockInfo => + listener.onBlockUpdated(SparkListenerBlockUpdated(updateBlockInfo)) + } + } } diff --git a/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphSuite.scala b/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphSuite.scala new file mode 100644 index 0000000000000..6ddcb5aba1678 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/ui/scope/RDDOperationGraphSuite.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ui.scope + +import org.apache.spark.SparkFunSuite + +class RDDOperationGraphSuite extends SparkFunSuite { + test("Test simple cluster equals") { + // create a 2-cluster chain with a child + val c1 = new RDDOperationCluster("1", "Bender") + val c2 = new RDDOperationCluster("2", "Hal") + c1.attachChildCluster(c2) + c1.attachChildNode(new RDDOperationNode(3, "Marvin", false, "collect!")) + + // create an equal cluster, but without the child node + val c1copy = new RDDOperationCluster("1", "Bender") + val c2copy = new RDDOperationCluster("2", "Hal") + c1copy.attachChildCluster(c2copy) + + assert(c1 == c1copy) + } +} diff --git a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala index 37e2670de9685..6c1e3748b4ce2 100644 --- a/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/storage/StorageTabSuite.scala @@ -105,7 +105,7 @@ class StorageTabSuite extends SparkFunSuite with BeforeAndAfter { assert(storageListener.rddInfoList.size === 0) } - test("task end") { + test("block update") { val myRddInfo0 = rddInfo0 val myRddInfo1 = rddInfo1 val myRddInfo2 = rddInfo2 @@ -119,20 +119,14 @@ class StorageTabSuite extends SparkFunSuite with BeforeAndAfter { assert(!storageListener._rddInfoMap(1).isCached) assert(!storageListener._rddInfoMap(2).isCached) - // Task end with no updated blocks. This should not change anything. - bus.postToAll(SparkListenerTaskEnd(0, 0, "obliteration", Success, taskInfo, new TaskMetrics)) - assert(storageListener._rddInfoMap.size === 3) - assert(storageListener.rddInfoList.size === 0) - - // Task end with a few new persisted blocks, some from the same RDD - val metrics1 = new TaskMetrics - metrics1.updatedBlocks = Some(Seq( - (RDDBlockId(0, 100), BlockStatus(memAndDisk, 400L, 0L, 0L)), - (RDDBlockId(0, 101), BlockStatus(memAndDisk, 0L, 400L, 0L)), - (RDDBlockId(0, 102), BlockStatus(memAndDisk, 400L, 0L, 200L)), - (RDDBlockId(1, 20), BlockStatus(memAndDisk, 0L, 240L, 0L)) - )) - bus.postToAll(SparkListenerTaskEnd(1, 0, "obliteration", Success, taskInfo, metrics1)) + // Some blocks updated + val blockUpdateInfos = Seq( + BlockUpdatedInfo(bm1, RDDBlockId(0, 100), memAndDisk, 400L, 0L, 0L), + BlockUpdatedInfo(bm1, RDDBlockId(0, 101), memAndDisk, 0L, 400L, 0L), + BlockUpdatedInfo(bm1, RDDBlockId(0, 102), memAndDisk, 400L, 0L, 200L), + BlockUpdatedInfo(bm1, RDDBlockId(1, 20), memAndDisk, 0L, 240L, 0L) + ) + postUpdateBlocks(bus, blockUpdateInfos) assert(storageListener._rddInfoMap(0).memSize === 800L) assert(storageListener._rddInfoMap(0).diskSize === 400L) assert(storageListener._rddInfoMap(0).externalBlockStoreSize === 200L) @@ -146,15 +140,14 @@ class StorageTabSuite extends SparkFunSuite with BeforeAndAfter { assert(!storageListener._rddInfoMap(2).isCached) assert(storageListener._rddInfoMap(2).numCachedPartitions === 0) - // Task end with a few dropped blocks - val metrics2 = new TaskMetrics - metrics2.updatedBlocks = Some(Seq( - (RDDBlockId(0, 100), BlockStatus(none, 0L, 0L, 0L)), - (RDDBlockId(1, 20), BlockStatus(none, 0L, 0L, 0L)), - (RDDBlockId(2, 40), BlockStatus(none, 0L, 0L, 0L)), // doesn't actually exist - (RDDBlockId(4, 80), BlockStatus(none, 0L, 0L, 0L)) // doesn't actually exist - )) - bus.postToAll(SparkListenerTaskEnd(2, 0, "obliteration", Success, taskInfo, metrics2)) + // Drop some blocks + val blockUpdateInfos2 = Seq( + BlockUpdatedInfo(bm1, RDDBlockId(0, 100), none, 0L, 0L, 
0L), + BlockUpdatedInfo(bm1, RDDBlockId(1, 20), none, 0L, 0L, 0L), + BlockUpdatedInfo(bm1, RDDBlockId(2, 40), none, 0L, 0L, 0L), // doesn't actually exist + BlockUpdatedInfo(bm1, RDDBlockId(4, 80), none, 0L, 0L, 0L) // doesn't actually exist + ) + postUpdateBlocks(bus, blockUpdateInfos2) assert(storageListener._rddInfoMap(0).memSize === 400L) assert(storageListener._rddInfoMap(0).diskSize === 400L) assert(storageListener._rddInfoMap(0).externalBlockStoreSize === 200L) @@ -172,24 +165,27 @@ class StorageTabSuite extends SparkFunSuite with BeforeAndAfter { val rddInfo1 = new RDDInfo(1, "rdd1", 1, memOnly, Seq(4)) val stageInfo0 = new StageInfo(0, 0, "stage0", 1, Seq(rddInfo0), Seq.empty, "details") val stageInfo1 = new StageInfo(1, 0, "stage1", 1, Seq(rddInfo1), Seq.empty, "details") - val taskMetrics0 = new TaskMetrics - val taskMetrics1 = new TaskMetrics - val block0 = (RDDBlockId(0, 1), BlockStatus(memOnly, 100L, 0L, 0L)) - val block1 = (RDDBlockId(1, 1), BlockStatus(memOnly, 200L, 0L, 0L)) - taskMetrics0.updatedBlocks = Some(Seq(block0)) - taskMetrics1.updatedBlocks = Some(Seq(block1)) + val blockUpdateInfos1 = Seq(BlockUpdatedInfo(bm1, RDDBlockId(0, 1), memOnly, 100L, 0L, 0L)) + val blockUpdateInfos2 = Seq(BlockUpdatedInfo(bm1, RDDBlockId(1, 1), memOnly, 200L, 0L, 0L)) bus.postToAll(SparkListenerBlockManagerAdded(1L, bm1, 1000L)) bus.postToAll(SparkListenerStageSubmitted(stageInfo0)) assert(storageListener.rddInfoList.size === 0) - bus.postToAll(SparkListenerTaskEnd(0, 0, "big", Success, taskInfo, taskMetrics0)) + postUpdateBlocks(bus, blockUpdateInfos1) assert(storageListener.rddInfoList.size === 1) bus.postToAll(SparkListenerStageSubmitted(stageInfo1)) assert(storageListener.rddInfoList.size === 1) bus.postToAll(SparkListenerStageCompleted(stageInfo0)) assert(storageListener.rddInfoList.size === 1) - bus.postToAll(SparkListenerTaskEnd(1, 0, "small", Success, taskInfo1, taskMetrics1)) + postUpdateBlocks(bus, blockUpdateInfos2) assert(storageListener.rddInfoList.size === 2) bus.postToAll(SparkListenerStageCompleted(stageInfo1)) assert(storageListener.rddInfoList.size === 2) } + + private def postUpdateBlocks( + bus: SparkListenerBus, blockUpdateInfos: Seq[BlockUpdatedInfo]): Unit = { + blockUpdateInfos.foreach { blockUpdateInfo => + bus.postToAll(SparkListenerBlockUpdated(blockUpdateInfo)) + } + } } diff --git a/core/src/test/scala/org/apache/spark/util/CausedBySuite.scala b/core/src/test/scala/org/apache/spark/util/CausedBySuite.scala new file mode 100644 index 0000000000000..4a80e3f1f452d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/util/CausedBySuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.util + +import org.apache.spark.SparkFunSuite + +class CausedBySuite extends SparkFunSuite { + + test("For an error without a cause, should return the error") { + val error = new Exception + + val causedBy = error match { + case CausedBy(e) => e + } + + assert(causedBy === error) + } + + test("For an error with a cause, should return the cause of the error") { + val cause = new Exception + val error = new Exception(cause) + + val causedBy = error match { + case CausedBy(e) => e + } + + assert(causedBy === cause) + } + + test("For an error with a cause that itself has a cause, return the root cause") { + val causeOfCause = new Exception + val cause = new Exception(causeOfCause) + val error = new Exception(cause) + + val causedBy = error match { + case CausedBy(e) => e + } + + assert(causedBy === causeOfCause) + } +} diff --git a/dev/deps/spark-deps-hadoop-1 b/dev/deps/spark-deps-hadoop-1 index d5e85ad387009..573431b244ccb 100644 --- a/dev/deps/spark-deps-hadoop-1 +++ b/dev/deps/spark-deps-hadoop-1 @@ -58,7 +58,6 @@ eigenbase-properties-1.1.5.jar geronimo-annotation_1.0_spec-1.1.1.jar geronimo-jaspic_1.0_spec-1.0.jar geronimo-jta_1.1_spec-1.1.1.jar -groovy-all-2.1.6.jar hadoop-client-1.2.1.jar hadoop-core-1.2.1.jar hsqldb-1.8.0.10.jar @@ -90,7 +89,7 @@ jettison-1.1.jar jetty-all-7.6.0.v20120127.jar jline-2.10.5.jar jline-2.12.jar -joda-time-2.9.jar +joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar json-20090211.jar @@ -144,7 +143,7 @@ servlet-api-2.5.jar slf4j-api-1.7.10.jar slf4j-log4j12-1.7.10.jar snappy-0.2.jar -snappy-java-1.1.2.jar +snappy-java-1.1.2.1.jar spire-macros_2.10-0.7.4.jar spire_2.10-0.7.4.jar stax-api-1.0.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 0364905c3897b..c9cb071532f44 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -64,7 +64,6 @@ grizzly-http-2.1.2.jar grizzly-http-server-2.1.2.jar grizzly-http-servlet-2.1.2.jar grizzly-rcm-2.1.2.jar -groovy-all-2.1.6.jar guice-3.0.jar guice-servlet-3.0.jar hadoop-annotations-2.2.0.jar @@ -119,7 +118,7 @@ jetty-all-7.6.0.v20120127.jar jetty-util-6.1.26.jar jline-2.10.5.jar jline-2.12.jar -joda-time-2.9.jar +joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar json-20090211.jar @@ -174,7 +173,7 @@ servlet-api-2.5.jar slf4j-api-1.7.10.jar slf4j-log4j12-1.7.10.jar snappy-0.2.jar -snappy-java-1.1.2.jar +snappy-java-1.1.2.1.jar spire-macros_2.10-0.7.4.jar spire_2.10-0.7.4.jar stax-api-1.0.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index b3481ac6863d4..ed2d6292fd40c 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -59,7 +59,6 @@ eigenbase-properties-1.1.5.jar geronimo-annotation_1.0_spec-1.1.1.jar geronimo-jaspic_1.0_spec-1.0.jar geronimo-jta_1.1_spec-1.1.1.jar -groovy-all-2.1.6.jar guice-3.0.jar guice-servlet-3.0.jar hadoop-annotations-2.3.0.jar @@ -110,7 +109,7 @@ jetty-all-7.6.0.v20120127.jar jetty-util-6.1.26.jar jline-2.10.5.jar jline-2.12.jar -joda-time-2.9.jar +joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar json-20090211.jar @@ -165,7 +164,7 @@ servlet-api-2.5.jar slf4j-api-1.7.10.jar slf4j-log4j12-1.7.10.jar snappy-0.2.jar -snappy-java-1.1.2.jar +snappy-java-1.1.2.1.jar spire-macros_2.10-0.7.4.jar spire_2.10-0.7.4.jar stax-api-1.0-2.jar diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index 7ddff0709f71d..22a9b8ecef427 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -59,7 
+59,6 @@ eigenbase-properties-1.1.5.jar geronimo-annotation_1.0_spec-1.1.1.jar geronimo-jaspic_1.0_spec-1.0.jar geronimo-jta_1.1_spec-1.1.1.jar -groovy-all-2.1.6.jar guice-3.0.jar guice-servlet-3.0.jar hadoop-annotations-2.4.0.jar @@ -111,7 +110,7 @@ jetty-all-7.6.0.v20120127.jar jetty-util-6.1.26.jar jline-2.10.5.jar jline-2.12.jar -joda-time-2.9.jar +joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar json-20090211.jar @@ -166,7 +165,7 @@ servlet-api-2.5.jar slf4j-api-1.7.10.jar slf4j-log4j12-1.7.10.jar snappy-0.2.jar -snappy-java-1.1.2.jar +snappy-java-1.1.2.1.jar spire-macros_2.10-0.7.4.jar spire_2.10-0.7.4.jar stax-api-1.0-2.jar diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 4e510bd6f9178..87e6c2a234722 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -63,7 +63,6 @@ eigenbase-properties-1.1.5.jar geronimo-annotation_1.0_spec-1.1.1.jar geronimo-jaspic_1.0_spec-1.0.jar geronimo-jta_1.1_spec-1.1.1.jar -groovy-all-2.1.6.jar gson-2.2.4.jar guice-3.0.jar guice-servlet-3.0.jar @@ -117,7 +116,7 @@ jetty-all-7.6.0.v20120127.jar jetty-util-6.1.26.jar jline-2.10.5.jar jline-2.12.jar -joda-time-2.9.jar +joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar json-20090211.jar @@ -172,7 +171,7 @@ servlet-api-2.5.jar slf4j-api-1.7.10.jar slf4j-log4j12-1.7.10.jar snappy-0.2.jar -snappy-java-1.1.2.jar +snappy-java-1.1.2.1.jar spire-macros_2.10-0.7.4.jar spire_2.10-0.7.4.jar stax-api-1.0-2.jar diff --git a/dev/lint-r.R b/dev/lint-r.R index 999eef571b824..87ee36d5c9b68 100644 --- a/dev/lint-r.R +++ b/dev/lint-r.R @@ -27,7 +27,7 @@ if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { # Installs lintr from Github in a local directory. # NOTE: The CRAN's version is too old to adapt to our rules. if ("lintr" %in% row.names(installed.packages()) == FALSE) { - devtools::install_github("jimhester/lintr") + devtools::install_github("jimhester/lintr@a769c0b") } library(lintr) diff --git a/dev/run-tests.py b/dev/run-tests.py index d20edacbc4807..4a18d1a7469c6 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -101,6 +101,8 @@ def determine_modules_to_test(changed_modules): >>> sorted(x.name for x in determine_modules_to_test([modules.root])) ['root'] + >>> [x.name for x in determine_modules_to_test([modules.build])] + ['root'] >>> sorted(x.name for x in determine_modules_to_test([modules.graphx])) ['examples', 'graphx'] >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql])) @@ -108,14 +110,12 @@ def determine_modules_to_test(changed_modules): ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \ 'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql'] """ - # If we're going to have to run all of the tests, then we can just short-circuit - # and return 'root'. No module depends on root, so if it appears then it will be - # in changed_modules. 
- if modules.root in changed_modules: - return [modules.root] modules_to_test = set() for module in changed_modules: modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + # If we need to run all of the tests, then we should short-circuit and return 'root' + if modules.root in modules_to_test: + return [modules.root] return modules_to_test.union(set(changed_modules)) diff --git a/docs/building-spark.md b/docs/building-spark.md index 3d38edbdad4bc..5f694dc3a0a83 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -117,9 +117,9 @@ mvn -Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0 -Dyarn.version=2.2.0 -DskipTests # Building With Hive and JDBC Support To enable Hive integration for Spark SQL along with its JDBC server and CLI, add the `-Phive` and `Phive-thriftserver` profiles to your existing build options. -By default Spark will build with Hive 0.13.1 bindings. +By default Spark will build with Hive 1.2.1 bindings. {% highlight bash %} -# Apache Hadoop 2.4.X with Hive 13 support +# Apache Hadoop 2.4.X with Hive 1.2.1 support mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -Phive -Phive-thriftserver -DskipTests clean package {% endhighlight %} diff --git a/docs/configuration.md b/docs/configuration.md index 2ba48e1308ad0..64a1899d69769 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -749,7 +749,7 @@ Apart from these, the following properties are also available, and may be useful spark.memory.offHeap.size 0 - The absolute amount of memory which can be used for off-heap allocation. + The absolute amount of memory in bytes which can be used for off-heap allocation. This setting has no impact on heap memory usage, so if your executors' total memory consumption must fit within some hard limit then be sure to shrink your JVM heap size accordingly. This must be set to a positive value when spark.memory.offHeap.enabled=true. diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 1edc0fe34706b..113d09ef145f7 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -872,10 +872,7 @@ val runningCounts = pairs.updateStateByKey[Int](updateFunction _) {% endhighlight %} The update function will be called for each word, with `newValues` having a sequence of 1's (from -the `(word, 1)` pairs) and the `runningCount` having the previous count. For the complete -Scala code, take a look at the example -[StatefulNetworkWordCount.scala]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache -/spark/examples/streaming/StatefulNetworkWordCount.scala). +the `(word, 1)` pairs) and the `runningCount` having the previous count.
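Since the direct link to StatefulNetworkWordCount.scala is removed above, a minimal sketch of an update function matching that description may help; the body below is an assumption consistent with the guide's word-count running example, not the example file itself.

```scala
// Sketch: sum the incoming 1's for a key and add the previously stored running count.
def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
  Some(newValues.sum + runningCount.getOrElse(0))
}
```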
diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index acbb0f298fe47..ea52c350289e4 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -30,7 +30,7 @@ dependencies, and can support different cluster managers and deploy modes that S {% highlight bash %} ./bin/spark-submit \ - --class + --class \ --master \ --deploy-mode \ --conf = \ @@ -58,8 +58,7 @@ for applications that involve the REPL (e.g. Spark shell). Alternatively, if your application is submitted from a machine far from the worker machines (e.g. locally on your laptop), it is common to use `cluster` mode to minimize network latency between -the drivers and the executors. Note that `cluster` mode is currently not supported for -Mesos clusters. Currently only YARN supports cluster mode for Python applications. +the drivers and the executors. Currently only YARN supports cluster mode for Python applications. For Python applications, simply pass a `.py` file in the place of `` instead of a JAR, and add Python `.zip`, `.egg` or `.py` files to the search path with `--py-files`. @@ -92,8 +91,8 @@ run it with `--help`. Here are a few examples of common options: ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ --master spark://207.184.161.138:7077 \ - --deploy-mode cluster - --supervise + --deploy-mode cluster \ + --supervise \ --executor-memory 20G \ --total-executor-cores 100 \ /path/to/examples.jar \ @@ -120,8 +119,8 @@ export HADOOP_CONF_DIR=XXX ./bin/spark-submit \ --class org.apache.spark.examples.SparkPi \ --master mesos://207.184.161.138:7077 \ - --deploy-mode cluster - --supervise + --deploy-mode cluster \ + --supervise \ --executor-memory 20G \ --total-executor-cores 100 \ http://path/to/examples.jar \ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala index 9827dfab8684a..e27c44d03c317 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala @@ -269,7 +269,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali if (Random.nextDouble() < probability) { Some(vidVvals._1) } else { None } } - if (selectedVertices.count > 1) { + if (selectedVertices.count > 0) { found = true val collectedVertices = selectedVertices.collect() retVal = collectedVertices(Random.nextInt(collectedVertices.size)) diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala index 1f5e27d5508b8..9acbd7960e12f 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala @@ -428,4 +428,12 @@ class GraphSuite extends SparkFunSuite with LocalSparkContext { } } + test("SPARK-14219: pickRandomVertex") { + withSpark { sc => + val vert = sc.parallelize(List((1L, "a")), 1) + val edges = sc.parallelize(List(Edge[Long](1L, 1L)), 1) + val g0 = Graph(vert, edges) + assert(g0.pickRandomVertex() === 1L) + } + } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala index 4b2b3f8489fd0..eb57ac89b5bab 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala @@ -99,7 +99,10 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] with M val stages: Param[Array[PipelineStage]] 
= new Param(this, "stages", "stages of the pipeline") /** @group setParam */ - def setStages(value: Array[PipelineStage]): this.type = { set(stages, value); this } + def setStages(value: Array[_ <: PipelineStage]): this.type = { + set(stages, value.asInstanceOf[Array[PipelineStage]]) + this + } // Below, we clone stages so that modifications to the list of stages will not change // the Param value in the Pipeline. diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index b6b25ecd01b3d..807ab337488ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -39,11 +39,12 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui def this() = this(Identifiable.randomUID("regEval")) /** - * param for metric name in evaluation (supports `"rmse"` (default), `"mse"`, `"r2"`, and `"mae"`) + * Param for metric name in evaluation. Supports: + * - `"rmse"` (default): root mean squared error + * - `"mse"`: mean squared error + * - `"r2"`: R^2^ metric + * - `"mae"`: mean absolute error * - * Because we will maximize evaluation value (ref: `CrossValidator`), - * when we evaluate a metric that is needed to minimize (e.g., `"rmse"`, `"mse"`, `"mae"`), - * we take and output the negative of this metric. * @group param */ @Since("1.4.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 5c43a41bee3b4..564c867d7d5a6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -101,6 +101,7 @@ class RFormula(override val uid: String) extends Estimator[RFormulaModel] with R encoderStages += new StringIndexer() .setInputCol(term) .setOutputCol(indexCol) + prefixesToRewrite(indexCol + "_") = term + "_" (term, indexCol) case _ => (term, term) @@ -198,7 +199,7 @@ class RFormulaModel private[feature]( override def copy(extra: ParamMap): RFormulaModel = copyValues( new RFormulaModel(uid, resolvedFormula, pipelineModel)) - override def toString: String = s"RFormulaModel(${resolvedFormula}) (uid=$uid)" + override def toString: String = s"RFormulaModel($resolvedFormula) (uid=$uid)" private def transformLabel(dataset: DataFrame): DataFrame = { val labelName = resolvedFormula.label @@ -268,14 +269,10 @@ private class VectorAttributeRewriter( val group = AttributeGroup.fromStructField(dataset.schema(vectorCol)) val attrs = group.attributes.get.map { attr => if (attr.name.isDefined) { - val name = attr.name.get - val replacement = prefixesToRewrite.filter { case (k, _) => name.startsWith(k) } - if (replacement.nonEmpty) { - val (k, v) = replacement.headOption.get - attr.withName(v + name.stripPrefix(k)) - } else { - attr + val name = prefixesToRewrite.foldLeft(attr.name.get) { case (curName, (from, to)) => + curName.replace(from, to) } + attr.withName(name) } else { attr } @@ -284,7 +281,7 @@ private class VectorAttributeRewriter( } val otherCols = dataset.columns.filter(_ != vectorCol).map(dataset.col) val rewrittenCol = dataset.col(vectorCol).as(vectorCol, metadata) - dataset.select((otherCols :+ rewrittenCol): _*) + dataset.select(otherCols :+ rewrittenCol : _*) } override def transformSchema(schema: StructType): StructType = { diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala index c09f4d076c964..f5509c4805ece 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala @@ -68,8 +68,7 @@ class SQLTransformer @Since("1.6.0") (override val uid: String) extends Transfor val tableName = Identifiable.randomUID(uid) dataset.registerTempTable(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) - val outputDF = dataset.sqlContext.sql(realStatement) - outputDF + dataset.sqlContext.sql(realStatement) } @Since("1.6.0") @@ -78,8 +77,11 @@ class SQLTransformer @Since("1.6.0") (override val uid: String) extends Transfor val sqlContext = SQLContext.getOrCreate(sc) val dummyRDD = sc.parallelize(Seq(Row.empty)) val dummyDF = sqlContext.createDataFrame(dummyRDD, schema) - dummyDF.registerTempTable(tableIdentifier) - val outputSchema = sqlContext.sql($(statement)).schema + val tableName = Identifiable.randomUID(uid) + val realStatement = $(statement).replace(tableIdentifier, tableName) + dummyDF.registerTempTable(tableName) + val outputSchema = sqlContext.sql(realStatement).schema + sqlContext.dropTempTable(tableName) outputSchema } diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index b3413a1bcd3e7..a843cc8ef686e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -160,15 +160,14 @@ class StringIndexerModel ( } val metadata = NominalAttribute.defaultAttr - .withName($(inputCol)).withValues(labels).toMetadata() + .withName($(outputCol)).withValues(labels).toMetadata() // If we are skipping invalid records, filter them out. 
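For context on the StringIndexerModel.transform change above, a hedged usage sketch of the `handleInvalid = "skip"` path follows; `trainDF`, `testDF`, and the column names are illustrative, not taken from this patch.

```scala
// With handleInvalid = "skip", rows whose label was not seen during fit are filtered out
// at transform time instead of causing a failure.
import org.apache.spark.ml.feature.StringIndexer

val model = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("labelIndex")
  .setHandleInvalid("skip")
  .fit(trainDF)

val indexed = model.transform(testDF) // rows with unseen labels are dropped
```

With the accompanying metadata fix, the attached NominalAttribute is now named after the output column ("labelIndex") rather than the input column.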
- val filteredDataset = (getHandleInvalid) match { - case "skip" => { + val filteredDataset = getHandleInvalid match { + case "skip" => val filterer = udf { label: String => labelToIndex.contains(label) } dataset.where(filterer(dataset($(inputCol)))) - } case _ => dataset } filteredDataset.select(col("*"), diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 4b06fad0089a8..6cc1a2aae1618 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -449,10 +449,11 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } Iterator((stat, gammaPart)) } - val statsSum: BDM[Double] = stats.map(_._1).reduce(_ += _) + val statsSum: BDM[Double] = stats.map(_._1).treeAggregate(BDM.zeros[Double](k, vocabSize))( + _ += _, _ += _) expElogbetaBc.unpersist() val gammat: BDM[Double] = breeze.linalg.DenseMatrix.vertcat( - stats.map(_._2).reduce(_ ++ _).map(_.toDenseMatrix): _*) + stats.map(_._2).flatMap(list => list).collect().map(_.toDenseMatrix): _*) val batchResult = statsSum :* expElogbeta.t // Note that this is an optimization to avoid batch.count diff --git a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala index 72d3aabc9b1f4..e316cabd97880 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicCheckpointer.scala @@ -51,7 +51,8 @@ import org.apache.spark.storage.StorageLevel * - This class removes checkpoint files once later Datasets have been checkpointed. * However, references to the older Datasets will still return isCheckpointed = true. * - * @param checkpointInterval Datasets will be checkpointed at this interval + * @param checkpointInterval Datasets will be checkpointed at this interval. + * If this interval was set as -1, then checkpointing will be disabled. * @param sc SparkContext for the Datasets given to this checkpointer * @tparam T Dataset type, such as RDD[Double] */ @@ -88,7 +89,8 @@ private[mllib] abstract class PeriodicCheckpointer[T]( updateCount += 1 // Handle checkpointing (after persisting) - if ((updateCount % checkpointInterval) == 0 && sc.getCheckpointDir.nonEmpty) { + if (checkpointInterval != -1 && (updateCount % checkpointInterval) == 0 + && sc.getCheckpointDir.nonEmpty) { // Add new checkpoint before removing old checkpoints. checkpoint(newData) checkpointQueue.enqueue(newData) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala index 11a059536c50c..20db6084d0e0d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/impl/PeriodicGraphCheckpointer.scala @@ -69,7 +69,8 @@ import org.apache.spark.storage.StorageLevel * // checkpointed: graph4 * }}} * - * @param checkpointInterval Graphs will be checkpointed at this interval + * @param checkpointInterval Graphs will be checkpointed at this interval. + * If this interval was set as -1, then checkpointing will be disabled. 
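A hedged illustration of the new `checkpointInterval` contract documented above: a value of -1 disables periodic checkpointing. The LDA call below assumes the caller forwards the interval unchanged to the checkpointer; it is an example, not code from this patch.

```scala
// Setting the interval to -1 means no checkpoints are written, even when a checkpoint
// directory has been configured on the SparkContext.
import org.apache.spark.mllib.clustering.LDA

val lda = new LDA()
  .setK(10)
  .setCheckpointInterval(-1)
```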
* @tparam VD Vertex descriptor type * @tparam ED Edge descriptor type * diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 8879dcf75c9bf..7d35c8a0a3aac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -584,7 +584,7 @@ class SparseMatrix @Since("1.3.0") ( private[mllib] def update(i: Int, j: Int, v: Double): Unit = { val ind = index(i, j) - if (ind == -1) { + if (ind < 0) { throw new NoSuchElementException("The given row and column indices correspond to a zero " + "value. Only non-zero elements in Sparse Matrices can be updated.") } else { @@ -879,8 +879,16 @@ object Matrices { case dm: BDM[Double] => new DenseMatrix(dm.rows, dm.cols, dm.data, dm.isTranspose) case sm: BSM[Double] => + // Spark-11507. work around breeze issue 479. + val mat = if (sm.colPtrs.last != sm.data.length) { + val matCopy = sm.copy + matCopy.compact() + matCopy + } else { + sm + } // There is no isTranspose flag for sparse matrices in Breeze - new SparseMatrix(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data) + new SparseMatrix(mat.rows, mat.cols, mat.colPtrs, mat.rowIndices, mat.data) case _ => throw new UnsupportedOperationException( s"Do not support conversion from type ${breeze.getClass.getName}.") diff --git a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala index 8c86767456368..9749df6783cde 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/PipelineSuite.scala @@ -174,6 +174,13 @@ class PipelineSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul } } } + + test("Pipeline.setStages should handle Java Arrays being non-covariant") { + val stages0 = Array(new UnWritableStage("b")) + val stages1 = Array(new WritableStage("a")) + val steps = stages0 ++ stages1 + val p = new Pipeline().setStages(steps) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala index 553e0b870216c..e213e17d0d9de 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/SQLTransformerSuite.scala @@ -21,6 +21,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.types.{LongType, StructField, StructType} class SQLTransformerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @@ -49,4 +50,13 @@ class SQLTransformerSuite .setStatement("select * from __THIS__") testDefaultReadWrite(t) } + + test("transformSchema") { + val df = sqlContext.range(10) + val outputSchema = new SQLTransformer() + .setStatement("SELECT id + 1 AS id1 FROM __THIS__") + .transformSchema(df.schema) + val expected = StructType(Seq(StructField("id1", LongType, nullable = false))) + assert(outputSchema === expected) + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index 26f4613f8ff9d..6ba4aaaf69eb2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -210,4 +210,17 @@ class StringIndexerSuite .setLabels(Array("a", "b", "c")) testDefaultReadWrite(t) } + + test("StringIndexer metadata") { + val data = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")), 2) + val df = sqlContext.createDataFrame(data).toDF("id", "label") + val indexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("labelIndex") + .fit(df) + val transformed = indexer.transform(df) + val attrs = + NominalAttribute.decodeStructField(transformed.schema("labelIndex"), preserveName = true) + assert(attrs.name.nonEmpty && attrs.name.get === "labelIndex") + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index 1833cf3833671..0230aac82504e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.mllib.linalg import java.util.Random +import breeze.linalg.{CSCMatrix, Matrix => BM} import org.mockito.Mockito.when import org.scalatest.mock.MockitoSugar._ import scala.collection.mutable.{Map => MutableMap} @@ -150,6 +151,10 @@ class MatricesSuite extends SparkFunSuite { sparseMat.update(0, 0, 10.0) } + intercept[NoSuchElementException] { + sparseMat.update(2, 1, 10.0) + } + sparseMat.update(0, 1, 10.0) assert(sparseMat(0, 1) === 10.0) assert(sparseMat.values(2) === 10.0) @@ -494,4 +499,15 @@ class MatricesSuite extends SparkFunSuite { assert(sm1.numNonzeros === 1) assert(sm1.numActives === 3) } + + test("fromBreeze with sparse matrix") { + // colPtr.last does NOT always equal to values.length in breeze SCSMatrix and + // invocation of compact() may be necessary. Refer to SPARK-11507 + val bm1: BM[Double] = new CSCMatrix[Double]( + Array(1.0, 1, 1), 3, 3, Array(0, 1, 2, 3), Array(0, 1, 2)) + val bm2: BM[Double] = new CSCMatrix[Double]( + Array(1.0, 2, 2, 4), 3, 3, Array(0, 0, 2, 4), Array(1, 2, 1, 2)) + val sum = bm1 + bm2 + Matrices.fromBreeze(sum) + } } diff --git a/network/common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java b/network/common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java index d686a951467cf..44403eede4d59 100644 --- a/network/common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java +++ b/network/common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java @@ -18,6 +18,7 @@ package org.apache.spark.network.protocol; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.channels.WritableByteChannel; import com.google.common.base.Preconditions; @@ -39,6 +40,14 @@ class MessageWithHeader extends AbstractReferenceCounted implements FileRegion { private final long bodyLength; private long totalBytesTransferred; + /** + * When the write buffer size is larger than this limit, I/O will be done in chunks of this size. + * The size should not be too large as it will waste underlying memory copy. e.g. If network + * available buffer is smaller than this limit, the data cannot be sent within one single write + * operation while it still will make memory copy with this size. 
+ */ + private static final int NIO_BUFFER_LIMIT = 256 * 1024; + MessageWithHeader(ByteBuf header, Object body, long bodyLength) { Preconditions.checkArgument(body instanceof ByteBuf || body instanceof FileRegion, "Body must be a ByteBuf or a FileRegion."); @@ -102,8 +111,27 @@ protected void deallocate() { } private int copyByteBuf(ByteBuf buf, WritableByteChannel target) throws IOException { - int written = target.write(buf.nioBuffer()); + ByteBuffer buffer = buf.nioBuffer(); + int written = (buffer.remaining() <= NIO_BUFFER_LIMIT) ? + target.write(buffer) : writeNioBuffer(target, buffer); buf.skipBytes(written); return written; } + + private int writeNioBuffer( + WritableByteChannel writeCh, + ByteBuffer buf) throws IOException { + int originalLimit = buf.limit(); + int ret = 0; + + try { + int ioSize = Math.min(buf.remaining(), NIO_BUFFER_LIMIT); + buf.limit(buf.position() + ioSize); + ret = writeCh.write(buf); + } finally { + buf.limit(originalLimit); + } + + return ret; + } } diff --git a/network/common/src/main/java/org/apache/spark/network/util/JavaUtils.java b/network/common/src/main/java/org/apache/spark/network/util/JavaUtils.java index b3d8e0cd7cdcd..0c01c3ceafcd3 100644 --- a/network/common/src/main/java/org/apache/spark/network/util/JavaUtils.java +++ b/network/common/src/main/java/org/apache/spark/network/util/JavaUtils.java @@ -237,11 +237,11 @@ private static long parseByteString(String str, ByteUnit unit) { } } catch (NumberFormatException e) { - String timeError = "Size must be specified as bytes (b), " + + String byteError = "Size must be specified as bytes (b), " + "kibibytes (k), mebibytes (m), gibibytes (g), tebibytes (t), or pebibytes(p). " + "E.g. 50b, 100k, or 250m."; - throw new NumberFormatException(timeError + "\n" + e.getMessage()); + throw new NumberFormatException(byteError + "\n" + e.getMessage()); } } diff --git a/network/common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java b/network/common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java index a466c729154aa..5b011bbbfc942 100644 --- a/network/common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java +++ b/network/common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java @@ -141,7 +141,7 @@ private ByteBuf decodeNext() throws Exception { } // Otherwise, create a composite buffer. 
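The chunking idea described in the `NIO_BUFFER_LIMIT` comment above can be summarized with a short sketch (written in Scala here for brevity; names are illustrative and this is not the `MessageWithHeader` code itself): cap the buffer's limit so a single `write()` call never sees more than a fixed number of bytes, then restore the original limit.

```scala
import java.nio.ByteBuffer
import java.nio.channels.WritableByteChannel

// Expose at most maxChunk bytes to one write() call, restoring the limit afterwards.
def writeChunked(ch: WritableByteChannel, buf: ByteBuffer, maxChunk: Int): Int = {
  val originalLimit = buf.limit()
  try {
    buf.limit(buf.position() + math.min(buf.remaining(), maxChunk))
    ch.write(buf) // returns the number of bytes actually written
  } finally {
    buf.limit(originalLimit)
  }
}
```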
- CompositeByteBuf frame = buffers.getFirst().alloc().compositeBuffer(); + CompositeByteBuf frame = buffers.getFirst().alloc().compositeBuffer(Integer.MAX_VALUE); while (remaining > 0) { ByteBuf next = nextBufferForFrame(remaining); remaining -= next.readableBytes(); diff --git a/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index ba6d30a74c673..4bc3c1a3c8a64 100644 --- a/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/network/yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -24,6 +24,7 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.server.api.*; import org.slf4j.Logger; @@ -118,7 +119,7 @@ protected void serviceInit(Configuration conf) { // an application was stopped while the NM was down, we expect yarn to call stopApplication() // when it comes back registeredExecutorFile = - findRegisteredExecutorFile(conf.getStrings("yarn.nodemanager.local-dirs")); + findRegisteredExecutorFile(conf.getTrimmedStrings("yarn.nodemanager.local-dirs")); TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf)); // If authentication is enabled, set up the shuffle server to use a @@ -191,12 +192,12 @@ public void stopContainer(ContainerTerminationContext context) { private File findRegisteredExecutorFile(String[] localDirs) { for (String dir: localDirs) { - File f = new File(dir, "registeredExecutors.ldb"); + File f = new File(new Path(dir).toUri().getPath(), "registeredExecutors.ldb"); if (f.exists()) { return f; } } - return new File(localDirs[0], "registeredExecutors.ldb"); + return new File(new Path(localDirs[0]).toUri().getPath(), "registeredExecutors.ldb"); } /** diff --git a/pom.xml b/pom.xml index a85037b53b382..996a30a4e4e22 100644 --- a/pom.xml +++ b/pom.xml @@ -144,7 +144,7 @@ 1.7.0 1.6.0 1.2.4 - 8.1.14.v20131031 + 8.1.19.v20160209 3.0.0.v201112011016 0.5.0 2.4.0 @@ -169,7 +169,7 @@ org.scala-lang 1.9.13 2.4.4 - 1.1.2 + 1.1.2.1 1.1.2 1.2.0-incubating 1.10 @@ -180,7 +180,7 @@ 3.2.10 2.7.8 1.9 - 2.9 + 2.9.3 3.5.2 1.3.9 0.9.2 @@ -1407,6 +1407,10 @@ commons-logging commons-logging + + org.codehaus.groovy + groovy-all + @@ -1478,6 +1482,10 @@ commons-logging commons-logging + + org.codehaus.groovy + groovy-all + @@ -1572,6 +1580,10 @@ commons-logging commons-logging + + org.codehaus.groovy + groovy-all + @@ -1617,6 +1629,10 @@ org.apache.thrift libthrift + + org.codehaus.groovy + groovy-all + @@ -1665,6 +1681,10 @@ commons-logging commons-logging + + org.codehaus.groovy + groovy-all + diff --git a/python/pyspark/cloudpickle.py b/python/pyspark/cloudpickle.py index 95b3abc74244b..e56e22a9b920e 100644 --- a/python/pyspark/cloudpickle.py +++ b/python/pyspark/cloudpickle.py @@ -241,6 +241,7 @@ def save_function_tuple(self, func): save(f_globals) save(defaults) save(dct) + save(func.__module__) write(pickle.TUPLE) write(pickle.REDUCE) # applies _fill_function on the tuple @@ -698,13 +699,14 @@ def _genpartial(func, args, kwds): return partial(func, *args, **kwds) -def _fill_function(func, globals, defaults, dict): +def _fill_function(func, globals, defaults, dict, module): """ Fills in the rest of function data into the skeleton function object that were created via 
_make_skel_func(). """ func.__globals__.update(globals) func.__defaults__ = defaults func.__dict__ = dict + func.__module__ = module return func diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5599b8f3ecd88..7a300e7647f6a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -468,7 +468,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") >>> si_model = stringIndexer.fit(df) >>> td = si_model.transform(df) - >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed") + >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42) >>> model = gbt.fit(td) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True @@ -491,12 +491,12 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", - maxIter=20, stepSize=0.1): + maxIter=20, stepSize=0.1, seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) """ super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( @@ -507,7 +507,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -516,12 +516,12 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1): + lossType="logistic", maxIter=20, stepSize=0.1, seed=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) Sets params for Gradient Boosted Tree Classification. 
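The `seed` parameter added to the Python GBTClassifier above mirrors the existing JVM-side parameter; as a hedged sketch (assuming the Scala setter is exposed in this version), fixing the seed makes repeated fits build identical ensembles.

```scala
import org.apache.spark.ml.classification.GBTClassifier

// Fix the random seed so two fits on the same data produce the same trees.
val gbt = new GBTClassifier()
  .setLabelCol("indexed")
  .setMaxIter(5)
  .setMaxDepth(2)
  .setSeed(42L)
```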
""" kwargs = self.setParams._input_kwargs diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ea874dcd8f5ee..bc63f6d5592e7 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1402,7 +1402,7 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " + "sensitive comparison over the stop words") stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords - defaultStopWords = stopWordsObj.English() + defaultStopWords = list(stopWordsObj.English()) self._setDefault(stopWords=defaultStopWords) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index a0bb8ceed8861..b09356a146ac9 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -591,7 +591,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> gbt = GBTRegressor(maxIter=5, maxDepth=2) + >>> gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42) >>> model = gbt.fit(df) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True @@ -614,12 +614,12 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None) """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) @@ -629,7 +629,8 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred "Supported options: " + ", ".join(GBTParams.supportedLossTypes)) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, + seed=None) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -638,12 +639,12 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, 
lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None) Sets params for Gradient Boosted Tree Regression. """ kwargs = self.setParams._input_kwargs diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 674dbe937038a..b2bd5696927c4 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -20,6 +20,9 @@ """ import sys +if sys.version > '3': + basestring = str + try: import xmlrunner except ImportError: @@ -283,6 +286,8 @@ def test_stopwordsremover(self): self.assertEqual(stopWordRemover.getInputCol(), "input") transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["panda"]) + self.assertEqual(type(stopWordRemover.getStopWords()), list) + self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring)) # Custom stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index c7bd109f02122..b525d26204371 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -293,7 +293,7 @@ def parse(s): s = s[start + 1: end] try: - values = [float(val) for val in s.split(',')] + values = [float(val) for val in s.split(',') if val] except ValueError: raise ValueError("Unable to parse values from %s" % s) return DenseVector(values) @@ -584,7 +584,7 @@ def parse(s): new_s = s[ind_start + 1: ind_end] ind_list = new_s.split(',') try: - indices = [int(ind) for ind in ind_list] + indices = [int(ind) for ind in ind_list if ind] except ValueError: raise ValueError("Unable to parse indices from %s." % new_s) s = s[ind_end + 1:].strip() @@ -597,7 +597,7 @@ def parse(s): raise ValueError("Values array should end with ']'.") val_list = s[val_start + 1: val_end].split(',') try: - values = [float(val) for val in val_list] + values = [float(val) for val in val_list if val] except ValueError: raise ValueError("Unable to parse values from %s." 
% s) return SparseVector(size, indices, values) diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 26db87ea546a0..b056246ab0cff 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -388,14 +388,20 @@ def test_dense_matrix_is_transposed(self): self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9])) def test_parse_vector(self): + a = DenseVector([]) + self.assertEqual(str(a), '[]') + self.assertEqual(Vectors.parse(str(a)), a) a = DenseVector([3, 4, 6, 7]) - self.assertTrue(str(a), '[3.0,4.0,6.0,7.0]') - self.assertTrue(Vectors.parse(str(a)), a) + self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]') + self.assertEqual(Vectors.parse(str(a)), a) + a = SparseVector(4, [], []) + self.assertEqual(str(a), '(4,[],[])') + self.assertEqual(SparseVector.parse(str(a)), a) a = SparseVector(4, [0, 2], [3, 4]) - self.assertTrue(str(a), '(4,[0,2],[3.0,4.0])') - self.assertTrue(Vectors.parse(str(a)), a) + self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])') + self.assertEqual(Vectors.parse(str(a)), a) a = SparseVector(10, [0, 1], [4, 5]) - self.assertTrue(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a) + self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a) def test_norms(self): a = DenseVector([0, 2, 3, -1]) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 00bb9a62e904a..1ed098c076ae5 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -115,7 +115,7 @@ def _parse_memory(s): 2048 """ units = {'g': 1024, 'm': 1, 't': 1 << 20, 'k': 1.0 / 1024} - if s[-1] not in units: + if s[-1].lower() not in units: raise ValueError("invalid format: " + s) return int(float(s[:-1]) * units[s[-1].lower()]) diff --git a/python/pyspark/sql/window.py b/python/pyspark/sql/window.py index 57bbe340bbd4d..46663f69a0881 100644 --- a/python/pyspark/sql/window.py +++ b/python/pyspark/sql/window.py @@ -60,7 +60,7 @@ def partitionBy(*cols): @since(1.4) def orderBy(*cols): """ - Creates a :class:`WindowSpec` with the partitioning defined. + Creates a :class:`WindowSpec` with the ordering defined. 
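A hedged Scala sketch of the distinction the corrected docstring draws: `partitionBy` groups rows into partitions, while `orderBy` defines the ordering within each partition. `df` and the column names are assumed for illustration.

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

// Rank rows by salary within each department.
val spec = Window.partitionBy("department").orderBy("salary")
val ranked = df.withColumn("rank", row_number().over(spec))
```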
""" sc = SparkContext._active_spark_context jspec = sc._jvm.org.apache.spark.sql.expressions.Window.orderBy(_to_java_cols(cols)) diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 5bd94476597ab..7e072c05ebd9b 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -228,6 +228,12 @@ def test_itemgetter(self): getter2 = ser.loads(ser.dumps(getter)) self.assertEqual(getter(d), getter2(d)) + def test_function_module_name(self): + ser = CloudPickleSerializer() + func = lambda x: x + func2 = ser.loads(ser.dumps(func)) + self.assertEqual(func.__module__, func2.__module__) + def test_attrgetter(self): from operator import attrgetter ser = CloudPickleSerializer() @@ -1960,6 +1966,18 @@ def test_startTime(self): self.assertGreater(sc.startTime, 0) +class ConfTests(unittest.TestCase): + def test_memory_conf(self): + memoryList = ["1T", "1G", "1M", "1024K"] + for memory in memoryList: + sc = SparkContext(conf=SparkConf().set("spark.python.worker.memory", memory)) + l = list(range(1024)) + random.shuffle(l) + rdd = sc.parallelize(l, 4) + self.assertEqual(sorted(l), rdd.sortBy(lambda x: x).collect()) + sc.stop() + + @unittest.skipIf(not _have_scipy, "SciPy not installed") class SciPyTests(PySparkTestCase): diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index dbcbd6854b474..470af350fb0f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -677,7 +677,7 @@ object HiveTypeCoercion { case e if !e.childrenResolved => e // Find tightest common type for If, if the true value and false value have different types. case i @ If(pred, left, right) if left.dataType != right.dataType => - findTightestCommonTypeToString(left.dataType, right.dataType).map { widestType => + findWiderTypeForTwo(left.dataType, right.dataType).map { widestType => val newLeft = if (left.dataType == widestType) left else Cast(left, widestType) val newRight = if (right.dataType == widestType) right else Cast(right, widestType) If(pred, newLeft, newRight) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 40b1eec63e551..c4e5b844299a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -110,6 +110,14 @@ trait CaseWhenLike extends Expression { // If no value is nullable and no elseValue is provided, the whole statement defaults to null. thenList.exists(_.nullable) || (elseValue.map(_.nullable).getOrElse(true)) } + + /** + * Whether should it fallback to interpret mode or not. + * @return + */ + protected def shouldFallback: Boolean = { + branches.length > 20 + } } // scalastyle:off @@ -119,7 +127,7 @@ trait CaseWhenLike extends Expression { * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions */ // scalastyle:on -case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike { +case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike with CodegenFallback { // Use private[this] Array to speed up evaluation. 
@transient private[this] lazy val branchesArr = branches.toArray @@ -157,6 +165,11 @@ case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike { } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + if (shouldFallback) { + // Fallback to interpreted mode if there are too many branches, as it may reach the + // 64K limit (limit on bytecode size for a single function). + return super[CodegenFallback].genCode(ctx, ev) + } val len = branchesArr.length val got = ctx.freshName("got") @@ -213,7 +226,8 @@ case class CaseWhen(branches: Seq[Expression]) extends CaseWhenLike { * https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-ConditionalFunctions */ // scalastyle:on -case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseWhenLike { +case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) + extends CaseWhenLike with CodegenFallback { // Use private[this] Array to speed up evaluation. @transient private[this] lazy val branchesArr = branches.toArray @@ -257,6 +271,11 @@ case class CaseKeyWhen(key: Expression, branches: Seq[Expression]) extends CaseW } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + if (shouldFallback) { + // Fallback to interpreted mode if there are too many branches, as it may reach the + // 64K limit (limit on bytecode size for a single function). + return super[CodegenFallback].genCode(ctx, ev) + } val keyEval = key.gen(ctx) val len = branchesArr.length val got = ctx.freshName("got") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index 28f616fbb9ca5..8ab8179ea3da9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -739,7 +739,7 @@ case class Round(child: Expression, scale: Expression) if (f.isNaN || f.isInfinite) { f } else { - BigDecimal(f).setScale(_scale, HALF_UP).toFloat + BigDecimal(f.toDouble).setScale(_scale, HALF_UP).toFloat } case DoubleType => val d = input1.asInstanceOf[Double] @@ -795,39 +795,21 @@ case class Round(child: Expression, scale: Expression) s"${ev.value} = ${ce.value};" } case FloatType => // if child eval to NaN or Infinity, just return it. - if (_scale == 0) { - s""" - if (Float.isNaN(${ce.value}) || Float.isInfinite(${ce.value})){ - ${ev.value} = ${ce.value}; - } else { - ${ev.value} = Math.round(${ce.value}); - }""" - } else { - s""" - if (Float.isNaN(${ce.value}) || Float.isInfinite(${ce.value})){ - ${ev.value} = ${ce.value}; - } else { - ${ev.value} = java.math.BigDecimal.valueOf(${ce.value}). - setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).floatValue(); - }""" - } + s""" + if (Float.isNaN(${ce.value}) || Float.isInfinite(${ce.value})){ + ${ev.value} = ${ce.value}; + } else { + ${ev.value} = java.math.BigDecimal.valueOf(${ce.value}). + setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).floatValue(); + }""" case DoubleType => // if child eval to NaN or Infinity, just return it. 
- if (_scale == 0) { - s""" - if (Double.isNaN(${ce.value}) || Double.isInfinite(${ce.value})){ - ${ev.value} = ${ce.value}; - } else { - ${ev.value} = Math.round(${ce.value}); - }""" - } else { - s""" - if (Double.isNaN(${ce.value}) || Double.isInfinite(${ce.value})){ - ${ev.value} = ${ce.value}; - } else { - ${ev.value} = java.math.BigDecimal.valueOf(${ce.value}). - setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).doubleValue(); - }""" - } + s""" + if (Double.isNaN(${ce.value}) || Double.isInfinite(${ce.value})){ + ${ev.value} = ${ce.value}; + } else { + ${ev.value} = java.math.BigDecimal.valueOf(${ce.value}). + setScale(${_scale}, java.math.BigDecimal.ROUND_HALF_UP).doubleValue(); + }""" } if (scaleV == null) { // if scale is null, no need to eval its child at all diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 304b438c84ba4..77e8506e81452 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -416,7 +416,7 @@ case class EqualNullSafe(left: Expression, right: Expression) extends BinaryComp ev.isNull = "false" eval1.code + eval2.code + s""" boolean ${ev.value} = (${eval1.isNull} && ${eval2.isNull}) || - (!${eval1.isNull} && $equalCode); + (!${eval1.isNull} && !${eval2.isNull} && $equalCode); """ } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala index 1939633b256f7..a3090d8cbd668 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala @@ -204,8 +204,9 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru } /** - * Extracts a [[StructField]] of the given name. If the [[StructType]] object does not - * have a name matching the given name, `null` will be returned. + * Extracts the [[StructField]] with the given name. + * + * @throws IllegalArgumentException if a field with the given name does not exist */ def apply(name: String): StructField = { nameToField.getOrElse(name, @@ -214,7 +215,9 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru /** * Returns a [[StructType]] containing [[StructField]]s of the given names, preserving the - * original order of fields. Those names which do not have matching fields will be ignored. + * original order of fields. + * + * @throws IllegalArgumentException if a field cannot be found for any of the given names */ def apply(names: Set[String]): StructType = { val nonExistFields = names -- fieldNamesSet @@ -227,7 +230,9 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru } /** - * Returns index of a given field + * Returns the index of a given field. 
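A hedged sketch of the contract the updated scaladoc spells out: `StructType` lookups throw `IllegalArgumentException` for unknown field names rather than returning `null`. The schema below is illustrative.

```scala
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val schema = StructType(Seq(
  StructField("id", IntegerType, nullable = false),
  StructField("name", StringType)))

schema("name")            // StructField(name,StringType,true)
schema.fieldIndex("name") // 1
// schema("missing")      // throws IllegalArgumentException
```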
+ * + * @throws IllegalArgumentException if a field with the given name does not exist */ def fieldIndex(name: String): Int = { nameToIndex.getOrElse(name, diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala index 142915056f451..3b5bd77b8f436 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercionSuite.scala @@ -299,6 +299,20 @@ class HiveTypeCoercionSuite extends PlanTest { ) } + test("test for SPARK-13772") { + val rule = HiveTypeCoercion.IfCoercion + ruleTest(rule, + If(Literal(true), Literal(1.0), Cast(Literal(1.0), DecimalType(19, 0))), + If(Literal(true), Literal(1.0), Cast(Cast(Literal(1.0), DecimalType(19, 0)), DoubleType)) + ) + + ruleTest(rule, + If(Literal(true), Literal(Decimal(1)), Cast(Literal(1.0), DecimalType(19, 9))), + If(Literal(true), Cast(Literal(Decimal(1)), DecimalType(19, 9)), + Cast(Literal(1.0), DecimalType(19, 9))) + ) + } + test("type coercion for CaseKeyWhen") { ruleTest(HiveTypeCoercion.CaseWhenCoercion, CaseKeyWhen(Literal(1.toShort), Seq(Literal(1), Literal("a"))), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index cd2ef7dcd0cd3..66a7e228f84bb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -59,6 +59,28 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("SPARK-13242: case-when expression with large number of branches (or cases)") { + val cases = 50 + val clauses = 20 + + // Generate an individual case + def generateCase(n: Int): Seq[Expression] = { + val condition = (1 to clauses) + .map(c => EqualTo(BoundReference(0, StringType, false), Literal(s"$c:$n"))) + .reduceLeft[Expression]((l, r) => Or(l, r)) + Seq(condition, Literal(n)) + } + + val expression = CaseWhen((1 to cases).flatMap(generateCase(_))) + + val plan = GenerateMutableProjection.generate(Seq(expression))() + val input = new GenericMutableRow(Array[Any](UTF8String.fromString(s"${clauses}:${cases}"))) + val actual = plan(input).toSeq(Seq(expression.dataType)) + + assert(actual(0) == cases) + } + + test("test generated safe and unsafe projection") { val schema = new StructType(Array( StructField("a", StringType, true), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala index 88ed9fdd6465f..aacc56fc44186 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala @@ -552,5 +552,9 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Round(Literal.create(null, dataType), Literal.create(null, IntegerType)), null) } + + checkEvaluation(Round(-3.5, 0), -4.0) + checkEvaluation(Round(-0.35, 1), -0.4) + checkEvaluation(Round(-35, -1), -40) } } diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 03e7611fce8ff..b3f20692b2dfc 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -273,7 +273,8 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { } test("BinaryComparison: null test") { - val normalInt = Literal(1) + // Use -1 (default value for codegen) which can trigger some weird bugs, e.g. SPARK-14757 + val normalInt = Literal(-1) val nullInt = Literal.create(null, IntegerType) def nullTest(op: (Expression, Expression) => Expression): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index eaafc96e4d2e7..4d01b78c3c10f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.columnar +import scala.collection.mutable + import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -88,7 +90,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera case array: ArrayType => classOf[ArrayColumnAccessor].getName case t: MapType => classOf[MapColumnAccessor].getName } - ctx.addMutableState(accessorCls, accessorName, s"$accessorName = null;") + ctx.addMutableState(accessorCls, accessorName, "") val createCode = dt match { case t if ctx.isPrimitiveType(dt) => @@ -97,7 +99,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera s"$accessorName = new $accessorCls(ByteBuffer.wrap(buffers[$index]).order(nativeOrder));" case other => s"""$accessorName = new $accessorCls(ByteBuffer.wrap(buffers[$index]).order(nativeOrder), - (${dt.getClass.getName}) columnTypes[$index]);""" + (${dt.getClass.getName}) columnTypes[$index]);""" } val extract = s"$accessorName.extractTo(mutableRow, $index);" @@ -114,6 +116,42 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera (createCode, extract + patch) }.unzip + /* + * 200 = 6000 bytes / 30 (up to 30 bytes per one call)) + * the maximum byte code size to be compiled for HotSpot is 8000. 
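The grouping strategy the comment above motivates can be sketched generically (this is a simplified, hypothetical helper, not the GenerateColumnAccessor code itself): split a long list of generated statements into helper methods of bounded size and call them in sequence, so each generated method stays small enough for HotSpot to JIT.

```scala
// Return the helper-method definitions and the sequence of calls that replaces the
// original inline statement list.
def groupIntoFunctions(
    stmts: Seq[String],
    maxPerGroup: Int,
    prefix: String): (Seq[String], String) = {
  val groups = stmts.grouped(maxPerGroup).toSeq
  val defs = groups.zipWithIndex.map { case (body, i) =>
    s"private void $prefix$i() {\n  ${body.mkString("\n  ")}\n}"
  }
  val calls = groups.indices.map(i => s"$prefix$i();").mkString("\n")
  (defs, calls)
}
```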
+ * We should keep less than 8000 + */ + val numberOfStatementsThreshold = 200 + val (initializerAccessorCalls, extractorCalls) = + if (initializeAccessors.length <= numberOfStatementsThreshold) { + (initializeAccessors.mkString("\n"), extractors.mkString("\n")) + } else { + val groupedAccessorsItr = initializeAccessors.grouped(numberOfStatementsThreshold) + val groupedExtractorsItr = extractors.grouped(numberOfStatementsThreshold) + var groupedAccessorsLength = 0 + groupedAccessorsItr.zipWithIndex.map { case (body, i) => + groupedAccessorsLength += 1 + val funcName = s"accessors$i" + val funcCode = s""" + |private void $funcName() { + | ${body.mkString("\n")} + |} + """.stripMargin + ctx.addNewFunction(funcName, funcCode) + } + groupedExtractorsItr.zipWithIndex.map { case (body, i) => + val funcName = s"extractors$i" + val funcCode = s""" + |private void $funcName() { + | ${body.mkString("\n")} + |} + """.stripMargin + ctx.addNewFunction(funcName, funcCode) + } + ((0 to groupedAccessorsLength - 1).map { i => s"accessors$i();" }.mkString("\n"), + (0 to groupedAccessorsLength - 1).map { i => s"extractors$i();" }.mkString("\n")) + } + val code = s""" import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -149,8 +187,6 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera this.nativeOrder = ByteOrder.nativeOrder(); this.buffers = new byte[${columnTypes.length}][]; this.mutableRow = new MutableUnsafeRow(rowWriter); - - ${initMutableStates(ctx)} } public void initialize(Iterator input, DataType[] columnTypes, int[] columnIndexes) { @@ -159,6 +195,8 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera this.columnIndexes = columnIndexes; } + ${declareAddedFunctions(ctx)} + public boolean hasNext() { if (currentRow < numRowsInBatch) { return true; @@ -173,7 +211,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera for (int i = 0; i < columnIndexes.length; i ++) { buffers[i] = batch.buffers()[columnIndexes[i]]; } - ${initializeAccessors.mkString("\n")} + ${initializerAccessorCalls} return hasNext(); } @@ -182,7 +220,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera currentRow += 1; bufferHolder.reset(); rowWriter.initialize(bufferHolder, $numFields); - ${extractors.mkString("\n")} + ${extractorCalls} unsafeRow.pointTo(bufferHolder.buffer, $numFields, bufferHolder.totalSize()); return unsafeRow; } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala index 735d52f808868..7a94c077fe60d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelation.scala @@ -75,11 +75,9 @@ private[sql] case class InsertIntoHadoopFsRelation( case (SaveMode.ErrorIfExists, true) => throw new AnalysisException(s"path $qualifiedOutputPath already exists.") case (SaveMode.Overwrite, true) => - Utils.tryOrIOException { - if (!fs.delete(qualifiedOutputPath, true /* recursively */)) { - throw new IOException(s"Unable to clear output " + - s"directory $qualifiedOutputPath prior to writing to it") - } + if (!fs.delete(qualifiedOutputPath, true /* recursively */)) { + throw new IOException(s"Unable to clear output " + + s"directory $qualifiedOutputPath prior to writing to it") } true case 
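The grouping added above tackles the same generated-code-size pressure that the SPARK-13242 case-when test earlier in this diff exercises. Once more than 200 accessor initializers (or extractors) would otherwise be inlined into a single method, they are split into numbered private helpers (accessors0(), accessors1(), ...) registered through ctx.addNewFunction and emitted via declareAddedFunctions, and the calling method only invokes those helpers; each generated method then stays under HotSpot's default 8000-byte limit, beyond which a method is never JIT-compiled. A standalone sketch of the chunking step, with illustrative names and no dependency on CodegenContext:

// Split a flat list of generated statements into helper methods of at most
// `threshold` statements each; return (helper definitions, call sequence).
def chunkIntoHelpers(
    statements: Seq[String],
    prefix: String,
    threshold: Int = 200): (String, String) = {
  if (statements.length <= threshold) {
    ("", statements.mkString("\n"))            // small enough: inline, no helpers
  } else {
    val groups = statements.grouped(threshold).toSeq
    val defs = groups.zipWithIndex.map { case (body, i) =>
      s"""private void $prefix$i() {
         |  ${body.mkString("\n")}
         |}""".stripMargin
    }
    val calls = groups.indices.map(i => s"$prefix$i();")
    (defs.mkString("\n"), calls.mkString("\n"))
  }
}

// e.g. chunkIntoHelpers(initializeAccessors, "accessors") would produce the helper
// bodies to register with ctx.addNewFunction and the call list that replaces the
// inlined statements in the generated iterator.

The PR itself tracks the helper count with a mutable counter while iterating; materializing the grouped chunks, as in the sketch, yields the same call list without it.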
(SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 83b4eaf050eea..f413185a56561 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.sources.{HadoopFsRelation, OutputWriter, OutputWriterFactory} import org.apache.spark.sql.types.{StructType, StringType} -import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.{SerializableConfiguration, Utils} private[sql] abstract class BaseWriterContainer( @@ -257,19 +257,16 @@ private[sql] class DefaultWriterContainer( // If anything below fails, we should abort the task. try { - while (iterator.hasNext) { - val internalRow = iterator.next() - writer.writeInternal(internalRow) - } - - commitTask() + Utils.tryWithSafeFinallyAndFailureCallbacks { + while (iterator.hasNext) { + val internalRow = iterator.next() + writer.writeInternal(internalRow) + } + commitTask() + }(catchBlock = abortTask()) } catch { - case cause: Throwable => - logError("Aborting task.", cause) - // call failure callbacks first, so we could have a chance to cleanup the writer. - TaskContext.get().asInstanceOf[TaskContextImpl].markTaskFailed(cause) - abortTask() - throw new SparkException("Task failed while writing rows.", cause) + case t: Throwable => + throw new SparkException("Task failed while writing rows", t) } def commitTask(): Unit = { @@ -343,81 +340,81 @@ private[sql] class DynamicPartitionWriterContainer( // If anything below fails, we should abort the task. var currentWriter: OutputWriter = null try { - // This will be filled in if we have to fall back on sorting. - var sorter: UnsafeKVExternalSorter = null - while (iterator.hasNext && sorter == null) { - val inputRow = iterator.next() - val currentKey = getPartitionKey(inputRow) - currentWriter = outputWriters.get(currentKey) - - if (currentWriter == null) { - if (outputWriters.size < maxOpenFiles) { - currentWriter = newOutputWriter(currentKey) - outputWriters.put(currentKey.copy(), currentWriter) - currentWriter.writeInternal(getOutputRow(inputRow)) + Utils.tryWithSafeFinallyAndFailureCallbacks { + // This will be filled in if we have to fall back on sorting. 
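Both writer containers now route their write loop through Utils.tryWithSafeFinallyAndFailureCallbacks instead of hand-rolling the logError / markTaskFailed / abortTask / rethrow sequence. Judging from its name and the code it replaces, the helper runs the task-failure callbacks and the supplied catchBlock (abortTask(), plus closing the current writer in the dynamic-partition case) before the original exception escapes, so a failure during cleanup cannot mask the real error. A simplified, hypothetical stand-in for that pattern (not Spark's actual helper):

// Run the block; if it fails, run the cleanup before rethrowing, making sure
// a failure inside the cleanup never hides the original exception.
def tryWithCleanupOnFailure[T](block: => T)(cleanup: => Unit): T = {
  try {
    block
  } catch {
    case original: Throwable =>
      try cleanup catch {
        case fromCleanup: Throwable => original.addSuppressed(fromCleanup)
      }
      throw original
  }
}

// Shape of the call in writeRows (names as in the diff above):
//   tryWithCleanupOnFailure {
//     while (iterator.hasNext) { writer.writeInternal(iterator.next()) }
//     commitTask()
//   }(abortTask())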
+ var sorter: UnsafeKVExternalSorter = null + while (iterator.hasNext && sorter == null) { + val inputRow = iterator.next() + val currentKey = getPartitionKey(inputRow) + currentWriter = outputWriters.get(currentKey) + + if (currentWriter == null) { + if (outputWriters.size < maxOpenFiles) { + currentWriter = newOutputWriter(currentKey) + outputWriters.put(currentKey.copy(), currentWriter) + currentWriter.writeInternal(getOutputRow(inputRow)) + } else { + logInfo(s"Maximum partitions reached, falling back on sorting.") + sorter = new UnsafeKVExternalSorter( + StructType.fromAttributes(partitionColumns), + StructType.fromAttributes(dataColumns), + SparkEnv.get.blockManager, + TaskContext.get().taskMemoryManager().pageSizeBytes) + sorter.insertKV(currentKey, getOutputRow(inputRow)) + } } else { - logInfo(s"Maximum partitions reached, falling back on sorting.") - sorter = new UnsafeKVExternalSorter( - StructType.fromAttributes(partitionColumns), - StructType.fromAttributes(dataColumns), - SparkEnv.get.blockManager, - TaskContext.get().taskMemoryManager().pageSizeBytes) - sorter.insertKV(currentKey, getOutputRow(inputRow)) + currentWriter.writeInternal(getOutputRow(inputRow)) } - } else { - currentWriter.writeInternal(getOutputRow(inputRow)) - } - } - // current writer is included in outputWriters - currentWriter = null - - // If the sorter is not null that means that we reached the maxFiles above and need to finish - // using external sort. - if (sorter != null) { - while (iterator.hasNext) { - val currentRow = iterator.next() - sorter.insertKV(getPartitionKey(currentRow), getOutputRow(currentRow)) } + // current writer is included in outputWriters + currentWriter = null + + // If the sorter is not null that means that we reached the maxFiles above and need to + // finish using external sort. + if (sorter != null) { + while (iterator.hasNext) { + val currentRow = iterator.next() + sorter.insertKV(getPartitionKey(currentRow), getOutputRow(currentRow)) + } - logInfo(s"Sorting complete. Writing out partition files one at a time.") - - val sortedIterator = sorter.sortedIterator() - var currentKey: InternalRow = null - while (sortedIterator.next()) { - if (currentKey != sortedIterator.getKey) { - if (currentWriter != null) { - currentWriter.close() - currentWriter = null + logInfo(s"Sorting complete. Writing out partition files one at a time.") + + val sortedIterator = sorter.sortedIterator() + var currentKey: InternalRow = null + while (sortedIterator.next()) { + if (currentKey != sortedIterator.getKey) { + if (currentWriter != null) { + currentWriter.close() + currentWriter = null + } + currentKey = sortedIterator.getKey.copy() + logDebug(s"Writing partition: $currentKey") + + // Either use an existing file from before, or open a new one. + currentWriter = outputWriters.remove(currentKey) + if (currentWriter == null) { + currentWriter = newOutputWriter(currentKey) + } } - currentKey = sortedIterator.getKey.copy() - logDebug(s"Writing partition: $currentKey") - // Either use an existing file from before, or open a new one. 
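The re-indented block above keeps the sort-based fallback intact: once maxOpenFiles writers are open, remaining rows are pushed into an UnsafeKVExternalSorter keyed by the partition values, and after the input is exhausted the sorted iterator is replayed, rolling to a fresh output writer whenever the partition key changes. A self-contained sketch of that replay loop, where Writer and openWriter are illustrative placeholders rather than Spark's OutputWriter API:

object PartitionedWriteSketch {
  // Illustrative placeholders; the real code uses OutputWriter and InternalRow.
  trait Writer[V] { def write(value: V): Unit; def close(): Unit }

  def writePartitions[K, V](sorted: Iterator[(K, V)], openWriter: K => Writer[V]): Unit = {
    var currentKey: Option[K] = None
    var currentWriter: Writer[V] = null
    try {
      for ((key, value) <- sorted) {
        if (!currentKey.exists(_ == key)) {
          // Partition boundary: close the previous file and roll to a new one.
          if (currentWriter != null) currentWriter.close()
          currentWriter = openWriter(key)
          currentKey = Some(key)
        }
        currentWriter.write(value)
      }
    } finally {
      if (currentWriter != null) currentWriter.close()
    }
  }
}

Because the rows arrive sorted by partition key, at most one writer is open at a time during the replay, which is what makes the fallback safe after the open-writer limit has been hit.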
- currentWriter = outputWriters.remove(currentKey) - if (currentWriter == null) { - currentWriter = newOutputWriter(currentKey) - } + currentWriter.writeInternal(sortedIterator.getValue) + } + if (currentWriter != null) { + currentWriter.close() + currentWriter = null } - - currentWriter.writeInternal(sortedIterator.getValue) - } - if (currentWriter != null) { - currentWriter.close() - currentWriter = null } - } - commitTask() - } catch { - case cause: Throwable => - logError("Aborting task.", cause) - // call failure callbacks first, so we could have a chance to cleanup the writer. - TaskContext.get().asInstanceOf[TaskContextImpl].markTaskFailed(cause) + commitTask() + }(catchBlock = { if (currentWriter != null) { currentWriter.close() } abortTask() - throw new SparkException("Task failed while writing rows.", cause) + }) + } catch { + case t: Throwable => + throw new SparkException("Task failed while writing rows", t) } /** Open and returns a new OutputWriter given a partition key. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala index 49646a99d68c8..83d308ad0f9bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala @@ -55,6 +55,12 @@ private[ui] class AllExecutionsPage(parent: SQLTab) extends WebUIPage("") with L } _content } + content ++= + UIUtils.headerSparkPage("SQL", content, parent, Some(5000)) } } @@ -118,14 +124,12 @@ private[ui] abstract class ExecutionTable( {failedJobs} }} - {detailCell(executionUIData.physicalPlanDescription)} } private def descriptionCell(execution: SQLExecutionUIData): Seq[Node] = { val details = if (execution.details.nonEmpty) { - + +details ++